diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 69192299a3f..00000000000 --- a/.flake8 +++ /dev/null @@ -1,6 +0,0 @@ -[flake8] -# Ignore some errors, since we autoformat them away already wherever possible -# from https://github.com/psf/black/blob/main/.flake8 -# E302 is ignored to support jupytext files -ignore = E203, E266, E501, W503, E302 -exclude = .ipynb_checkpoints,*_cookiecutter,cookiecutter,etl/steps/archive,etl/snapshots/archive diff --git a/.vscode/settings.json b/.vscode/settings.json index 666dc3fa9a5..cb06b055745 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -37,17 +37,11 @@ "**/docs/architecture/*.md" ], "files.exclude": { - "etl/steps/archive": true, - "snapshots/archive": true, "**/dataset_*_config.json": true, "**/dataset_*_values.json": true, "**/dataset_*.json.dvc": true, "**/dataset_*.feather.dvc": true }, - "search.exclude": { - "etl/steps/archive": true, - "snapshots/archive": true - }, "yaml.format.printWidth": 999, "ruff.path": [ ".venv/bin/ruff" diff --git a/apps/anomalist/anomalist_api.py b/apps/anomalist/anomalist_api.py index 2ffab345ab7..765347434bc 100644 --- a/apps/anomalist/anomalist_api.py +++ b/apps/anomalist/anomalist_api.py @@ -1,3 +1,4 @@ +import random import tempfile import time from pathlib import Path @@ -90,7 +91,13 @@ def get_variables_views_in_charts( if len(df) == 0: df = pd.DataFrame(columns=["variable_id", "chart_id", "chart_slug", "views_7d", "views_14d", "views_365d"]) - return df + return df.astype( + { + "views_7d": "Int64", + "views_14d": "Int64", + "views_365d": "Int64", + } + ).fillna(0) def renormalize_score( @@ -114,14 +121,14 @@ def renormalize_score( # Function to format population numbers. def pretty_print_number(number): - if number >= 1e9: + if pd.isna(number): + return "?" + elif int(number) >= 1e9: return f"{number/1e9:.1f}B" elif number >= 1e6: return f"{number/1e6:.1f}M" elif number >= 1e3: return f"{number/1e3:.1f}k" - elif pd.isna(number): - return "?" else: return f"{int(number)}" @@ -394,6 +401,9 @@ def add_analytics_score(df_reduced: pd.DataFrame) -> pd.DataFrame: # Fill them with a low value (e.g. 0.1) to avoid zeros when calculating the final score. df_reduced["score_analytics"] = df_reduced["score_analytics"].fillna(fillna_value) + # Fill missing views + df_reduced["views"] = df_reduced["views"].fillna(0) + return df_reduced @@ -436,6 +446,7 @@ def anomaly_detection( dry_run: bool = False, force: bool = False, reset_db: bool = False, + sample_n: Optional[int] = None, ) -> None: """Detect anomalies.""" engine = get_engine() @@ -475,11 +486,15 @@ def anomaly_detection( dataset_variable_ids[variable.datasetId].append(variable) for dataset_id, variables_in_dataset in dataset_variable_ids.items(): + # Limit the number of variables. 
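+ # (With the default --sample-n 500 from cli.py below, large datasets are reduced to the indicators most viewed in charts, topped up with a reproducible random sample; see _sample_variables.)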
+ if sample_n and len(variables_in_dataset) > sample_n: + variables_in_dataset = _sample_variables(variables_in_dataset, sample_n) + # Get dataset's checksum with Session(engine) as session: dataset = gm.Dataset.load_dataset(session, dataset_id) - log.info("loading_data.start") + log.info("loading_data.start", variables=len(variables_in_dataset)) variables_old = [ variables[variable_id_old] for variable_id_old in variable_mapping.keys() ] variables_old_and_new = variables_in_dataset + variables_old t = time.time() - df = load_data_for_variables(engine=engine, variables=variables_old_and_new) + try: + df = load_data_for_variables(engine=engine, variables=variables_old_and_new) + except FileNotFoundError as e: + # This happens when a dataset is in DB, but not in a local catalog. + log.error("loading_data.error", error=str(e)) + continue + log.info("loading_data.end", t=time.time() - t) + if df.empty: + continue + for anomaly_type in anomaly_types: # Instantiate the anomaly detector. if anomaly_type not in ANOMALY_DETECTORS: @@ -628,6 +652,10 @@ def load_data_for_variables(engine: Engine, variables: list[gm.Variable]) -> pd. df = pd.DataFrame(variable_data_table_from_catalog(engine, variables=variables)) df = df.rename(columns={"country": "entity_name"}) + if "year" not in df.columns and "date" in df.columns: + log.warning("Anomalist does not work for datasets with `date` column yet.") + return pd.DataFrame() + # Define the list of columns that are not index columns. data_columns = [v.id for v in variables] @@ -642,7 +670,7 @@ def load_data_for_variables(engine: Engine, variables: list[gm.Variable]) -> pd. # Sort data (which may be needed for some detectors). # NOTE: Here, we first convert the entity_name to string, because otherwise the sorting will be based on categorical order (which can be arbitrary). - df = df.astype({"entity_name": str}).sort_values(INDEX_COLUMNS).reset_index(drop=True) + df = df.astype({"entity_name": "string[pyarrow]"}).sort_values(INDEX_COLUMNS).reset_index(drop=True) return df @@ -681,3 +709,30 @@ def combine_and_reduce_scores_df(anomalies: List[gm.Anomaly]) -> pd.DataFrame: # df = df.astype({"year": int}) return df_reduced + + +def _sample_variables(variables: List[gm.Variable], n: int) -> List[gm.Variable]: + """Sample n variables. Prioritize variables that are used in charts, then fill the rest + with random variables.""" + if len(variables) <= n: + return variables + + # Include all variables that are used in charts. + # NOTE: if we run this before the indicator upgrader, none of the new variables will be used in charts yet, so the + # first round of anomalies with random sampling won't be very useful. Subsequent runs should be + # useful, though. + df_views = get_variables_views_in_charts(variable_ids=[v.id for v in variables]) + sample_ids = set(df_views.sort_values("views_365d", ascending=False).head(n)["variable_id"]) + + # Fill the rest with random variables, using the seeded `random` module so the draw is reproducible. + unused_ids = list(set(v.id for v in variables) - sample_ids) + random.seed(1) + if len(sample_ids) < n: + sample_ids |= set(random.sample(unused_ids, n - len(sample_ids))) + + log.info( + "sampling_variables", + original_n=len(variables), + new_n=len(sample_ids), + ) + return [v for v in variables if v.id in sample_ids]
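A minimal standalone sketch of the sampling strategy implemented by `_sample_variables` above — chart-used indicators first, then a reproducible random top-up. The helper name and toy ids here are illustrative only, not part of the diff:

import random

def sample_prioritized(ids_in_charts, all_ids, n):
    # Keep ids that are used in charts (the real code ranks them by 365-day views).
    sample = set(ids_in_charts[:n])
    # Top up with a reproducible random draw from the remaining ids.
    remaining = sorted(set(all_ids) - sample)
    random.seed(1)
    if len(sample) < n:
        sample |= set(random.sample(remaining, n - len(sample)))
    # Preserve the original ordering, as _sample_variables does.
    return [i for i in all_ids if i in sample]

# e.g. sample_prioritized([7, 3], list(range(100)), 5) returns five ids, always including 3 and 7.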
diff --git a/apps/anomalist/cli.py b/apps/anomalist/cli.py index 048180616ef..a2db3b9e232 100644 --- a/apps/anomalist/cli.py +++ b/apps/anomalist/cli.py @@ -62,6 +62,12 @@ type=bool, help="Drop anomalies table and recreate it. This is useful for development when the schema changes.", ) +@click.option( + "--sample-n", + type=int, + default=500, + help="Sample at most N variables from a dataset", +) def cli( anomaly_types: Optional[Tuple[str, ...]], dataset_ids: Optional[list[int]], variable_mapping: str, variable_ids: Optional[list[int]], dry_run: bool, force: bool, reset_db: bool, + sample_n: Optional[int], ) -> None: """TBD @@ -140,6 +147,7 @@ def cli( dry_run=dry_run, force=force, reset_db=reset_db, + sample_n=sample_n, ) diff --git a/apps/anomalist/detectors.py b/apps/anomalist/detectors.py index 4ca26ee5e4c..244c176f9ec 100644 --- a/apps/anomalist/detectors.py +++ b/apps/anomalist/detectors.py @@ -1,6 +1,5 @@ from typing import Dict, List -import numpy as np import pandas as pd import structlog from sklearn.ensemble import IsolationForest @@ -71,17 +70,10 @@ def get_scale_df(self, df: pd.DataFrame, variable_ids: List[int], variable_mappi def get_zeros_df(self, df: pd.DataFrame, variable_ids: List[int]) -> pd.DataFrame: # Create a dataframe of zeros. - df_zeros = pd.DataFrame(np.zeros_like(df), columns=df.columns)[INDEX_COLUMNS + variable_ids] - df_zeros[INDEX_COLUMNS] = df[INDEX_COLUMNS].copy() + df_zeros = df[INDEX_COLUMNS + variable_ids].copy() + df_zeros[variable_ids] = 0 return df_zeros - def get_nans_df(self, df: pd.DataFrame, variable_ids: List[int]) -> pd.DataFrame: - # Create a dataframe of nans. - df_nans = pd.DataFrame(np.empty_like(df), columns=df.columns)[INDEX_COLUMNS + variable_ids] - df_nans[variable_ids] = np.nan - df_nans[INDEX_COLUMNS] = df[INDEX_COLUMNS].copy() - return df_nans - class AnomalyUpgradeMissing(AnomalyDetector): """New data misses entity-years that used to exist in old version.""" diff --git a/apps/anomalist/gp_detector.py b/apps/anomalist/gp_detector.py index 8269def1bb9..3dc6392c751 100644 --- a/apps/anomalist/gp_detector.py +++ b/apps/anomalist/gp_detector.py @@ -100,6 +100,7 @@ def get_score_df(self, df: pd.DataFrame, variable_ids: List[int], variable_mappi return pd.DataFrame() # Create a processing queue with (entity_name, variable_id) pairs + # TODO: we could make probabilities proportional to "relevance" score in anomalist items = _processing_queue( items=list(df_wide.index.unique()), ) @@ -118,8 +119,8 @@ def get_score_df(self, df: pd.DataFrame, variable_ids: List[int], variable_mappi # Get the data for the current entity and variable group = df_wide.loc[(entity_name, variable_id)] - # Skip if the series has only one or fewer data points - if isinstance(group, pd.Series) or len(group) <= 1: + # Skip if the series has only three or fewer data points + if isinstance(group, pd.Series) or len(group) <= 3: continue # Prepare the input features (X) and target values (y) for Gaussian Process @@ -161,9 +162,13 @@ def get_score_df(self, df: pd.DataFrame, variable_ids: List[int], variable_mappi df_score_long["p_value"] = 2 * (1 - norm.cdf(np.abs(df_score_long["z"]))) # Adjust p-values for multiple testing - df_score_long["adj_p_value"] = df_score_long.groupby(["entity_name", "variable_id"]).p_value.transform( - lambda p: multipletests(p, method="fdr_bh")[1] - ) + df_score_long["adj_p_value"] = df_score_long.groupby( + ["entity_name", "variable_id"], observed=True + ).p_value.transform(lambda p: multipletests(p, method="fdr_bh")[1]) + + # Anomalies with adj p-value >= 0.1 are not interesting, drop them. 
This could be + # even stricter + df_score_long = df_score_long[df_score_long["adj_p_value"] < 0.1] # Final score is 1 - p-value df_score_long["anomaly_score"] = 1 - df_score_long["adj_p_value"] @@ -285,20 +290,25 @@ def viz(self, df: pd.DataFrame, variable: gm.Variable, country: Optional[str] = def get_scale_df(self, df: pd.DataFrame, variable_ids: List[int], variable_mapping: Dict[int, int]) -> pd.DataFrame: # NOTE: Ideally, for this detector, the scale should be the difference between a value and the mean, divided by the range of values of the variable. But calculating that may be hard to implement in an efficient way. + log.info("gp_outlier.get_scale_df.start") + t = time.time() + # Create a dataframe of zeros. df_scale = self.get_zeros_df(df, variable_ids) - for variable_id in variable_ids: - # The scale is given by the size of changes in consecutive points (for a given country), as a fraction of the maximum range of values of that variable. - df_scale[variable_id] = abs(df[variable_id].diff().fillna(0)) / ( - df[variable_id].max() - df[variable_id].min() - ) + # The scale is given by the size of changes in consecutive points (for a given country), as a fraction of the maximum range of values of that variable. + ranges = df[variable_ids].max() - df[variable_ids].min() + diff = df[variable_ids].diff().fillna(0).abs() # The previous procedure includes the calculation of the deviation between the last point of an entity and the first point of the next, which is meaningless. # Therefore, make zero the first point of each entity_name for all columns. - df_scale.loc[df_scale["entity_name"] != df_scale["entity_name"].shift(), variable_ids] = 0 + diff.loc[df["entity_name"] != df["entity_name"].shift(), :] = 0 + + df_scale[variable_ids] = diff / ranges # Since this anomaly detector return a long dataframe, we need to melt it. df_scale = df_scale.melt(id_vars=["entity_name", "year"], var_name="variable_id", value_name="score_scale") + log.info("gp_outlier.get_scale_df.end", t=time.time() - t) + return df_scale diff --git a/apps/backport/migrate/garden_cookiecutter/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py b/apps/backport/migrate/garden_cookiecutter/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py index a4968d65180..02d9f00c5ef 100644 --- a/apps/backport/migrate/garden_cookiecutter/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py +++ b/apps/backport/migrate/garden_cookiecutter/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py @@ -11,7 +11,7 @@ def run(dest_dir: str) -> None: # Load data from snapshot. # snap = paths.load_snapshot() - tb = snap.read().set_index(["country", "year"]) + tb = snap.read(safe_types=False).set_index(["country", "year"]) # # Save outputs. diff --git a/apps/chart_animation/cli.py b/apps/chart_animation/cli.py new file mode 100644 index 00000000000..75f8b4fbe05 --- /dev/null +++ b/apps/chart_animation/cli.py @@ -0,0 +1,497 @@ +"""Create a GIF or video for a given chart URL. + +""" +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from urllib.parse import parse_qs, urlencode, urlparse, urlunparse + +import click +import requests +from moviepy import ImageSequenceClip +from PIL import Image +from rich_click.rich_command import RichCommand +from structlog import get_logger +from tqdm.auto import tqdm + +# Initialize log. 
+log = get_logger() + +# Define default downloads folder (to use if either output_gif is None or png_folder is None). +DOWNLOADS_DIR = Path.home() / ".chart_animation" + +# Default maximum number of years to fetch images for. +MAX_NUM_YEARS = 100 + + +def get_chart_metadata(chart_url): + # Given a chart URL, get the chart metadata. + base_url = urlunparse(urlparse(chart_url)._replace(query="")) + chart_metadata_url = str(base_url).rstrip("/") + ".metadata.json" + log.info(f"Fetching metadata from: {chart_metadata_url}") + response = requests.get(chart_metadata_url) + response.raise_for_status() + chart_metadata = response.json() + + return chart_metadata + + +def get_indicator_metadata(indicator_metadata_url): + # Given an indicator metadata URL, get the indicator metadata. + response = requests.get(indicator_metadata_url) + response.raise_for_status() + return response.json() + + +def get_indicators_metadata_from_chart_metadata(chart_metadata, max_workers=None): + # Given a chart metadata, get the metadata for all the indicators in the chart. + + # Get indicator API URLs. + indicator_metadata_urls = [ + column["fullMetadata"] + for column in chart_metadata["columns"].values() + if "undefined" not in column["fullMetadata"] + ] + + # Get metadata for each of the indicators in the chart. + indicators_metadata = [] + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit tasks for each URL + future_to_url = {executor.submit(get_indicator_metadata, url): url for url in indicator_metadata_urls} + for future in as_completed(future_to_url): + try: + indicators_metadata.append(future.result()) + except Exception as e: + print(f"Error fetching metadata from {future_to_url[future]}: {e}") + + return indicators_metadata + + +def get_years_in_chart(chart_url): + # Given a chart URL, get the years available in the chart. + chart_metadata = get_chart_metadata(chart_url) + indicators_metadata = get_indicators_metadata_from_chart_metadata(chart_metadata) + years = sorted( + set( + [ + year["id"] + for year in sum( + [ + values["values"] + for indicator in indicators_metadata + for dimension, values in indicator["dimensions"].items() + if dimension in ["years"] + ], + [], + ) + ] + ) + ) + return years + + +def get_query_parameters_in_chart(chart_url, all_years): + # Select default values. + year_range_open = True + year_start, year_end = min(all_years), max(all_years) + tab = "map" + + # Attempt to get those parameters from the chart URL. + query_params = parse_qs(urlparse(chart_url).query) + if "time" in query_params: + time = query_params["time"][0] + if ".." in time: + year_range_open = True + year_start, year_end = time.split("..") + if year_start == "earliest": + year_start = min(all_years) + if year_end == "latest": + year_end = max(all_years) + else: + year_range_open = False + year_start = int(time) + year_end = year_start + if "tab" in query_params: + tab = query_params["tab"][0] + + params = { + "year_range_open": year_range_open, + "year_min": int(year_start), + "year_max": int(year_end), + "tab": tab, + } + + return params + + +def modify_chart_url(chart_url, year, year_range_open, tab, social_media_square): + # Take a chart URL, modify its parameters, and create a new URL for the PNG download. 
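+ # For example, .../grapher/life-expectancy?tab=map with year=2020 and an open year range becomes .../grapher/life-expectancy.png?tab=map&time=earliest..2020&download=png.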
parsed_url = urlparse(chart_url) + path = parsed_url.path + if not path.endswith(".png"): + path += ".png" + + query_params = parse_qs(parsed_url.query) + if year_range_open: + query_params["time"] = [f"earliest..{year}"] + else: + query_params["time"] = [str(year)] + + if social_media_square: + query_params["imType"] = ["social-media-square"] + query_params["imSquareSize"] = ["1080"] + + query_params["tab"] = [tab] + query_params["download"] = ["png"] + updated_query = urlencode(query_params, doseq=True) + png_url = urlunparse(parsed_url._replace(path=path, query=updated_query)) + return png_url + + +def download_chart_png(png_url, output_file): + # Download a PNG file from a given URL. + output_file = Path(output_file) + + # Skip download if the file already exists. + if output_file.exists(): + log.info(f"File {output_file} already exists. Skipping download.") + return output_file + + # Ensure the directory exists. + output_file.parent.mkdir(parents=True, exist_ok=True) + + # Download PNG. + try: + response = requests.get(png_url, stream=True) + response.raise_for_status() + with open(output_file, "wb") as file: + for chunk in response.iter_content(chunk_size=8192): + file.write(chunk) + return output_file + except Exception as e: + log.error(f"Failed to create file {output_file}: {e}") + return None + + +def get_chart_slug(chart_url): + # Given a chart URL, get the chart slug. + return urlparse(chart_url).path.split("/")[-1] + + +def create_image_file_name(year, year_range_open, tab, social_media_square): + return ( + f"{year}_{'open' if year_range_open else 'close'}_{tab}_{'square' if social_media_square else 'nonsquare'}.png" + ) + + +def get_images_from_chart_url( + chart_url, + png_folder, + tab=None, + social_media_square=False, + years=None, + year_range_open=True, + max_workers=None, + max_num_years=MAX_NUM_YEARS, +): + # Given a chart URL, download the PNGs into a folder. If they already exist, skip them. + + # If the tab parameter is not provided, extract it from the chart URL. + if tab is None: + # Extract query parameters + tab = parse_qs(urlparse(chart_url).query).get("tab", [None])[0] + if tab is None: + # Default to "map" if the tab parameter is not found. + tab = "map" + + if years is None: + years = get_years_in_chart(chart_url=chart_url) + + if not years: + log.error("No years available.") + return None + + if year_range_open: + if len(years) < 2: + log.error("Cannot generate year ranges with fewer than two years.") + return None + years = years[1:] + + if max_num_years is not None and len(years) > max_num_years: + log.error( + f"Number of years ({len(years)}) exceeds the maximum number of years ({max_num_years}). Consider setting years explicitly or increasing max_num_years. Years available: {years}" + ) + return None + + # Download PNGs in parallel. 
+ log.info("Downloading images in parallel.") + image_paths = [] + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit( + download_chart_png, + modify_chart_url(chart_url, year, year_range_open, tab, social_media_square), + Path(png_folder) + / create_image_file_name( + year=year, year_range_open=year_range_open, tab=tab, social_media_square=social_media_square + ), + ): year + for year in years + } + + for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading PNGs"): + try: + image_path = future.result() + if image_path: + image_paths.append(image_path) + except Exception as e: + log.error(f"Error downloading image: {e}") + + return image_paths + + +def prepare_images(image_paths, remove_duplicate_frames, repetitions_last_frame): + # Create a list of images from a list of image paths. + images = [Image.open(img) for img in sorted(image_paths)] + if remove_duplicate_frames: + # Sometimes, even though the list of years is correct for all countries, for the specifically selected ones there may not be any data. + # In this case, the PNGs will be the same, so we can remove duplicates. + images = [images[i] for i in range(len(images)) if i == 0 or images[i] != images[i - 1]] + + # Optionally repeat the last frame. + images += [images[-1]] * repetitions_last_frame + + return images + + +def create_gif_from_images( + image_paths, + output_file, + duration=200, + loops=0, + remove_duplicate_frames=True, + repetitions_last_frame=0, + duration_of_animation=False, +): + # Prepare a list of image objects. + images = prepare_images( + image_paths=image_paths, + remove_duplicate_frames=remove_duplicate_frames, + repetitions_last_frame=repetitions_last_frame, + ) + + if duration_of_animation: + duration = duration // len(images) + + # There seems to be a PIL bug when specifying loops. + if loops == 1: + # Repeat loop only once. + images[0].save(output_file, save_all=True, append_images=images[1:], optimize=True, duration=duration) + elif loops == 0: + # Infinite loop. + images[0].save( + output_file, save_all=True, append_images=images[1:], optimize=True, duration=duration, loop=loops + ) + else: + # Repeat loop a fixed number of times. + images[0].save( + output_file, save_all=True, append_images=images[1:], optimize=True, duration=duration, loop=loops - 1 + ) + log.info(f"GIF successfully created at {output_file}") + return output_file + + +def create_mp4_from_images( + image_paths, + output_file, + duration, + remove_duplicate_frames=True, + repetitions_last_frame=0, + duration_of_animation=False, +): + # Prepare a list of image objects. + images = prepare_images( + image_paths=image_paths, + remove_duplicate_frames=remove_duplicate_frames, + repetitions_last_frame=repetitions_last_frame, + ) + + if duration_of_animation: + duration = duration / len(images) + + # Calculate frame rate from duration per frame. + frame_rate = 1 / (duration / 1000) + + temp_image_paths = [] + for idx, img in enumerate(images): + temp_path = f"/tmp/temp_image_{idx}.png" + img.save(temp_path) + temp_image_paths.append(temp_path) + + clip = ImageSequenceClip(temp_image_paths, fps=frame_rate) + clip.write_videofile(output_file, codec="libx264", fps=frame_rate, preset="slow", audio=False) + + return output_file + + +@click.command(name="chart_animation", cls=RichCommand, help=__doc__) +@click.argument("chart_url", type=str) +@click.option( + "--output-file", + type=str, + default=None, + help=f"Output file path. 
If None, creates a file in {DOWNLOADS_DIR}.", +) +@click.option( + "--output-format", + type=click.Choice(["gif", "mp4"]), + default="gif", + help="Output format (either gif or mp4).", +) +@click.option( + "--tab", + type=click.Choice(["map", "chart"]), + default=None, + help="Chart tab view (either map or chart). If not specified, it is inferred from URL, and otherwise defaults to map.", +) +@click.option( + "--social-media-square", + is_flag=True, + help="Create a square image for social media.", +) +@click.option( + "--years", + type=str, + default=None, + help="Comma-separated list of years to plot. If None, uses all years in the chart. To avoid many queries, a parameter --max-num-years is defined.", +) +@click.option( + "--year-range-open/--year-range-closed", + default=True, + help="Whether the year range is open or closed. If open, the range is from earliest to the year. If closed, the range is only the year.", +) +@click.option( + "--duration", + type=int, + default=200, + help="Duration in milliseconds (of each frame, or of the entire GIF).", +) +@click.option( + "--loops", + type=int, + default=0, + help="Number of times the GIF should loop. 0 = infinite looping.", +) +@click.option( + "--repetitions-last-frame", + type=int, + default=0, + help="Number of repetitions of the last frame.", +) +@click.option( + "--max-workers", + type=int, + default=None, + help="Maximum number of parallel threads. If None, uses the number of CPUs available.", +) +@click.option( + "--png-folder", + type=str, + default=None, + help="Directory to save downloaded PNG images. If None, use Downloads folder.", +) +@click.option( + "--max-num-years", + type=int, + default=MAX_NUM_YEARS, + help="Maximum number of years to download. If the number of years in the chart exceeds this value, the script will stop.", +) +@click.option( + "--duration-of-animation/--duration-of-frame", + default=False, + help="Whether the duration is for each frame or the entire animation.", + is_flag=True, +) +@click.option( + "--remove-duplicate-frames", + is_flag=True, + help="Remove duplicate frames from the GIF.", +) +def cli( + chart_url, + output_format, + output_file, + tab, + social_media_square, + years, + year_range_open, + duration, + loops, + repetitions_last_frame, + max_workers, + png_folder, + max_num_years, + duration_of_animation, + remove_duplicate_frames, +): + # Given a chart URL, create a GIF with the chart data. + + # Parse years. + if years is not None: + years = [int(year) for year in years.split(",")] + + # Get chart slug. + slug = get_chart_slug(chart_url) + + # Determine the default directory for PNGs. + if png_folder is None: + png_folder = DOWNLOADS_DIR / slug + png_folder.mkdir(parents=True, exist_ok=True) + log.info(f"Using Downloads folder for PNGs: {png_folder}") + else: + Path(png_folder).mkdir(parents=True, exist_ok=True) + + # Define output file for the GIF. + if output_file is None: + output_file = DOWNLOADS_DIR / f"{slug}.{output_format}" + + # Get images from chart URL. 
image_paths = get_images_from_chart_url( + chart_url=chart_url, + png_folder=png_folder, + tab=tab, + social_media_square=social_media_square, + years=years, + year_range_open=year_range_open, + max_workers=max_workers, + max_num_years=max_num_years, + ) + + if image_paths: + if output_format == "mp4": + log.info("Creating video...") + return create_mp4_from_images( + image_paths=image_paths, + output_file=output_file, + duration=duration, + remove_duplicate_frames=remove_duplicate_frames, + repetitions_last_frame=repetitions_last_frame, + duration_of_animation=duration_of_animation, + ) + else: + log.info("Creating GIF...") + return create_gif_from_images( + image_paths=image_paths, + output_file=output_file, + duration=duration, + loops=loops, + remove_duplicate_frames=remove_duplicate_frames, + repetitions_last_frame=repetitions_last_frame, + duration_of_animation=duration_of_animation, + ) + else: + log.error("Could not create animation because no images were downloaded.") + return None + + +if __name__ == "__main__": + cli() diff --git a/apps/cli/__init__.py b/apps/cli/__init__.py index eb9a8e6d80c..38007973251 100644 --- a/apps/cli/__init__.py +++ b/apps/cli/__init__.py @@ -73,7 +73,6 @@ def _lazy_load(self, cmd_name): "run-python-step": "etl.run_python_step.main", "map-datasets": "apps.utils.map_datasets.cli", "scan-chart-diff": "apps.utils.scan_chart_diff.cli", - "draft-pr": "apps.utils.draft_pull_request.cli", "profile": "apps.utils.profile.cli", }, }, @@ -168,7 +167,8 @@ def cli_back() -> None: "update": "apps.step_update.cli.cli", "archive": "apps.step_update.cli.archive_cli", "explorer-update": "apps.explorer_update.cli.cli", - "pr": "apps.utils.draft_pull_request.cli", + "prr": "apps.utils.draft_pull_request.cli", + "pr": "apps.pr.cli.cli", }, }, { diff --git a/apps/owidbot/anomalist.py b/apps/owidbot/anomalist.py index f62f17cd606..35a1e231954 100644 --- a/apps/owidbot/anomalist.py +++ b/apps/owidbot/anomalist.py @@ -1,10 +1,12 @@ import time +from sqlalchemy.orm import Session from structlog import get_logger from apps.anomalist.anomalist_api import anomaly_detection from apps.anomalist.cli import load_datasets_new_ids from apps.wizard.app_pages.anomalist.utils import load_variable_mapping +from apps.wizard.utils.io import get_new_grapher_datasets_and_their_previous_versions from etl import grapher_model as gm from etl.config import OWIDEnv from etl.db import read_sql @@ -23,6 +25,12 @@ def run(branch: str) -> None: # Load new dataset ids datasets_new_ids = load_datasets_new_ids(source_engine) + # Append datasets with changed local files. This is done to be compatible with the Anomalist streamlit app. + with Session(source_engine) as session: + datasets_new_ids = list( + set(datasets_new_ids) | set(get_new_grapher_datasets_and_their_previous_versions(session=session)) + ) + if not datasets_new_ids: log.info("No new datasets found.") return diff --git a/apps/owidbot/cli.py b/apps/owidbot/cli.py index 70b5352d63a..3db6c9ec83b 100644 --- a/apps/owidbot/cli.py +++ b/apps/owidbot/cli.py @@ -9,7 +9,7 @@ from rich_click.rich_command import RichCommand from apps.owidbot import anomalist, chart_diff, data_diff, grapher -from etl.config import get_container_name +from etl.config import OWIDBOT_ACCESS_TOKEN, get_container_name from . 
import github_utils as gh_utils @@ -57,7 +57,7 @@ def cli( if repo_name not in get_args(REPOS): raise AssertionError("Invalid repo") - repo = gh_utils.get_repo(repo_name) + repo = gh_utils.get_repo(repo_name, access_token=OWIDBOT_ACCESS_TOKEN) pr = gh_utils.get_pr(repo, branch) if pr is None: log.warning(f"No open PR found for branch {branch}") @@ -139,8 +139,8 @@ def create_comment_body(branch: str, services: Dict[str, str], start_time: float body = f""" Quick links (staging server): -[Site](http://{container_name}/) | [Admin](http://{container_name}/admin/login) | [Wizard](http://{container_name}/etl/wizard/) | [Docs](http://{container_name}/etl/docs/) -|--------------------------------|---|---|---| +[Site Dev](http://{container_name}/) | [Site Preview](https://{branch}.owid.pages.dev/) | [Admin](http://{container_name}/admin) | [Wizard](http://{container_name}/etl/wizard/) | [Docs](http://{container_name}/etl/docs/) +|--------------------------------|----------------------------------|---|---|---| **Login**: `ssh owid@{container_name}` diff --git a/apps/owidbot/github_utils.py b/apps/owidbot/github_utils.py index 082824a2841..abd5f138ab1 100644 --- a/apps/owidbot/github_utils.py +++ b/apps/owidbot/github_utils.py @@ -17,9 +17,12 @@ def get_repo(repo_name: str, access_token: Optional[str] = None) -> github.Repository.Repository: if not access_token: - assert config.OWIDBOT_ACCESS_TOKEN, "OWIDBOT_ACCESS_TOKEN is not set" - access_token = config.OWIDBOT_ACCESS_TOKEN - auth = Auth.Token(access_token) + # Don't auth, be aware that you won't be able to do write operations. You should + # set up your access token on https://github.com/settings/tokens. + auth = None + else: + auth = Auth.Token(access_token) + g = Github(auth=auth) return g.get_repo(f"owid/{repo_name}") @@ -96,6 +99,37 @@ def compute_git_blob_sha1(content: bytes) -> str: return sha1.hexdigest() +def _github_access_token(): + # Use GITHUB_TOKEN if set, otherwise use OWIDBOT_ACCESS_TOKEN + if config.GITHUB_TOKEN: + return config.GITHUB_TOKEN + elif config.OWIDBOT_ACCESS_TOKEN: + return config.OWIDBOT_ACCESS_TOKEN + else: + raise AssertionError("You need to set GITHUB_TOKEN or OWIDBOT_ACCESS_TOKEN in your .env file to commit.") + + +def create_branch_if_not_exists(repo_name: str, branch: str, dry_run: bool) -> None: + """Create a branch if it doesn't exist.""" + repo = get_repo(repo_name, access_token=_github_access_token()) + try: + repo.get_branch(branch) + except github.GithubException as e: + if e.status == 404: + if not dry_run: + try: + master_ref = repo.get_branch("main").commit.sha + log.info(f"Using 'main' branch as reference for creating {branch}.") + except github.GithubException: + master_ref = repo.get_branch("master").commit.sha + log.info(f"Using 'master' branch as reference for creating {branch}.") + log.info(f"Creating branch {branch} with reference {master_ref}.") + repo.create_git_ref(ref=f"refs/heads/{branch}", sha=master_ref) + log.info(f"Branch {branch} created in {repo.name}.") + else: + raise e + + def commit_file_to_github( content: str, repo_name: str, @@ -106,7 +140,7 @@ def commit_file_to_github( ) -> None: """Commit a table to a GitHub repository using the GitHub API.""" # Get the repository object - repo = get_repo(repo_name) + repo = get_repo(repo_name, access_token=_github_access_token()) new_content_checksum = compute_git_blob_sha1(content.encode("utf-8")) try: diff --git a/apps/pr/__init__.py b/apps/pr/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git 
a/apps/pr/categories.py b/apps/pr/categories.py new file mode 100644 index 00000000000..d5d9a5fc487 --- /dev/null +++ b/apps/pr/categories.py @@ -0,0 +1,69 @@ +# Add EMOJIs for each PR type +PR_CATEGORIES = { + "data": { + "emoji": "📊", + "emoji_raw": ":bar_chart:", + "description": "data update or addition", + }, + "bug": { + "emoji": "🐛", + "emoji_raw": ":bug:", + "description": "bug fix for the user", + }, + "refactor": { + "emoji": "🔨", + "emoji_raw": ":hammer:", + "description": "a code change that neither fixes a bug nor adds a feature for the user", + }, + "enhance": { + "emoji": "✨", + "emoji_raw": ":sparkles:", + "description": "visible improvement over a current implementation without adding a new feature or fixing a bug", + }, + "feature": { + "emoji": "🎉", + "emoji_raw": ":tada:", + "description": "new feature for the user", + }, + "docs": { + "emoji": "📜", + "emoji_raw": ":scroll:", + "description": "documentation only changes", + "shortcut_key": "0", + }, + "chore": { + "emoji": "🐝", + "emoji_raw": ":honeybee:", + "description": "upgrading dependencies, tooling, etc. No production code change", + }, + "style": { + "emoji": "💄", + "emoji_raw": ":lipstick:", + "description": "formatting, missing semi colons, etc. No production code change", + }, + "wip": { + "emoji": "🚧", + "emoji_raw": ":construction:", + "description": "work in progress - intermediate commits that will be explained later on", + }, + "tests": { + "emoji": "✅", + "emoji_raw": ":white_check_mark:", + "description": "adding missing tests, refactoring tests, etc. No production code change", + }, +} +PR_CATEGORIES_MD_DESCRIPTION = "- " + "\n- ".join( f"**{choice}**: {choice_params['description']}" for choice, choice_params in PR_CATEGORIES.items() ) +PR_CATEGORIES_CHOICES = [ + { + "title": f"{v['emoji']} {k}", + "value": k, + "shortcut_key": v.get("shortcut_key", k[0]), + } + for k, v in PR_CATEGORIES.items() +] +PR_CATEGORIES_CHOICES = sorted(PR_CATEGORIES_CHOICES, key=lambda x: x["shortcut_key"]) assert len(set([x["shortcut_key"].lower() for x in PR_CATEGORIES_CHOICES])) == len( PR_CATEGORIES_CHOICES ), "Shortcut keys must be unique" diff --git a/apps/pr/cli.py b/apps/pr/cli.py new file mode 100644 index 00000000000..3de16f52888 --- /dev/null +++ b/apps/pr/cli.py @@ -0,0 +1,445 @@ +"""This script creates a new draft pull request in GitHub, which starts a new staging server. + +Arguments: + +`TITLE`: The title of the PR. This must be given. + +`CATEGORY`: The category of the PR. This is optional. If not given, the user will be prompted to choose one. + +**Main use case**: Branch out from `master` to a temporary `work_branch`, and create a PR to merge `work_branch` -> `master`. You will be asked to choose a category. The value of `work_branch` will be auto-generated based on the title and the category. + +```shell +# Without specifying a category (you will be prompted for a category) +etl pr "some title for the PR" + +# With a category +etl pr "some title for the PR" data + +# With a private staging server +etl pr "some title for the PR" --private +``` + +**Custom use case (1)**: Same as main use case, but with a specific branch name for the `work_branch`. + +```shell +etl pr "some title for the PR" --work-branch "this-temporary-branch" +# Shorter +etl pr "some title for the PR" -w "this-temporary-branch" +``` + +**Custom use case (2)**: Create a pull request from `current_branch` to `master`. 
+ +```shell +etl pr "some title for the PR" --direct +``` + +**Custom use case (3)**: Create a pull request from branch `this-temporary-branch` -> `develop`. + +```shell +etl pr "some title for the PR" --direct --base-branch "develop" --work-branch "this-temporary-branch" +# Shorter +etl pr "some title for the PR" --direct -b "develop" -w "this-temporary-branch" +``` +""" + +import hashlib +import os +import re +import uuid +from typing import Optional, cast + +import click +import questionary +import requests +from git import GitCommandError, Repo +from rich_click.rich_command import RichCommand +from structlog import get_logger + +from apps.pr.categories import PR_CATEGORIES, PR_CATEGORIES_CHOICES +from apps.utils.gpt import OpenAIWrapper +from etl.config import GITHUB_TOKEN +from etl.paths import BASE_DIR + +# Initialize logger. +log = get_logger() + +# URL of the Github API, to be used to create a draft pull request in the ETL repos. +GITHUB_API_URL = "https://api.github.com/repos/owid/etl/pulls" + +# Style for questionary +SHELL_FORM_STYLE = questionary.Style( + [ + ("qmark", "fg:#fac800 bold"), # token in front of the question + ("question", "bold"), # question text + ("answer", "fg:#fac800 bold"), # submitted answer text behind the question + ("pointer", "fg:#fac800 bold"), # pointer used in select and checkbox prompts + ("highlighted", "bg:#fac800 fg:#000000 bold"), # pointed-at choice in select and checkbox prompts + ("selected", "fg:#54cc90"), # style for a selected item of a checkbox + ("separator", "fg:#cc5454"), # separator in lists + # ('instruction', ''), # user instructions for select, rawselect, checkbox + ("text", ""), # plain text + # ('disabled', 'fg:#858585 italic') # disabled choices for select and checkbox prompts + ] +) + + +@click.command( + name="pr", + cls=RichCommand, + help=__doc__, +) +@click.argument( + "title", + type=str, + required=True, +) +@click.argument( + "category", + type=click.Choice(list(PR_CATEGORIES.keys()), case_sensitive=False), + required=False, + default=None, +) +@click.option( + "--scope", + "-s", + help="Scope of the PR. This text will be prepended to the PR title. **Examples**: 'demography' for data work on this field, 'etl.db' if working on specific modules, 'wizard', etc.", + default=None, +) +@click.option( + "--work-branch", + "-w", + "work_branch", + type=str, + default=None, + help="The name of the work branch to create. It is auto-generated based on the title and the category. If --direct is used, this is the PR source branch and defaults to the current branch.", +) +@click.option( + "--base-branch", + "-b", + "base_branch", + type=str, + default="master", + help="Name of the base branch. This is the branch to branch out from and merge back into. If --direct is used, this is the PR target branch.", +) +@click.option( + "--direct", + "-d", + is_flag=True, + help="Directly create a PR from the current branch to the target branch (default: master).", +) +@click.option( + "--private", + "-p", + is_flag=True, + help="By default, the staging server site (not the admin) will be publicly accessible. Use --private to make it private instead. This does not apply when using --direct mode.", +) +@click.option( + "--no-llm", + "-n", + is_flag=True, + help="We briefly use LLMs to simplify the title and use it in the branch name. 
Disable this by using -n flag.", +) +def cli( + title: str, + category: Optional[str], + scope: Optional[str], + work_branch: Optional[str], + base_branch: str, + direct: bool, + private: bool, + no_llm: bool, + # base_branch: Optional[str] = None, +) -> None: + # Check that the user has set up a GitHub token. + check_gh_token() + + # Validate title + _validate_title(title) + + # Get category + category = ensure_category(category) + + # Create title + pr_title = PRTitle( + title=title, + category=category, + scope=scope, + ) + + # Initialize repository, get remote branches + repo, remote_branches = init_repo() + + # Get the new branch + work_branch = ensure_work_branch( + repo=repo, + work_branch=work_branch, + direct=direct, + pr_title=pr_title, + remote_branches=remote_branches, + no_llm=no_llm, + ) + + # Check branches main & work make sense! + check_branches_valid(base_branch, work_branch, remote_branches) + + # Auto PR mode: Create a new branch from the base branch. + if not direct: + if private: + if not work_branch.endswith("-private"): + work_branch = f"{work_branch}-private" + branch_out(repo, base_branch, work_branch) + + # Create PR + create_pr(repo, work_branch, base_branch, pr_title) + + +def check_gh_token(): + if not GITHUB_TOKEN: + raise click.ClickException( + """A github token is needed. To create one: +- Go to: https://github.com/settings/tokens +- Click on the dropdown "Generate new token" and select "Generate new token (classic)". +- Give the token a name (e.g., "etl-work"), set an expiration time, and select the scope "repo". +- Click on "Generate token". +- Copy the token and save it in your .env file as GITHUB_TOKEN. +- Run this tool again. +""" + ) + + +def _validate_title(title): + if not bool(re.search(r"\w+", title)): + raise click.ClickException("Invalid title! Use at least one word!") + + +def ensure_category(category: Optional[str]): + """Get category if not provided.""" + if category is None: + # show suggestions + choices = [questionary.Choice(**choice) for choice in PR_CATEGORIES_CHOICES] # type: ignore + category = questionary.select( + message="Please choose a PR category", + choices=choices, + use_shortcuts=True, + style=SHELL_FORM_STYLE, + instruction="(Use shortcuts or arrow keys)", + ).unsafe_ask() + + category = cast(str, category) + + return category + + +class PRTitle: + def __init__(self, title, category, scope): + self.title = title + self.category = category + self.scope = scope + + def __str__(self) -> str: + title_actual = _generate_pr_title(self.title, self.category, self.scope) + if title_actual is None: + raise click.ClickException("Failed to generate PR title.") + return title_actual + + +def init_repo(): + # Initialize a repos object at the root folder of the etl repos. + repo = Repo(BASE_DIR) + # Update the list of remote branches in the local repository. + origin = repo.remote(name="origin") + # NOTE: The option prune=True removes local references to branches that no longer exist on the remote repository. + # Otherwise, this script might raise an error claiming that your proposed branch exists in remote, even if that + # branch was already deleted. + origin.fetch(prune=True) + # List all remote branches. 
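+ # e.g. 'origin/data-update' -> 'data-update'; the symbolic 'origin/HEAD' reference is skipped.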
remote_branches = [ref.name.split("origin/")[-1] for ref in origin.refs if ref.remote_head != "HEAD"] + + return repo, remote_branches + + +def ensure_work_branch(repo, work_branch, direct, pr_title, remote_branches, no_llm): + """Get name of new branch if not provided.""" + # If no name for new branch is given + if work_branch is None: + if not direct: + # Generate name for new branch + work_branch = bake_branch_name(repo, pr_title, no_llm, remote_branches) + else: + # If not explicitly given, the new branch will be the current branch. + work_branch = repo.active_branch.name + if work_branch == "master": + message = "You're currently on 'master' branch. Pass the name of a branch as an argument to create a new branch." + raise click.ClickException(message) + # If a name is given, and not in direct mode + elif (work_branch is not None) and (not direct): + local_branches = [branch.name for branch in repo.branches] + if work_branch in local_branches: + message = ( + f"Branch '{work_branch}' already exists locally. " + "Either choose a different name for the new branch to be created, " + "or switch to the new branch and run this tool without specifying a new branch." + ) + raise click.ClickException(message) + return work_branch + + +def check_branches_valid(base_branch, work_branch, remote_branches): + """Ensure the base branch exists in remote (this should always be true for 'master').""" + # Check base branch (main) + if base_branch not in remote_branches: + raise click.ClickException( + f"Base branch '{base_branch}' does not exist in remote. " + "Either push that branch (git push origin base-branch-name) or use 'master' as a base branch. " + "Then run this tool again." + ) + # Check work branch + if work_branch in remote_branches: + raise click.ClickException( + f"New branch '{work_branch}' already exists in remote. " + "Either manually create a pull request from github, or use a different name for the new branch." + ) + + +def branch_out(repo, base_branch, work_branch): + """Branch out from base_branch and create branch 'work_branch'.""" + try: + log.info( + f"Switching to base branch '{base_branch}', creating new branch '{work_branch}' from there, and switching to it." + ) + repo.git.checkout(base_branch) + repo.git.checkout("-b", work_branch) + except GitCommandError as e: + raise click.ClickException(f"Failed to create a new branch from '{base_branch}':\n{e}") + + +def create_pr(repo, work_branch, base_branch, pr_title): + """Create a draft pull request work_branch -> base_branch.""" + pr_title_str = str(pr_title) + + log.info("Creating an empty commit.") + repo.git.commit("--allow-empty", "-m", pr_title_str or f"Start a new staging server for branch '{work_branch}'") + + log.info("Pushing the new branch to remote.") + repo.git.push("origin", work_branch) + + log.info("Creating a draft pull request.") + headers = {"Authorization": f"token {GITHUB_TOKEN}", "Accept": "application/vnd.github.v3+json"} + data = { + "title": pr_title_str or f":construction: Draft PR for branch {work_branch}", + "head": work_branch, + "base": base_branch, + "body": "", + "draft": True, + } + response = requests.post(GITHUB_API_URL, json=data, headers=headers) + if response.status_code == 201: + js = response.json() + log.info(f"Draft pull request created successfully at {js['html_url']}.") + else: + raise click.ClickException(f"Failed to create draft pull request:\n{response.json()}") + + +def _generate_pr_title(title: str, category: str, scope: str | None) -> Optional[str]: + """Generate the PR title. 
+ + title + category + scope -> 'category scope: title' + title + category -> 'category title' + """ + if title is not None: + prefix = "" + # Add emoji for PR mode chosen if applicable + if category in PR_CATEGORIES: + prefix = PR_CATEGORIES[category]["emoji"] + else: + raise ValueError(f"Invalid PR type '{category}'. Choose one of {list(PR_CATEGORIES.keys())}.") + # Add scope + if scope is not None: + if prefix != "": + prefix += " " + prefix += f"{scope}:" + + # Add prefix + title = f"{prefix} {title}" + return title + + +def bake_branch_name(repo, pr_title, no_llm, remote_branches): + # Get user + # git_config = repo.config_reader() + # user = git_config.get_value("user", "name").lower() + + # Get category + category = pr_title.category + + # Get input title (without emoji, scope, etc.) + title = _extract_relevant_title_for_branch_name(pr_title.title, category, not no_llm) + + # Bake complete PR branch name + # name = f"{user}-{category}-{title}" + name = f"{category}-{title}" + + # If branch name collision + # if name in remote_branches: + # log.info("Generating a hash for this branch name to prevent name collisions.") + # name = f"{name}-{user}" + local_branches = [branch.name for branch in repo.branches] + if (name in remote_branches) or (name in local_branches): + log.info("Generating a hash for this branch name to prevent name collisions.") + name = f"{name}-{generate_short_hash()}" + return name + + +def _extract_relevant_title_for_branch_name(text_in: str, category: str, use_llm) -> str: + """ + Process the input string by: + 1. Removing all symbols, keeping only letters and numbers. + 2. Splitting into a list of words/tokens. + 3. Keeping only the first three tokens (or fewer if not available). + 4. Combining the tokens with a '-'. + + Args: + text_in (str): The input text string. + + Returns: + str: The processed string. + """ + if use_llm: + if "OPENAI_API_KEY" in os.environ: + text_in = summarize_title_llm(text_in) + + cleaned_text = re.sub(r"[^a-zA-Z0-9\s]", "", text_in) + + # Split into tokens/words + tokens = cleaned_text.split() + + # Clean if there is word included in category + tokens = [t for t in tokens if t.lower() != category] + + # Keep only the first 3 tokens + tokens = tokens[:3] + + # Combine tokens with '-' + name = "-".join(tokens).lower() + + return name + + +def generate_short_hash() -> str: + """ + Generate a random short hash (6 characters) using SHA256. + + Returns: + str: A 6-character random hash string. + """ + random_data = uuid.uuid4().hex # Generate random data + random_hash = hashlib.sha256(random_data.encode()).hexdigest() # Create hash + return random_hash[:6] # Return the first 6 characters + + +def summarize_title_llm(title) -> str: + sys_prompt = "You are given a title of a pull request. I need a 2-3 keyword summary, separated by a space. These words will be used to create a branch name." 
api = OpenAIWrapper() + log.info("Querying GPT!") + response = api.query_gpt_fast(title, sys_prompt, model="gpt-4o-mini") + return response diff --git a/apps/step_update/cli.py b/apps/step_update/cli.py index fa7325a2822..f315be84f6c 100644 --- a/apps/step_update/cli.py +++ b/apps/step_update/cli.py @@ -363,7 +363,8 @@ def update_steps( # Tell user how to automatically create PR short_name = steps[-1].split("/")[-1].split(".")[0] - cmd = f'etl pr update-{short_name} --title ":bar_chart: Update {short_name}"' + # cmd = f'etl pro update-{short_name} --title ":bar_chart: Update {short_name}"' + cmd = f'etl pr "{short_name}" data' log.info(f"Create the PR automatically with:\n {cmd}") def _archive_step(self, step: str) -> None: diff --git a/apps/utils/draft_pull_request.py b/apps/utils/draft_pull_request.py index f1b075a3ba5..194a44cfc7b 100644 --- a/apps/utils/draft_pull_request.py +++ b/apps/utils/draft_pull_request.py @@ -96,7 +96,7 @@ def _branch_exists_remotely(new_branch, remote_branches): return False -@click.command(name="draft-pr", cls=RichCommand, help=__doc__) +@click.command(name="pro", cls=RichCommand, help=__doc__) @click.argument( "new-branch", type=str, diff --git a/apps/utils/gpt.py b/apps/utils/gpt.py index 965b19e4472..4550465fbcf 100644 --- a/apps/utils/gpt.py +++ b/apps/utils/gpt.py @@ -217,6 +217,21 @@ def query_gpt( else: raise ValueError("message_content is expected to be a string!") + def query_gpt_fast(self, user_prompt: str, system_prompt: str, model: str = MODEL_DEFAULT) -> str: + """Query Chat GPT to get message content from the chat completion.""" + query = GPTQuery( + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ] + ) + response = self.query_gpt(query=query, model=model) + + if isinstance(response, GPTResponse): + return response.message_content + else: + raise ValueError("Response is expected to be a GPTResponse!") + def get_number_tokens(text: str, model_name: str) -> int: """Get number of tokens of text. diff --git a/apps/utils/scan_chart_diff.py b/apps/utils/scan_chart_diff.py index 7b7ef4d8e94..4a3cec921fc 100644 --- a/apps/utils/scan_chart_diff.py +++ b/apps/utils/scan_chart_diff.py @@ -1,5 +1,6 @@ import click import requests +import streamlit as st from rich_click.rich_command import RichCommand from sqlalchemy.exc import OperationalError, ProgrammingError from structlog import get_logger @@ -34,6 +35,9 @@ def cli(dry_run: bool) -> None: args.append("--dry-run") try: + # Make sure to clear state, otherwise we'd be using cached state from the previous + # branch. + st.session_state.clear() owidbot_cli(args, standalone_mode=False) except ProgrammingError as e: # MySQL is being refreshed and tables are not ready diff --git a/apps/wizard/app_pages/analytics.py b/apps/wizard/app_pages/analytics.py new file mode 100644 index 00000000000..09a829efeb9 --- /dev/null +++ b/apps/wizard/app_pages/analytics.py @@ -0,0 +1,13 @@ +import streamlit as st + +external_url = "http://dashboard.owid.io" + +redirect_script = f""" +<meta http-equiv="refresh" content="0; url={external_url}"> +<p>If you are not redirected automatically, follow <a href="{external_url}">this link</a>.</p> +""" + +st.markdown(redirect_script, unsafe_allow_html=True) diff --git a/apps/wizard/app_pages/anomalist/app.py b/apps/wizard/app_pages/anomalist/app.py index 680f3100020..fb4c92dc65c 100644 --- a/apps/wizard/app_pages/anomalist/app.py +++ b/apps/wizard/app_pages/anomalist/app.py @@ -34,7 +34,7 @@ ) from apps.wizard.utils import cached, set_states, url_persist from apps.wizard.utils.chart_config import bake_chart_config -from apps.wizard.utils.components import Pagination, grapher_chart, st_horizontal, tag_in_md +from apps.wizard.utils.components import Pagination, grapher_chart, st_horizontal, st_multiselect_wider, tag_in_md from apps.wizard.utils.db import WizardDB from etl.config import OWID_ENV from etl.grapher_io import load_variables @@ -161,6 +161,19 @@ def llm_ask(df: pd.DataFrame): ) + +@st.fragment() +def download_anomalies(df: pd.DataFrame): + csv_data = convert_df_to_csv(df) + st.download_button( + "Export data (CSV)", + data=csv_data, + file_name="data.csv", + mime="text/csv", + icon=":material/download:", + help="Download the anomalies as a CSV file. Selected filters apply!", + ) + + @st.dialog("AI summary of anomalies", width="large") def llm_dialog(df: pd.DataFrame): """Ask LLM for summary of the anomalies.""" @@ -360,6 +373,13 @@ def _sort_df(df: pd.DataFrame, sort_strategy: Union[str, List[str]]) -> Tuple[pd return df, columns_sort + +# Function to convert DataFrame to CSV +@st.cache_data +def convert_df_to_csv(df): + df["indicator_uri"] = df["indicator_id"].apply(lambda x: st.session_state.anomalist_indicators.get(x)) + return df.to_csv(index=False).encode("utf-8") + + # Functions to show the anomalies @st.fragment def show_anomaly_compact(index, df): @@ -425,6 +445,11 @@ def show_anomaly_compact(index, df): else: config = bake_chart_config(variable_id=indicator_id, selected_entities=entities) config["hideAnnotationFieldsInTitle"]["time"] = True + config["hideFacetControl"] = False + config["hideShareButton"] = True + config["hideExploreTheDataButton"] = True + # config["isSocialMediaExport"] = False + # Actually plot grapher_chart(chart_config=config, owid_env=OWID_ENV) @@ -549,15 +574,7 @@ def _score_table(df: pd.DataFrame) -> pd.DataFrame: # 2/ DATASET FORM # Ask user to select datasets. By default, we select the new datasets (those that are new in the current PR compared to master). -st.markdown( - """ - """, - unsafe_allow_html=True, -) +st_multiselect_wider() with st.form(key="dataset_search"): query_dataset_ids = [int(v) for v in st.query_params.get_all("anomalist_datasets_selected")] @@ -777,8 +794,11 @@ def _score_table(df: pd.DataFrame) -> pd.DataFrame: # Show anomalies with time and version changes if not df.empty: - # LLM summary option - llm_ask(df) + # Top option buttons + with st_horizontal(): + # LLM summary option + llm_ask(df) + download_anomalies(df) # st.dataframe(df_change) groups = df.groupby(["indicator_id", "type"], sort=False, observed=True) @@ -799,6 +819,7 @@ def _score_table(df: pd.DataFrame) -> pd.DataFrame: # Show controls only if needed if len(items) > items_per_page: pagination.show_controls(mode="bar") - +else: + st.success("Ha! We did not find any anomalies in the selected datasets! 
What were the odds of that?") # Reset state set_states({"anomalist_datasets_submitted": False}) diff --git a/apps/wizard/app_pages/chart_animation.py b/apps/wizard/app_pages/chart_animation.py new file mode 100644 index 00000000000..93be313043b --- /dev/null +++ b/apps/wizard/app_pages/chart_animation.py @@ -0,0 +1,267 @@ +import streamlit as st +from structlog import get_logger + +from apps.chart_animation.cli import ( + DOWNLOADS_DIR, + MAX_NUM_YEARS, + create_gif_from_images, + create_image_file_name, + create_mp4_from_images, + get_chart_slug, + get_images_from_chart_url, + get_query_parameters_in_chart, + get_years_in_chart, +) +from apps.wizard.utils import set_states +from apps.wizard.utils.components import grapher_chart_from_url, st_horizontal, st_info + +# Initialize log. +log = get_logger() + +# PAGE CONFIG +st.set_page_config( + page_title="Wizard: Chart animation", + page_icon="🪄", +) + +# Session state config +# Initialize session state for generated files. +st.session_state.chart_animation_images_folder = st.session_state.get("chart_animation_images_folder", DOWNLOADS_DIR) +st.session_state.chart_animation_image_paths = st.session_state.get("chart_animation_image_paths", None) +st.session_state.chart_animation_images_exist = st.session_state.get("chart_animation_images_exist", False) +st.session_state.chart_animation_gif_file = st.session_state.get("chart_animation_gif_file", None) +st.session_state.chart_animation_iframe_html = st.session_state.get("chart_animation_iframe_html", None) +st.session_state.chart_animation_show_image_settings = st.session_state.get( + "chart_animation_show_image_settings", False +) +# NOTE: The range of years will be loaded automatically from the chart's metadata. We just define this range here to avoid typing issues. +st.session_state.chart_animation_years = st.session_state.get("chart_animation_years", range(2000, 2022)) +st.session_state.chart_animation_years_selected = st.session_state.get( + "chart_animation_years_selected", st.session_state.chart_animation_years +) +st.session_state.chart_animation_max_num_years = MAX_NUM_YEARS + + +# FUNCTIONS +def add_icons_to_tabs(tab_name): + if tab_name == "map": + return ":material/map: map" + elif tab_name == "chart": + return ":material/show_chart: chart" + return f":material/{tab_name}: {tab_name}" + + +######################################################################################################################## +# RENDER +######################################################################################################################## + +# Streamlit app layout. +st.title(":material/animated_images: Chart animation") + +# 1/ INPUT CHART & GET YEARS +chart_url = st.text_input( + "Enter grapher URL", + "", + placeholder="https://ourworldindata.org/grapher/share-electricity-low-carbon?tab=chart&country=OWID_WRL~OWID_EUR~OWID_AFR", + help="Paste the URL of the chart you want to animate. Note that some parameters cannot be extracted from the URL (e.g. the type of tab view). But you can modify them afterwards.", +) + +# Get slug from URL. +slug = get_chart_slug(chart_url=chart_url) + +# Set images folder and output file. +st.session_state.chart_animation_images_folder = DOWNLOADS_DIR / slug +if not st.session_state.chart_animation_images_folder.exists(): + # Create the default output folder if it doesn't exist. 
+ st.session_state.chart_animation_images_folder.mkdir(parents=True) +image_paths = [image for image in st.session_state.chart_animation_images_folder.iterdir() if image.suffix == ".png"] +st.session_state.chart_animation_gif_file = DOWNLOADS_DIR / f"{slug}.gif" +st.session_state.chart_animation_image_paths = image_paths +st.session_state.chart_animation_images_exist = len(image_paths) > 0 + +# Button +if st.button( + "Get chart", + type="primary", +): + if not chart_url: + st.error("Please enter a valid chart URL.") + st.stop() + # Embed the iframe in the app. + set_states( + { + "chart_animation_years": get_years_in_chart(chart_url), + "chart_animation_show_image_settings": True, + } + ) + +# 2/ CONTINUE IF NO ERROR +if st.session_state.chart_animation_show_image_settings: + # Display iframe if it was fetched. + with st.expander("**Preview**", expanded=True): + st.session_state.chart_animation_iframe_html = grapher_chart_from_url(chart_url) + st_info( + "Modify chart as you wish, click on share -> copy link, and paste it in the box above.", + ) + + # 2.1/ CONFIGURE INPUT: CHART EDIT + with st.container(border=True): + st.caption("**Chart settings**") + # Configure the chart (input to animation generation). + query_parameters = get_query_parameters_in_chart( + chart_url, all_years=st.session_state.chart_animation_years + ) + with st_horizontal(): + tab = st.segmented_control( + "Select tab", ["map", "chart"], format_func=add_icons_to_tabs, default=query_parameters["tab"] + ) + st.session_state.chart_animation_max_num_years = st.number_input( + "Maximum number of years", + value=MAX_NUM_YEARS, + help="Maximum number of years to generate images for (to avoid too many API calls).", + ) + + # Create a slider to select min and max years. + year_min, year_max = st.select_slider( + "Select year range", + options=st.session_state.chart_animation_years, + value=(query_parameters["year_min"], query_parameters["year_max"]), + ) + + # Get the selected subset of years. + years = [year for year in st.session_state.chart_animation_years if year_min <= year <= year_max] + + if len(years) > st.session_state.chart_animation_max_num_years: + st.error( + f"Number of years in the chart ({len(years)}) is higher than the maximum number of years ({st.session_state.chart_animation_max_num_years}). You can either increase the maximum number of years or select a smaller range." + ) + st.stop() + + # 2.2/ SHOW OPTIONS FOR IMAGE GENERATION + with st.expander("**Output settings**", expanded=True): + with st_horizontal(): + # Choose: GIF or Video + output_type = st.segmented_control( + "Output format", + ["GIF", "Video"], + default="GIF", + ) + # Social media? + output_style = st.segmented_control( + "Output style", + ["Classic", "Square format"], + default="Classic", + help="Use 'square format' for mobile or social media.", + ) + social_media_square = output_style == "Square format" + + st.session_state.chart_animation_gif_file = st.session_state.chart_animation_gif_file.with_suffix( + ".gif" if output_type == "GIF" else ".mp4" + ) + + # If chart, show option to just show single year + if tab == "chart": + year_range_open = not st.toggle( + "Show single year", + value=not query_parameters["year_range_open"], + help="Only relevant for the chart view. If checked, the animated chart will only display a single year per frame. For LineCharts, this means a sequence of bar charts.
For ScatterCharts, this means a sequence of bubbles (and not vectors).", + ) + else: + year_range_open = True + + with st.container(border=True): + st.caption("**Frame settings**") + remove_duplicates = not st.toggle( + "Allow duplicate frames", + value=False, + help="Some charts may have duplicate frames. If checked, these frames will be shown.", + ) + + with st_horizontal(): + duration = st.number_input( + "Duration (ms)", + value=200, + step=10, + help="Duration (in ms) of each frame, or the entire animation.", + # label_visibility="collapsed", + ) + duration_of = st.segmented_control( + "Duration of", + ["Each frame", "Entire animation"], + help="Choose if the duration parameter refers to each frame, or the entire animation. Note that each frame cannot be shorter than 20ms.", + default="Each frame", + # label_visibility="collapsed", + ) + repetitions_last_frame = st.number_input( + "Duration of the last frame (ms)", + min_value=duration, + value=duration, + step=duration, + help="Increase this to make the last frame last longer.", + ) + repetitions_last_frame = repetitions_last_frame // duration - 1 + if output_type == "GIF": + loop_count = st.number_input("Number of Loops (0 = Infinite)", value=0, step=1) + + # Fetch all needed images (skipping the ones that already exist). + st.session_state.chart_animation_image_paths = get_images_from_chart_url( + chart_url=chart_url, + png_folder=st.session_state.chart_animation_images_folder, + tab=tab, + social_media_square=social_media_square, + years=years, + year_range_open=year_range_open, + max_workers=None, + max_num_years=st.session_state.chart_animation_max_num_years, + ) + st.session_state.chart_animation_images_exist = len(st.session_state.chart_animation_image_paths) > 0 # type: ignore + + # Select only images that match the required parameters. + image_paths_selected = [ + st.session_state.chart_animation_images_folder + / create_image_file_name( + year=year, year_range_open=year_range_open, tab=tab, social_media_square=social_media_square + ) + for year in years + ] + + # GIF/Video generation. + with st.spinner("Generating animation. This can take a few seconds..."): + if output_type == "GIF": + st.session_state.chart_animation_gif_file = create_gif_from_images( + image_paths=image_paths_selected, + output_file=st.session_state.chart_animation_gif_file, + duration=duration, + loops=loop_count, # type: ignore + remove_duplicate_frames=remove_duplicates, + repetitions_last_frame=repetitions_last_frame, + duration_of_animation=duration_of == "Entire animation", + ) + # GIF preview. + st.image(str(st.session_state.chart_animation_gif_file), use_container_width=True) + st_info('Animation preview. Right click and "Save Image As..." to download it.') + else: + st.session_state.chart_animation_gif_file = create_mp4_from_images( + image_paths=image_paths_selected, + output_file=st.session_state.chart_animation_gif_file, + duration=duration, + remove_duplicate_frames=remove_duplicates, + repetitions_last_frame=repetitions_last_frame, + duration_of_animation=duration_of == "Entire animation", + ) + # Video preview + with open(str(st.session_state.chart_animation_gif_file), "rb") as video_file: + st.video(video_file.read(), format="video/mp4", autoplay=True) + st_info('Animation preview. Right click and "Save video as..." to download it.') + + # Button to delete all images in the folder.
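An aside on the `repetitions_last_frame` arithmetic above: the widget collects a last-frame duration in milliseconds (in steps of the per-frame duration), and `// duration - 1` converts it into the number of extra copies of the final frame to append. A small standalone sketch of that conversion (the function name is illustrative, not the app's):

```python
def extra_last_frame_repetitions(duration_ms: int, last_frame_ms: int) -> int:
    # last_frame_ms is entered in steps of duration_ms, so the division is exact;
    # subtracting 1 leaves only the extra copies of the final frame to append.
    return last_frame_ms // duration_ms - 1


assert extra_last_frame_repetitions(200, 200) == 0  # last frame not extended
assert extra_last_frame_repetitions(200, 600) == 2  # last frame shown 3 times in total
```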
+ if st.button( + "Delete images", + disabled=not st.session_state.chart_animation_images_exist, + help=f"To generate the animation, several chart images were downloaded and saved in the folder: `{st.session_state.chart_animation_images_folder}`. Click this button to delete them.", + ): + for image in st.session_state.chart_animation_image_paths: # type: ignore + image.unlink() + # Update session state to reflect that images are deleted. + st.session_state.chart_animation_images_exist = False + st.toast("✅ Images deleted.") diff --git a/apps/wizard/app_pages/chart_diff/chart_diff.py b/apps/wizard/app_pages/chart_diff/chart_diff.py index 4fe27bd967d..3ed8b71f3a0 100644 --- a/apps/wizard/app_pages/chart_diff/chart_diff.py +++ b/apps/wizard/app_pages/chart_diff/chart_diff.py @@ -1,5 +1,6 @@ import datetime as dt import difflib +import json import pprint from typing import Any, Dict, List, Optional @@ -10,8 +11,11 @@ from structlog import get_logger from apps.wizard.utils import get_staging_creation_time +from apps.wizard.utils.io import get_all_changed_catalog_paths from etl import grapher_model as gm +from etl.config import OWID_ENV from etl.db import read_sql +from etl.git_helpers import get_changed_files log = get_logger() @@ -164,7 +168,8 @@ def in_conflict(self) -> bool: return False # Check if chart has been edited in production - chart_edited_in_prod = self.target_chart.updatedAt > get_staging_creation_time() + with Session(OWID_ENV.engine) as session: + chart_edited_in_prod = self.target_chart.updatedAt > get_staging_creation_time(session) # If edited, check if conflict was resolved if chart_edited_in_prod: @@ -198,7 +203,9 @@ def change_types(self) -> list[str]: self._change_types.append("data") if self.modified_checksum["metadataChecksum"].any(): self._change_types.append("metadata") - if self.target_chart and not self.configs_are_equal(): + # NOTE: configs might differ and edited_in_staging is False if the chart had just + # data / metadata changes + if self.edited_in_staging and self.target_chart and not self.configs_are_equal(): self._change_types.append("config") # TODO: Should uncomment this maybe? @@ -351,7 +358,7 @@ def set_conflict_to_resolved(self, session: Session) -> None: def configs_are_equal(self) -> bool: """Compare two chart configs, ignoring version, id and isPublished.""" assert self.target_chart is not None, "Target chart is None!" - return configs_are_equal(self.source_chart.config, self.target_chart.config) + return configs_are_equal(self.source_chart.config, self.target_chart.config, verbose=False) @property def details(self): @@ -579,6 +586,19 @@ def _modified_data_metadata_on_staging( params["chart_ids"] = tuple(chart_ids) source_df = read_sql(query_source, source_session, params=params) + # no charts, return empty dataframe + if source_df.empty: + return pd.DataFrame(columns=["chartId", "dataEdited", "metadataEdited"]).set_index("chartId") + + # Get all changed files and their catalog paths, including downstream dependencies. + files_changed = get_changed_files() + catalog_paths = get_all_changed_catalog_paths(files_changed) + + # Exclude variables that haven't been changed by updating the files. This is to prevent showing + spurious changes from lagging behind master.
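The line that follows implements this comment by reducing each variable's full catalog path to its dataset prefix, i.e. the first four `/`-separated segments (channel/namespace/version/dataset). A standalone sketch with a made-up path:

```python
import pandas as pd

# Hypothetical variable path: channel/namespace/version/dataset/table#column.
catalog_paths = pd.Series(["grapher/energy/2024-06-20/energy_mix/energy_mix#oil_share"])
dataset_paths = catalog_paths.str.split("/").str[:4].str.join("/")
print(dataset_paths[0])  # grapher/energy/2024-06-20/energy_mix
```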
+ dataset_paths = source_df.catalogPath.str.split("/").str[:4].str.join("/") + source_df = source_df[dataset_paths.isin(catalog_paths)] + # no charts, return empty dataframe if source_df.empty: return pd.DataFrame(columns=["chartId", "dataEdited", "metadataEdited"]).set_index("chartId") @@ -634,6 +654,7 @@ def _modified_chart_configs_on_staging( select c.id as chartId, MD5(cc.full) as chartChecksum, + cc.full as chartConfig, c.lastEditedByUserId as chartLastEditedByUserId, c.publishedByUserId as chartPublishedByUserId, c.lastEditedAt as chartLastEditedAt @@ -680,6 +701,20 @@ def _modified_chart_configs_on_staging( diff = source_df.copy() diff["configEdited"] = source_df["chartChecksum"] != target_df["chartChecksum"] + # Go through edited configs and do a more detailed comparison + ix = diff["configEdited"] & target_df["chartChecksum"].notnull() + equal_configs = [] + for chart_id, row in diff.loc[ix].iterrows(): + source_config = json.loads(row["chartConfig"]) + target_config = json.loads(target_df.loc[chart_id, "chartConfig"]) + + # Compare configs + if configs_are_equal(source_config, target_config): + equal_configs.append(chart_id) + + # Exclude configs that have different chartChecksum, but are actually the same (e.g. have just different version) + diff = diff[~diff.index.isin(equal_configs)] + # Add flag 'edited in staging' diff["chartEditedInStaging"] = True diff --git a/apps/wizard/app_pages/fasttrack/load.py b/apps/wizard/app_pages/fasttrack/load.py index 7490f088b2b..8f85c72270c 100644 --- a/apps/wizard/app_pages/fasttrack/load.py +++ b/apps/wizard/app_pages/fasttrack/load.py @@ -294,7 +294,7 @@ def _parse_sources(sources_meta_df: pd.DataFrame) -> Optional[Source]: source = sources[0] if pd.isnull(source.get("date_accessed")): - source.pop("date_accessed") + source.pop("date_accessed", None) if pd.isnull(source.get("publication_year")): source.pop("publication_year") diff --git a/apps/wizard/app_pages/fasttrack/process.py b/apps/wizard/app_pages/fasttrack/process.py index 8f78ac1f5ac..ee4575f8f2d 100644 --- a/apps/wizard/app_pages/fasttrack/process.py +++ b/apps/wizard/app_pages/fasttrack/process.py @@ -151,7 +151,7 @@ def _convert_percentages(data: pd.DataFrame, variables_meta_dict: Dict[str, Vari """Convert percentages to numbers.""" for col in data.columns: if getattr(variables_meta_dict.get(col, {}), "unit", "") == "%": - data[col] = data[col].str.replace("%", "").astype(float) + data[col] = data[col].astype(str).str.replace("%", "").astype(float) return data diff --git a/apps/wizard/app_pages/indicator_upgrade/charts_update.py b/apps/wizard/app_pages/indicator_upgrade/charts_update.py index b3993fe4669..7707ea540d4 100644 --- a/apps/wizard/app_pages/indicator_upgrade/charts_update.py +++ b/apps/wizard/app_pages/indicator_upgrade/charts_update.py @@ -10,6 +10,7 @@ import etl.grapher_model as gm from apps.chart_sync.admin_api import AdminAPI from apps.wizard.utils import set_states, st_page_link, st_toast_error +from apps.wizard.utils.cached import get_grapher_user_id from apps.wizard.utils.db import WizardDB from etl.config import OWID_ENV from etl.helpers import get_schema_from_url @@ -92,8 +93,14 @@ def get_affected_charts_and_preview(indicator_mapping: Dict[int, int]) -> List[g def push_new_charts(charts: List[gm.Chart]) -> None: """Updating charts in the database.""" + # Use Tailscale user if it is available, otherwise use GRAPHER_USER_ID from env + if "X-Forwarded-For" in st.context.headers: + grapher_user_id = get_grapher_user_id(st.context.headers["X-Forwarded-For"]) + 
else: + grapher_user_id = None + # API to interact with the admin tool - api = AdminAPI(OWID_ENV) + api = AdminAPI(OWID_ENV, grapher_user_id=grapher_user_id) # Update charts progress_text = "Updating charts..." bar = st.progress(0, progress_text) @@ -113,7 +120,7 @@ def push_new_charts(charts: List[gm.Chart]) -> None: chart_id = chart.config["id"] else: raise ValueError(f"Chart {chart} does not have an ID in config.") - api.update_chart(chart_id=chart_id, chart_config=config_new, user_id=chart.lastEditedByUserId) + api.update_chart(chart_id=chart_id, chart_config=config_new) # Show progress bar percent_complete = int(100 * (i + 1) / len(charts)) bar.progress(percent_complete, text=f"{progress_text} {percent_complete}%") diff --git a/apps/wizard/app_pages/insight_search.py b/apps/wizard/app_pages/insight_search.py new file mode 100644 index 00000000000..bc342f1ef46 --- /dev/null +++ b/apps/wizard/app_pages/insight_search.py @@ -0,0 +1,336 @@ +import json +import os +import re +from concurrent.futures import ThreadPoolExecutor +from typing import Any, Dict, Tuple + +import pandas as pd +import streamlit as st +from sentence_transformers import SentenceTransformer, util +from structlog import get_logger +from tqdm.auto import tqdm + +from apps.wizard.utils.components import Pagination, st_horizontal, st_multiselect_wider, tag_in_md +from etl.db import read_sql + +# Initialize log. +log = get_logger() + +# PAGE CONFIG +st.set_page_config( + page_title="Wizard: Insight Search", + page_icon="🪄", +) + + +######################################################################################################################## +# FUNCTIONS +######################################################################################################################## +@st.cache_data(show_spinner=False) +def get_model(): + "Load the pre-trained model." + with st.spinner("Loading model..."): + model = SentenceTransformer("all-MiniLM-L6-v2") + return model + + +MODEL = get_model() + + +def get_raw_data_insights() -> pd.DataFrame: + """Get the content of data insights that exist in the database.""" + # Get all data insights from the database. + query = """ + SELECT id, slug, content, published, publishedAt, markdown + FROM posts_gdocs + WHERE type = 'data-insight' + """ + df = read_sql(query) + + return df + + +def extract_text_from_raw_data_insight(content: Dict[str, Any]) -> str: + """Extract the text from the raw data insight, ignoring URLs and other fields.""" + texts = [] + + # Iterate through each element in the 'body' field. + for element in content.get("body", []): + # Check if the element has a 'value' field that contains text. + if "value" in element and isinstance(element["value"], list): + for value_item in element["value"]: + if "text" in value_item: + texts.append(value_item["text"]) + # Include text from children if present. + if "children" in value_item and isinstance(value_item["children"], list): + for child in value_item["children"]: + if "text" in child: + texts.append(child["text"]) + + # Join texts and do some minor cleaning. 
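For context on the traversal above: the parser walks the gdoc `content["body"]`, collecting each element's `text` values and any nested `children` text. A hypothetical, heavily simplified payload of the kind it expects follows; the field names match what the parser reads, but the real gdoc schema likely carries more fields:

```python
# Assumed, simplified shape of content["body"]; values are made up.
body_example = [
    {
        "type": "text",
        "value": [
            {"text": "Solar capacity doubled", "children": [{"text": "since 2020."}]},
        ],
    },
    {"type": "image", "filename": "solar.png", "smallFilename": "solar-small.png"},
]
```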
+ clean_text = " ".join(texts).replace(" .", ".").replace(" ,", ",").replace(" ", " ") + + return clean_text + + +def extract_image_urls_from_raw_data_insight(content) -> Tuple[str | None, str | None]: + url_img_desktop = None + url_img_mobile = None + + for element in content.get("body", []): + if "type" in element and element["type"] == "image": + if "filename" in element: + fname = element["filename"] + name, extension = os.path.splitext(fname) + url_img_desktop = f"https://ourworldindata.org/images/published/{name}_1350{extension}" + if "smallFilename" in element: + fname = element["smallFilename"] + name, extension = os.path.splitext(fname) + url_img_mobile = f"https://ourworldindata.org/images/published/{name}_850{extension}" + break + return url_img_desktop, url_img_mobile + + +def extract_video_urls_from_raw_data_insight(content) -> str | None: + url_video = None + + for element in content.get("body", []): + if "type" in element and element["type"] == "video": + if "url" in element: + url_video = element["url"] + return url_video + + +@st.cache_data(show_spinner=False) +def get_data_insights() -> list[Dict[str, Any]]: + with st.spinner("Loading data insights..."): + # Get the raw data insights from the database. + df = get_raw_data_insights() + + # Parse data insights and construct a convenient dictionary. + insights = [] + for _, di in df.iterrows(): + content = json.loads(di["content"]) + + # Get multimedia urls + url_img_desktop, url_img_mobile = extract_image_urls_from_raw_data_insight(content) + url_vid = extract_video_urls_from_raw_data_insight(content) + + # Get markdown + markdown = di["markdown"] + pattern = r"<(Video|Image|Chart)\b[^>]*\/>" + if markdown is not None: + markdown = re.sub(pattern, "", markdown) + else: + markdown = extract_text_from_raw_data_insight(content) + + # Build DI dictionary + di_dict = { + "id": di["id"], + "title": content["title"], + "raw_text": extract_text_from_raw_data_insight(content), + "authors": content["authors"], + "url_img_desktop": url_img_desktop, + "url_img_mobile": url_img_mobile, + "url_vid": url_vid, + "slug": di["slug"], + "is_public": bool(di["published"]), + "date_published": di["publishedAt"], + "markdown": markdown, + } + + if di_dict["is_public"]: + di_dict["url"] = f"https://ourworldindata.org/data-insights/{di_dict['slug']}" + + insights.append(di_dict) + + return insights + + +def _encode_text(text): + return MODEL.encode(text, convert_to_tensor=True) + + +@st.cache_data(show_spinner=False) +def get_insights_embeddings(insights: list[Dict[str, Any]]) -> list: + with st.spinner("Generating embeddings..."): + # Combine the title, body and authors of each insight into a single string. + insights_texts = [ + insight["title"] + " " + insight["raw_text"] + " " + " ".join(insight["authors"]) for insight in insights + ] + + # Run embedding generation in parallel. + with ThreadPoolExecutor() as executor: + embeddings = list(tqdm(executor.map(_encode_text, insights_texts), total=len(insights_texts))) + + return embeddings + + +def get_sorted_documents_by_similarity( + input_string: str, insights: list[Dict[str, str]], embeddings: list +) -> list[Dict[str, Any]]: + """Ingests an input string and a list of documents, returning the list of documents sorted by their semantic similarity to the input string.""" + _insights = insights.copy() + + # Encode the input string and the document texts. + input_embedding = MODEL.encode(input_string, convert_to_tensor=True) + + # Compute the cosine similarity between the input and each document. 
+ def _get_score(a, b): + score = util.pytorch_cos_sim(a, b).item() + score = (score + 1) / 2 + return score + + similarities = [_get_score(input_embedding, doc_embedding) for doc_embedding in embeddings] # type: ignore + + # Attach the similarity scores to the documents. + for i, doc in enumerate(_insights): + doc["similarity"] = similarities[i] # type: ignore + + # Sort the documents by descending similarity score. + sorted_documents = sorted(_insights, key=lambda x: x["similarity"], reverse=True) + + return sorted_documents + + +def st_display_insight(insight): + # :material/person + authors = ", ".join([tag_in_md(a, "gray", ":material/person") for a in insight["authors"]]) + score = round(insight["similarity"] * 100) + + # Get edit URLs + # url_gdoc = f"https://docs.google.com/document/d/{insight['id']}/edit" + url_admin = f"http://staging-site-covid-reporting-5/admin/gdocs/{insight['id']}/preview" + + with st.container(border=True): + # If public, display special header (inc multimedia content if insight is public) + if insight["is_public"]: + st.markdown(f"#### [{insight['title']}]({insight['url']})") + + # Display header 'Author | Date' + date_str = insight["date_published"].strftime("%B %d, %Y") + date_str = tag_in_md(date_str, "green", ":material/calendar_month") + # header = f"by **{authors}** | published **{date_str}** | [view]({insight['url']})" + st.markdown(f"by {authors} | {date_str} | [:material/edit: edit]({url_admin})") + + # Show multimedia content if available (image, video) + if insight["url_img_desktop"] is not None: + st.image(insight["url_img_desktop"], use_container_width=True) + elif insight["url_vid"] is not None: + st.video(insight["url_vid"]) + # Display only authors if not public + else: + st.markdown(f"#### {insight['title']}") + st.write(f":red[(Draft)] {authors} | [:material/edit: edit]({url_admin})") + + # Render text + text = insight["markdown"].replace("$", "\$") # type: ignore + st.caption(text) + + # Score + st.write(f"**Similarity Score:** {score}%") + + +@st.cache_data(show_spinner=False) +def get_authors_with_DIs(insights): + with st.spinner("Getting author names..."): + return set(author for insight in insights for author in insight["authors"]) + + +######################################################################################################################## +# Fetch all data insights. +insights = get_data_insights() +# Available authors +authors = get_authors_with_DIs(insights) + +# Create an embedding for each insight. +# TODO: This could also be stored in db. +embeddings = get_insights_embeddings(insights) +######################################################################################################################## + + +######################################################################################################################## +# RENDER +######################################################################################################################## + +# Streamlit app layout. 
+st.title(":material/search: DI search") + +# Other interesting links +with st.popover("Additional resources"): + st.markdown( + """ + + - [**Topic diversity**](http://analytics/analytics?sql=--%0D%0A--+Table+of+topics+%22neglected%22+by+our+published%2Fscheduled+data+insights%0D%0A--%0D%0A--+Notes%3A%0D%0A--+++-+%60n_insights_per_1m_views_365d%60+column+represents+the+%23+of+published%2Fscheduled+data%0D%0A--+++++insights+per+1+million+page+views+on+the+topic+in+the+past+365+days.%0D%0A--+++-+views_365d+represents+all+views+on+the+topic+%28including+articles%2C+charts%2C+data%0D%0A--+++++insights%2C+explorers%2C+topic+pages%29.%0D%0A--+++-+published+and+scheduled+data+insights+are+counted%2C+draft+data+insights+are+not.%0D%0A--+%0D%0A%0D%0Awith%0D%0A%0D%0Atopics+as+%28%0D%0A++select+%0D%0A++++topic%2C%0D%0A++++sum%28views_365d%29+as+views_365d%0D%0A++from+pages%0D%0A++join+page_x_topic+using%28url%29%0D%0A++group+by+topic%0D%0A%29%2C%0D%0A%0D%0Acounts+as+%28%0D%0A++select+topic%2C+count%28*%29+as+n_insights%0D%0A++from+%28%0D%0A++++select+unnest%28topics%29+as+topic+%0D%0A++++from+data_insights++--+alternatives%3A+articles%2C+charts%2C+explorers%0D%0A++++--+filter+by+author%3A%0D%0A++++--+where+list_contains%28authors%2C+%27Hannah+Ritchie%27%29%0D%0A++++--+filter+by+days+since+published%3A%0D%0A++++--+where+published_at+%3E+CURRENT_DATE%28%29+-+INTERVAL+90+DAY%0D%0A++%29%0D%0A++group+by+topic%0D%0A%29%0D%0A%0D%0Aselect%0D%0A++topic%2C%0D%0A++views_365d%2C%0D%0A++COALESCE%28n_insights%2C+0%29+as+n_insights%2C%0D%0A++COALESCE%28round%281e6+*+n_insights+%2F+views_365d%2C+1%29%2C+0%29+as+n_insights_per_1m_views_365d%0D%0Afrom+topics%0D%0Aleft+join+counts+using%28topic%29%0D%0Aorder+by+views_365d+desc) (Datasette): Check which topics we've covered so far — and which have been neglected — to find new ideas. + - **Country diversity** (Instagram): Look at which countries we have referenced in our Instagram posts. IG posts originate from a subset of DIs, therefore these can be a good indicator of which countries we are focusing on. + - [Countries covered by Instagram posts](https://admin.owid.io/admin/charts/8259/edit) + - [Average country share of mentions in a post](https://admin.owid.io/admin/charts/8260/edit) + """ + ) +# Box for input text. +input_string = st.text_input( + label="Enter a word or phrase to find the most similar insights.", + placeholder="Type something...", + help="Write any text to find the most similar data insights.", +) + +st_multiselect_wider() +with st_horizontal(): + input_authors = st.multiselect( + label="Authors", + options=authors, + help="Show only insights by selected authors.", + placeholder="Filter by author(s)", + ) + +if input_string or (input_authors != []): + if (len(input_string) < 3) and (len(input_authors) == 0): + st.warning("Please enter at least 3 characters or one author.") + else: + # Get the sorted DIs. + sorted_dis = get_sorted_documents_by_similarity(input_string, insights=insights, embeddings=embeddings) + + # Display the sorted documents. + # TODO: This could be enhanced in different ways: + # * Add a color to similarity score. + # * Show the part of the text that justifies the score (this may also slow down the search). 
+ + # Filter DIs by publication status + options = ["All", "Published", "Drafts"] + selection = st.segmented_control( + "Publication status", + options, + selection_mode="single", + default="All", + label_visibility="collapsed", + ) + + # Filter DIs + match selection: + case "All": + filtered_dis = sorted_dis + case "Published": + filtered_dis = [di for di in sorted_dis if di["is_public"]] + case "Drafts": + filtered_dis = [di for di in sorted_dis if not di["is_public"]] + case _: + filtered_dis = sorted_dis + + # Filter DIs by author + if input_authors: + filtered_dis = [di for di in filtered_dis if any(author in di["authors"] for author in input_authors)] + + # Use pagination + items_per_page = 100 + pagination = Pagination( + items=filtered_dis, + items_per_page=items_per_page, + pagination_key=f"pagination-di-search-{input_string}", + ) + + if len(filtered_dis) > items_per_page: + pagination.show_controls(mode="bar") + + # Show items (only current page) + for item in pagination.get_page_items(): + st_display_insight(item) diff --git a/apps/wizard/app_pages/map_brackets.py b/apps/wizard/app_pages/map_brackets.py index 2a48432ca18..9975ebc55ba 100644 --- a/apps/wizard/app_pages/map_brackets.py +++ b/apps/wizard/app_pages/map_brackets.py @@ -138,7 +138,7 @@ def create_default_chart_config_for_variable(metadata: Dict[str, Any]) -> Dict[s """Create a default chart for a variable with id `variable_id`.""" chart_config = { "hasMapTab": True, - "hasChartTab": False, + "chartTypes": [], "tab": "map", "map": { # "timeTolerance": 0, diff --git a/apps/wizard/config/config.yml b/apps/wizard/config/config.yml index 0ec6f7414fd..f00f6090fc0 100644 --- a/apps/wizard/config/config.yml +++ b/apps/wizard/config/config.yml @@ -37,6 +37,14 @@ main: - "@daniel" entrypoint: app_pages/expert/app.py icon: ":material/hotel_class:" + analytics: + title: "Analytics (external)" + description: "Learn more about OWID in Data!" + maintainer: + - "@bobbie" + - "@lucas" + entrypoint: app_pages/analytics.py + icon: ":material/bar_chart:" # ETL steps etl: @@ -86,7 +94,7 @@ etl: image_url: "https://upload.wikimedia.org/wikipedia/commons/b/be/Alonso_%28Renault%29_qualifying_at_USGP_2005.jpg" sections: - - title: "Data tools" + - title: "Data curation" description: |- Updates, data edits, etc. 
apps: @@ -96,7 +104,7 @@ sections: description: Migrate indicators in charts maintainer: "@lucas" icon: ":material/upgrade:" - image_url: "https://superheroetc.wordpress.com/wp-content/uploads/2017/05/bulbasaur-line.jpg" + image_url: "https://wallpapers.com/images/hd/pokemon-evolution-1920-x-1080-wallpaper-h5u6nbg98e3tibyn.jpg" disable: production: True - title: "Anomalist" @@ -105,7 +113,7 @@ sections: description: List anomalies in data maintainer: "@lucas" icon: ":material/planner_review:" - image_url: "https://i0.pickpik.com/photos/87/645/315/halloween-ghosts-happy-halloween-ghost-preview.jpg" + image_url: "https://www.shutterstock.com/image-illustration/multitude-endless-standing-white-eggs-260nw-310142192.jpg" disable: production: True - title: "Chart Diff" @@ -114,7 +122,7 @@ sections: description: Compare charts in staging and prod maintainer: "@mojmir" icon: ":material/difference:" - image_url: "https://static.wikia.nocookie.net/dragonball/images/6/60/FusionDanceFinaleGotenTrunksBuuSaga.png" + image_url: "https://images.squarespace-cdn.com/content/v1/5ea8287254c2e00f35d1cc48/1595511684480-6RM6YK3XZGB2SD21GPGP/20140620appletoapplescomparison.jpg" disable: production: True - title: "Harmonizer" @@ -123,9 +131,9 @@ sections: maintainer: "@lucas" entrypoint: app_pages/harmonizer.py icon: ":material/music_note:" - image_url: "https://upload.wikimedia.org/wikipedia/commons/thumb/c/c1/C_triad.svg/2560px-C_triad.svg.png" + image_url: "https://www.shutterstock.com/image-vector/abstract-vector-musical-background-music-260nw-485443810.jpg" - - title: "Monitoring" + - title: "Data monitoring" description: |- Control panel for ETL steps. apps: @@ -167,13 +175,20 @@ sections: description: |- Research tools. apps: + - title: "Insight search" + alias: insight_search + description: "Browse data insights by their semantic similarity" + maintainer: "@pablo" + entrypoint: app_pages/insight_search.py + icon: ":material/search:" + image_url: "https://img.freepik.com/premium-photo/librarian-cataloging-new-books-library-database_1237301-1719.jpg" - title: "Insighter" alias: insighter description: "Data insights with GPT" maintainer: "@daniel" entrypoint: app_pages/datainsight_robot.py icon: ":material/lightbulb:" - image_url: "https://assets.change.org/photos/7/zi/ph/ZMziPHIKGSDaDCJ-800x450-noPad.jpg?1525274743" + image_url: "https://cdn.cpdonline.co.uk/wp-content/uploads/2023/08/03153310/Starting-To-Become-A-Writer-1200x350.jpg" - title: "Misc" description: |- @@ -185,14 +200,21 @@ sections: maintainer: "@lucas" entrypoint: app_pages/news.py icon: ":material/newspaper:" - image_url: "https://www.tsanet.org/wp-content/uploads/2022/08/latest-news.jpg" + image_url: "https://img.freepik.com/free-photo/top-view-old-french-newspaper-pieces_23-2149318857.jpg" - title: "owidle" alias: owidle description: "Guess the country game!" 
maintainer: "@lucas" entrypoint: app_pages/owidle.py icon: ":material/videogame_asset:" - image_url: "https://upload.wikimedia.org/wikipedia/en/thumb/e/e0/WPVG_icon_2016.svg/160px-WPVG_icon_2016.svg.png" + image_url: "https://t3.ftcdn.net/jpg/06/26/23/36/360_F_626233679_tesiSRP9Jinq5wS0ZgbdJ6k5adupmgKl.jpg" + - title: "Chart animation" + alias: chart-animation + description: "Create an animated GIF or a video from a chart" + maintainer: "@pablo" + entrypoint: app_pages/chart_animation.py + icon: ":material/animated_images:" + image_url: "https://img.freepik.com/free-photo/abstract-yellow-smooth-wave-lines_1017-24863.jpg" - title: "Metadata" description: |- diff --git a/apps/wizard/etl_steps/cookiecutter/garden/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py b/apps/wizard/etl_steps/cookiecutter/garden/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py index fad0174bcdc..145c24b3959 100644 --- a/apps/wizard/etl_steps/cookiecutter/garden/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py +++ b/apps/wizard/etl_steps/cookiecutter/garden/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py @@ -15,7 +15,7 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset("{{cookiecutter.short_name}}") # Read table from meadow dataset. - tb = ds_meadow["{{cookiecutter.short_name}}"].reset_index() + tb = ds_meadow.read("{{cookiecutter.short_name}}") # # Process data. diff --git a/apps/wizard/etl_steps/cookiecutter/grapher/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py b/apps/wizard/etl_steps/cookiecutter/grapher/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py index 5addd151c58..659143e7856 100644 --- a/apps/wizard/etl_steps/cookiecutter/grapher/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py +++ b/apps/wizard/etl_steps/cookiecutter/grapher/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py @@ -14,11 +14,7 @@ def run(dest_dir: str) -> None: ds_garden = paths.load_dataset("{{cookiecutter.short_name}}") # Read table from garden dataset. - tb = ds_garden["{{cookiecutter.short_name}}"] - - # - # Process data. - # + tb = ds_garden.read("{{cookiecutter.short_name}}", reset_index=False) # # Save outputs. diff --git a/apps/wizard/etl_steps/cookiecutter/snapshot/{{cookiecutter.namespace}}/{{cookiecutter.snapshot_version}}/{{cookiecutter.short_name}}.py b/apps/wizard/etl_steps/cookiecutter/snapshot/{{cookiecutter.namespace}}/{{cookiecutter.snapshot_version}}/{{cookiecutter.short_name}}.py index 68268eaf79a..0f07ecb9c03 100644 --- a/apps/wizard/etl_steps/cookiecutter/snapshot/{{cookiecutter.namespace}}/{{cookiecutter.snapshot_version}}/{{cookiecutter.short_name}}.py +++ b/apps/wizard/etl_steps/cookiecutter/snapshot/{{cookiecutter.namespace}}/{{cookiecutter.snapshot_version}}/{{cookiecutter.short_name}}.py @@ -12,7 +12,7 @@ @click.command() @click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") {% if cookiecutter.dataset_manual_import == True %} -@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +@click.option("--path-to-file", "-f", prompt=True, type=str, help="Path to local data file.") def main(path_to_file: str, upload: bool) -> None: # Create a new snapshot. 
snap = Snapshot(f"{{cookiecutter.namespace}}/{SNAPSHOT_VERSION}/{{cookiecutter.short_name}}.{{cookiecutter.file_extension}}") diff --git a/apps/wizard/etl_steps/snapshot.py b/apps/wizard/etl_steps/snapshot.py index ed9d8c483f4..8cc5b71c90d 100644 --- a/apps/wizard/etl_steps/snapshot.py +++ b/apps/wizard/etl_steps/snapshot.py @@ -789,7 +789,7 @@ def run_snap_step() -> None: # Display next steps if form.dataset_manual_import: - manual_import_instructions = "--path-to-file **relative path of file**" + manual_import_instructions = "-f **relative path of file**" else: manual_import_instructions = "" st.subheader("Next steps") @@ -845,7 +845,7 @@ def run_snap_step() -> None: commands = ["uv", "run", "python", script_path] if form.dataset_manual_import: # Get snapshot local file - commands.extend(["--path-to-file", st.session_state["snapshot_file"]]) + commands.extend(["-f", st.session_state["snapshot_file"]]) command_str = f"`{' '.join(commands)}`" # Run step diff --git a/apps/wizard/home.py b/apps/wizard/home.py index b3635a585cf..06c51bacb58 100644 --- a/apps/wizard/home.py +++ b/apps/wizard/home.py @@ -6,7 +6,7 @@ from streamlit_card import card from apps.wizard.config import WIZARD_CONFIG -from apps.wizard.utils import set_staging_creation_time, st_page_link +from apps.wizard.utils import st_page_link st.set_page_config( page_title="Wizard: Home", @@ -24,14 +24,26 @@ def st_show_home(): st.title("Wizard 🪄") with cols[1]: st.caption(f"streamlit {st.__version__}") - # Expert link - st_page_link( - "expert", - label="Questions about ETL or Grapher? Ask the expert!", - help="Ask the expert any documentation question!", - use_container_width=True, - border=True, - ) + + # Relevant links + with st.container(border=False): + cols = st.columns(2, vertical_alignment="center") + with cols[0]: + st_page_link( + "expert", + label="Questions about ETL or Grapher? Ask the expert!", + help="Ask the expert any documentation question!", + use_container_width=True, + border=True, + ) + with cols[1]: + st_page_link( + "analytics", + label="OWID Analytics", + help="Learn more with the OWID Analytics dashboard. It redirects you to another internal site.", + use_container_width=True, + border=True, + ) # Generic tools ## Default styling for the cards (Wizard apps are presented as cards) @@ -199,8 +211,5 @@ def create_card( st.switch_page(app["entrypoint"]) -# Load some config parameters -set_staging_creation_time() - # Show the home page st_show_home() diff --git a/apps/wizard/utils/__init__.py b/apps/wizard/utils/__init__.py index b6726fa6770..89ad8753306 100644 --- a/apps/wizard/utils/__init__.py +++ b/apps/wizard/utils/__init__.py @@ -663,24 +663,13 @@ def _get_staging_creation_time(session: Session): return create_time -def get_staging_creation_time(session: Optional[Session] = None): +def get_staging_creation_time(session: Session): """Get staging server creation time.""" - if VARNAME_STAGING_CREATION_TIME not in st.session_state: - set_staging_creation_time(session) - return st.session_state[VARNAME_STAGING_CREATION_TIME] - - -def set_staging_creation_time(session: Optional[Session] = None, key: str = VARNAME_STAGING_CREATION_TIME) -> None: - """Gest staging server creation time estimate.""" - - if session is None: - if OWID_ENV.env_remote == "staging": - with Session(OWID_ENV.engine) as session: - st.session_state[key] = _get_staging_creation_time(session) - else: - st.session_state[key] = None - else: + # Create a unique key for a session to avoid conflicts when working with multiple staging servers. 
+ key = f"{VARNAME_STAGING_CREATION_TIME}_{str(session.bind)}" + if key not in st.session_state: st.session_state[key] = _get_staging_creation_time(session) + return st.session_state[key] def st_toast_error(message: str) -> None: diff --git a/apps/wizard/utils/cached.py b/apps/wizard/utils/cached.py index 540039c93aa..3e66574927a 100644 --- a/apps/wizard/utils/cached.py +++ b/apps/wizard/utils/cached.py @@ -1,17 +1,28 @@ +import json +import logging +import subprocess from typing import Any, Dict, List, Optional, Tuple import pandas as pd import streamlit as st +import structlog from owid.catalog import find from sqlalchemy.orm import Session +import etl.grapher_model as gm from apps.utils.map_datasets import get_grapher_changes from etl import grapher_io as gio from etl.config import OWID_ENV, OWIDEnv +from etl.db import get_engine from etl.git_helpers import get_changed_files from etl.grapher_model import Anomaly, Variable from etl.version_tracker import VersionTracker +log = structlog.get_logger() + +# silence WARNING streamlit.runtime.caching.cache_data_api: No runtime found, using MemoryCacheStorageManager +logging.getLogger("streamlit.runtime.caching.cache_data_api").setLevel(logging.ERROR) + @st.cache_data def load_entity_ids(entity_ids: Optional[List[int]] = None): @@ -162,3 +173,53 @@ def load_latest_population(): ).rename(columns={"country": "entity_name"}, errors="raise") return population + + +@st.cache_data +def get_tailscale_ip_to_user_map(): + """Get the mapping of Tailscale IPs to github usernames.""" + proc = subprocess.run(["tailscale", "status", "--json"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + + if proc.returncode != 0: + log.warning(f"Error getting Tailscale status: {proc.stderr}") + return {} + + status = json.loads(proc.stdout) + ip_to_user = {} + + # Map user IDs to display names + user_id_to_name = {} + for user_id, user_info in status.get("User", {}).items(): + if "LoginName" in user_info: + user_id_to_name[int(user_id)] = user_info["LoginName"] + + # Map IPs to user display names + for peer in status.get("Peer", {}).values(): + user_id = peer.get("UserID") + login_name = user_id_to_name.get(user_id) + if login_name: + for ip in peer.get("TailscaleIPs", []): + ip_to_user[ip] = login_name + + return ip_to_user + + +@st.cache_data +def get_grapher_user_id(user_ip: str) -> Optional[int]: + """Get the Grapher user ID associated with the given Tailscale IP address.""" + # Get Tailscale IP-to-User mapping + ip_to_user_map = get_tailscale_ip_to_user_map() + + # Get the Tailscale display name / github username associated with the client's IP address + github_user_name = ip_to_user_map.get(user_ip) + + if not github_user_name: + return None + + with Session(get_engine()) as session: + grapher_user = gm.User.load_user(session, github_user_name) + + if grapher_user: + return grapher_user.id + else: + return None diff --git a/apps/wizard/utils/chart_config.py b/apps/wizard/utils/chart_config.py index e6120255bb5..82dbd0b2890 100644 --- a/apps/wizard/utils/chart_config.py +++ b/apps/wizard/utils/chart_config.py @@ -31,7 +31,7 @@ "hideLegend": False, "tab": "chart", "logo": "owid", - "$schema": "https://files.ourworldindata.org/schemas/grapher-schema.005.json", + "$schema": "https://files.ourworldindata.org/schemas/grapher-schema.006.json", "showYearLabels": False, "id": 807, "selectedFacetStrategy": "none", @@ -41,7 +41,7 @@ "version": 14, "sortOrder": "desc", "maxTime": "latest", - "type": "LineChart", + "chartTypes": ["LineChart"], "hideRelativeToggle": 
True, "addCountryMode": "add-country", "hideAnnotationFieldsInTitle": {"entity": False, "changeInPrefix": False, "time": False}, @@ -65,7 +65,6 @@ "missingDataStrategy": "auto", "isPublished": False, "timelineMinTime": "earliest", - "hasChartTab": True, "timelineMaxTime": "latest", "sortBy": "total", } @@ -84,7 +83,7 @@ def bake_chart_config( Bakes a very basic config, which will be enough most of the times. If you want a more complex config, use this as a baseline to adjust to your needs. - Note: You can find more details on our Grapher API at https://files.ourworldindata.org/schemas/grapher-schema.005.json. + Note: You can find more details on our Grapher API at https://files.ourworldindata.org/schemas/grapher-schema.latest.json. """ # Define chart config diff --git a/apps/wizard/utils/components.py b/apps/wizard/utils/components.py index 258bdb1e3d5..c2934548512 100644 --- a/apps/wizard/utils/components.py +++ b/apps/wizard/utils/components.py @@ -76,7 +76,7 @@ def grapher_chart( You can either plot a given chart config (using chart_config) or plot an indicator with its default metadata using either catalog_path, variable_id or variable. - Note: You can find more details on our Grapher API at https://files.ourworldindata.org/schemas/grapher-schema.005.json. + Note: You can find more details on our Grapher API at https://files.ourworldindata.org/schemas/grapher-schema.latest.json. Parameters ---------- @@ -111,6 +111,16 @@ def grapher_chart( _chart_html(chart_config, owid_env, height=height, **kwargs) +def grapher_chart_from_url(chart_url: str, height=600): + """Plot a Grapher chart using the Grapher API.""" + chart_animation_iframe_html = f""" + + """ + return st.components.v1.html(chart_animation_iframe_html, height=height) # type: ignore + + def _chart_html(chart_config: Dict[str, Any], owid_env: OWIDEnv, height=600, **kwargs): """Plot a Grapher chart using the Grapher API. @@ -309,3 +319,19 @@ def _change_page(): on_change=_change_page, key=self.pagination_key, ) + + +def st_multiselect_wider(num_px: int = 1000): + st.markdown( + f""" + """, + unsafe_allow_html=True, + ) + + +def st_info(text): + st.info(text, icon=":material/info:") diff --git a/apps/wizard/utils/io.py b/apps/wizard/utils/io.py index e6918e7ec61..aa0198be39c 100644 --- a/apps/wizard/utils/io.py +++ b/apps/wizard/utils/io.py @@ -14,7 +14,8 @@ from apps.wizard.utils.cached import get_datasets_from_version_tracker from etl.git_helpers import get_changed_files from etl.grapher_io import get_all_datasets -from etl.paths import BASE_DIR, STEP_DIR +from etl.paths import BASE_DIR, SNAPSHOTS_DIR, STEP_DIR +from etl.steps import filter_to_subgraph, load_dag # Initialize logger. log = get_logger() @@ -61,9 +62,8 @@ def get_steps_df(archived: bool = True): ######################################################################################################################## -def get_changed_grapher_steps(files_changed: Dict[str, Dict[str, str]]) -> List[str]: - """Get list of new grapher steps with their corresponding old steps.""" - grapher_steps = [] +def get_changed_steps(files_changed: Dict[str, Dict[str, str]]) -> List[str]: + changed_steps = [] for file_path, file_status in files_changed.items(): # File status can be: D (deleted), A (added), M (modified). # NOTE: In principle, we could select only "A" files. But it is possible that the user adds a new grapher step, and then commits changes to it, in which case (I think) the status would be "M". 
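The hunks below generalize the grapher-only filter into `get_changed_steps` and add `get_all_changed_catalog_paths`, which maps changed files to dataset catalog paths and then expands them to downstream dependents via the DAG. The core path manipulation, sketched standalone with an illustrative path:

```python
from pathlib import Path

# A changed garden step (illustrative path):
step_file = Path("etl/steps/data/garden/energy/2024-06-20/energy_mix.py")
# Strip the steps root, then the suffix twice (so *.csv.dvc snapshots also reduce cleanly):
ds_path = step_file.relative_to("etl/steps/data").with_suffix("").with_suffix("").as_posix()
print(ds_path)  # garden/energy/2024-06-20/energy_mix

# Downstream dependents are then collected from the DAG, as in the hunk below:
#   dag_steps = filter_to_subgraph(load_dag(), [ds_path], downstream=True)
```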
@@ -73,14 +73,26 @@ def get_changed_grapher_steps(files_changed: Dict[str, Dict[str, str]]) -> List[ # Skip deleted files. continue - # Identify grapher data steps, and ignore the rest. - if file_path.startswith(STEP_DIR.relative_to(BASE_DIR).as_posix()) and file_path.endswith(".py"): - if Path(file_path).with_suffix("").as_posix().split("/")[-4] == "grapher": - grapher_steps.append(file_path) + # Identify potential recipes for data steps + if file_path.startswith( + (STEP_DIR.relative_to(BASE_DIR).as_posix(), SNAPSHOTS_DIR.relative_to(BASE_DIR).as_posix()) + ): + changed_steps.append(file_path) else: continue - return grapher_steps + return changed_steps + + +def get_changed_grapher_steps(files_changed: Dict[str, Dict[str, str]]) -> List[str]: + """Get list of new grapher steps with their corresponding old steps.""" + steps = [] + for step_path in get_changed_steps(files_changed): + if step_path.endswith(".py"): + parts = Path(step_path).with_suffix("").as_posix().split("/") + if len(parts) >= 4 and parts[-4] == "grapher": + steps.append(step_path) + return steps def get_new_grapher_datasets_and_their_previous_versions(session: Session) -> Dict[int, Optional[int]]: @@ -136,3 +148,37 @@ def get_new_grapher_datasets_and_their_previous_versions(session: Session) -> Di new_datasets[ds_id] = previous_dataset return new_datasets + + +def get_all_changed_catalog_paths(files_changed: Dict[str, Dict[str, str]]) -> List[str]: + """Get all changed steps and their downstream dependencies.""" + dataset_catalog_paths = [] + + # Get catalog paths of all datasets with changed files. + for step_path in get_changed_steps(files_changed): + abs_step_path = BASE_DIR / Path(step_path) + try: + # TODO: use StepPath from https://github.com/owid/etl/pull/3165 to refactor this + if step_path.startswith("snapshots/"): + ds_path = abs_step_path.relative_to(SNAPSHOTS_DIR).with_suffix("").with_suffix("").as_posix() + else: + ds_path = abs_step_path.relative_to(STEP_DIR / "data").with_suffix("").with_suffix("").as_posix() + dataset_catalog_paths.append(ds_path) + except ValueError: + continue + + # NOTE: + # This is OK, as it filters down the DAG a little bit. But using VersionTracker.steps_df would be much more precise. You could do: + # steps_df[steps_df["step"].isin([...])]["all_active_usages"] + # And that would give you only the steps that are affected by the changed files. That would be ultimately what we need. But I + # understand that loading steps_df is very slow. + + # Add all downstream dependencies of those datasets. + DAG = load_dag() + dag_steps = list(filter_to_subgraph(DAG, dataset_catalog_paths, downstream=True).keys()) + + # From data://... extract catalogPath + # TODO: use StepPath from https://github.com/owid/etl/pull/3165 to refactor this + catalog_paths = [step.split("://")[1] for step in dag_steps if step.startswith("data://")] + + return catalog_paths diff --git a/dag/archive/agriculture.yml b/dag/archive/agriculture.yml index a5a794024a2..0fc575e7f69 100644 --- a/dag/archive/agriculture.yml +++ b/dag/archive/agriculture.yml @@ -1,58 +1,5 @@ steps: # - # Attainable yields. - # - data://garden/agriculture/2023-05-26/attainable_yields: - - data://garden/papers/2023-05-26/mueller_et_al_2012 - - data://garden/faostat/2023-02-22/faostat_qcl - data://grapher/agriculture/2023-05-26/attainable_yields: - - data://garden/agriculture/2023-05-26/attainable_yields - # - # Crop yields explorer.
- # - data://explorers/agriculture/2023-05-26/crop_yields: - - data://grapher/agriculture/2023-05-26/attainable_yields - # - # Long-term wheat yields in Europe. - # - data://garden/agriculture/2023-04-20/long_term_wheat_yields: - - data://garden/papers/2023-04-20/bayliss_smith_wanmali_1984 - - data://garden/faostat/2023-02-22/faostat_qcl - data://grapher/agriculture/2023-04-20/long_term_wheat_yields: - - data://garden/agriculture/2023-04-20/long_term_wheat_yields - # - # Long-run crop yields. - # - data://garden/agriculture/2023-05-30/long_term_crop_yields: - - data://garden/faostat/2023-02-22/faostat_qcl - - data://garden/agriculture/2023-04-21/uk_long_term_yields - - data://garden/usda_nass/2023-04-20/us_corn_yields - - data://garden/agriculture/2023-04-20/long_term_wheat_yields - data://grapher/agriculture/2023-05-30/long_term_crop_yields: - - data://garden/agriculture/2023-05-30/long_term_crop_yields - # - # Long-term yields in the United Kingdom. - # - data://garden/agriculture/2023-04-21/uk_long_term_yields: - - data://garden/papers/2023-04-21/broadberry_et_al_2015 - - data://garden/papers/2023-04-21/brassley_2000 - - data://garden/faostat/2023-02-22/faostat_qcl - data://grapher/agriculture/2023-04-21/uk_long_term_yields: - - data://garden/agriculture/2023-04-21/uk_long_term_yields - # - # Attainable yields. - # - data://garden/agriculture/2023-05-30/attainable_yields: - - data://garden/papers/2023-05-26/mueller_et_al_2012 - - data://garden/agriculture/2023-05-30/long_term_crop_yields - data://grapher/agriculture/2023-05-30/attainable_yields: - - data://garden/agriculture/2023-05-30/attainable_yields - # - # Crop yields explorer. - # - data://explorers/agriculture/2023-05-30/crop_yields: - - data://grapher/agriculture/2023-05-30/attainable_yields - # # Long-term wheat yields in Europe. # data://garden/agriculture/2023-06-12/long_term_wheat_yields: diff --git a/dag/archive/animal_welfare.yml b/dag/archive/animal_welfare.yml index 0204ecc90b0..aab2628e1b6 100644 --- a/dag/archive/animal_welfare.yml +++ b/dag/archive/animal_welfare.yml @@ -1,14 +1,5 @@ steps: # - # Fur laws (Fur Free Alliance, 2023). - # - data://meadow/animal_welfare/2023-09-08/fur_laws: - - snapshot://animal_welfare/2023-09-08/fur_laws.pdf - data://garden/animal_welfare/2023-09-08/fur_laws: - - data://meadow/animal_welfare/2023-09-08/fur_laws - data://grapher/animal_welfare/2023-09-08/fur_laws: - - data://garden/animal_welfare/2023-09-08/fur_laws - # # Bullfighting laws (Various sources, 2023). 
# data://meadow/animal_welfare/2023-09-05/bullfighting_laws: diff --git a/dag/archive/artificial_intelligence.yml b/dag/archive/artificial_intelligence.yml index 70476d70855..a69ed71a8d4 100644 --- a/dag/archive/artificial_intelligence.yml +++ b/dag/archive/artificial_intelligence.yml @@ -1,6 +1,7 @@ steps: ############################################################################################################## # EPOCH archive (monthly updates) + # Artificial Intelligence (EPOCH) data://meadow/artificial_intelligence/latest/epoch: - snapshot://artificial_intelligence/latest/epoch.csv @@ -189,6 +190,100 @@ steps: - data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive data://grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain: - data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain + + # Main EPOCH dataset + data://meadow/artificial_intelligence/2024-10-01/epoch: + - snapshot://artificial_intelligence/2024-10-01/epoch.csv + data://garden/artificial_intelligence/2024-10-01/epoch: + - data://meadow/artificial_intelligence/2024-10-01/epoch + data://grapher/artificial_intelligence/2024-10-01/epoch: + - data://garden/artificial_intelligence/2024-10-01/epoch + + # Main EPOCH dataset regression lines + data://garden/artificial_intelligence/2024-10-01/epoch_regressions: + - data://garden/artificial_intelligence/2024-10-01/epoch + data://grapher/artificial_intelligence/2024-10-01/epoch_regressions: + - data://garden/artificial_intelligence/2024-10-01/epoch_regressions + + # EPOCH aggregates by domain + data://garden/artificial_intelligence/2024-10-01/epoch_aggregates_domain: + - data://meadow/artificial_intelligence/2024-10-01/epoch + data://grapher/artificial_intelligence/2024-10-01/epoch_aggregates_domain: + - data://garden/artificial_intelligence/2024-10-01/epoch_aggregates_domain + + # EPOCH aggregates by researcher affiliation + data://garden/artificial_intelligence/2024-10-01/epoch_aggregates_affiliation: + - data://garden/artificial_intelligence/2024-10-01/epoch + data://grapher/artificial_intelligence/2024-10-01/epoch_aggregates_affiliation: + - data://garden/artificial_intelligence/2024-10-01/epoch_aggregates_affiliation + + # EPOCH dataset on Compute Intensive AI + data://meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive: + - snapshot://artificial_intelligence/2024-10-01/epoch_compute_intensive.csv + data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive: + - data://meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-10-01/epoch_compute_intensive: + - data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive + + # EPOCH dataset on Compute Intensive AI, aggregates by country + data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_countries: + - data://meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-10-01/epoch_compute_intensive_countries: + - data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_countries + + # EPOCH dataset on Compute Intensive AI, aggregates by domain + data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_domain: + - data://meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-10-01/epoch_compute_intensive_domain: + - data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_domain + + # Main EPOCH
dataset + data://meadow/artificial_intelligence/2024-11-03/epoch: + - snapshot://artificial_intelligence/2024-11-03/epoch.csv + data://garden/artificial_intelligence/2024-11-03/epoch: + - data://meadow/artificial_intelligence/2024-11-03/epoch + data://grapher/artificial_intelligence/2024-11-03/epoch: + - data://garden/artificial_intelligence/2024-11-03/epoch + + # Main EPOCH dataset regression lines + data://garden/artificial_intelligence/2024-11-03/epoch_regressions: + - data://garden/artificial_intelligence/2024-11-03/epoch + data://grapher/artificial_intelligence/2024-11-03/epoch_regressions: + - data://garden/artificial_intelligence/2024-11-03/epoch_regressions + + # EPOCH aggregates by domain + data://garden/artificial_intelligence/2024-11-03/epoch_aggregates_domain: + - data://meadow/artificial_intelligence/2024-11-03/epoch + data://grapher/artificial_intelligence/2024-11-03/epoch_aggregates_domain: + - data://garden/artificial_intelligence/2024-11-03/epoch_aggregates_domain + + # EPOCH aggregates by researcher affiliation + data://garden/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation: + - data://garden/artificial_intelligence/2024-11-03/epoch + data://grapher/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation: + - data://garden/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation + + # EPOCH dataset on Compute Intensive AI + data://meadow/artificial_intelligence/2024-11-03/epoch_compute_intensive: + - snapshot://artificial_intelligence/2024-11-03/epoch_compute_intensive.csv + data://garden/artificial_intelligence/2024-11-03/epoch_compute_intensive: + - data://meadow/artificial_intelligence/2024-11-03/epoch_compute_intensive + + + # EPOCH dataset on Compute Intensive AI, aggregates by country + data://garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries: + - data://meadow/artificial_intelligence/2024-11-03/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries: + - data://garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries + + # EPOCH dataset on Compute Intensive AI, aggregates by domain + data://garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain: + - data://meadow/artificial_intelligence/2024-11-03/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain: + - data://garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain + + + ############################################################################################################## # AI Incidents diff --git a/dag/archive/climate.yml b/dag/archive/climate.yml index 95df7e404b6..08b7e2a683e 100644 --- a/dag/archive/climate.yml +++ b/dag/archive/climate.yml @@ -639,3 +639,147 @@ steps: # data://grapher/met_office_hadley_centre/2024-05-20/near_surface_temperature: - data://garden/met_office_hadley_centre/2024-05-20/near_surface_temperature + # + # Rutgers University Global Snow Lab - Snow Cover Extent. + # + data://meadow/climate/2024-09-30/snow_cover_extent: + - snapshot://climate/2024-09-30/snow_cover_extent_northern_hemisphere.csv + - snapshot://climate/2024-09-30/snow_cover_extent_north_america.csv + # + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). + # + data://meadow/climate/2024-09-30/hawaii_ocean_time_series: + - snapshot://climate/2024-09-30/hawaii_ocean_time_series.csv + # + # NOAA National Centers for Environmental Information - Ocean Heat Content.
+ # + data://meadow/climate/2024-09-30/ocean_heat_content: + - snapshot://climate/2024-07-23/ocean_heat_content_monthly_world_700m.csv + - snapshot://climate/2024-07-23/ocean_heat_content_annual_world_2000m.csv + - snapshot://climate/2024-07-23/ocean_heat_content_monthly_world_2000m.csv + - snapshot://climate/2024-07-23/ocean_heat_content_annual_world_700m.csv + # + # Met Office Hadley Centre - Sea surface temperature. + # + data://meadow/climate/2024-09-30/sea_surface_temperature: + - snapshot://climate/2024-09-30/sea_surface_temperature_southern_hemisphere.csv + - snapshot://climate/2024-09-30/sea_surface_temperature_northern_hemisphere.csv + - snapshot://climate/2024-09-30/sea_surface_temperature_world.csv + # + # NOAA Global Monitoring Laboratory - GHG concentration. + # + data://meadow/climate/2024-09-30/ghg_concentration: + - snapshot://climate/2024-09-30/co2_concentration_monthly.csv + - snapshot://climate/2024-09-30/ch4_concentration_monthly.csv + - snapshot://climate/2024-09-30/n2o_concentration_monthly.csv + # + # GISS - Surface temperature analysis. + # + data://meadow/climate/2024-09-30/surface_temperature_analysis: + - snapshot://climate/2024-07-23/surface_temperature_analysis_northern_hemisphere.csv + - snapshot://climate/2024-07-23/surface_temperature_analysis_southern_hemisphere.csv + - snapshot://climate/2024-07-23/surface_temperature_analysis_world.csv + # + # Rutgers University Global Snow Lab - Snow Cover Extent. + # + data://garden/climate/2024-09-30/snow_cover_extent: + - data://meadow/climate/2024-09-30/snow_cover_extent + # + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). + # + data://garden/climate/2024-09-30/ocean_ph_levels: + - data://meadow/climate/2024-09-30/hawaii_ocean_time_series + # + # NOAA National Centers for Environmental Information - Ocean Heat Content. + # + data://garden/climate/2024-09-30/ocean_heat_content: + - data://meadow/climate/2024-09-30/ocean_heat_content + # + # Met Office Hadley Centre - Sea surface temperature. + # + data://garden/climate/2024-09-30/sea_surface_temperature: + - data://meadow/climate/2024-09-30/sea_surface_temperature + # + # NOAA Global Monitoring Laboratory - GHG concentration. + # + data://garden/climate/2024-09-30/ghg_concentration: + - data://meadow/climate/2024-09-30/ghg_concentration + # + # GISS - Surface temperature analysis. + # + data://garden/climate/2024-09-30/surface_temperature_analysis: + - data://meadow/climate/2024-09-30/surface_temperature_analysis + # + # Various sources - Long-run greenhouse gas concentration. + # + data://garden/climate/2024-09-30/long_run_ghg_concentration: + - data://garden/climate/2024-09-30/ghg_concentration + - data://garden/epa/2024-04-17/ghg_concentration + # + # Various sources - Climate change impacts. 
+ # + data://garden/climate/2024-09-30/climate_change_impacts: + - data://garden/epa/2024-04-17/ice_sheet_mass_balance + - data://garden/climate/2024-09-30/snow_cover_extent + - data://garden/climate/2024-01-28/global_sea_level + - data://garden/climate/2024-09-30/ghg_concentration + - data://garden/climate/2024-09-30/sea_surface_temperature + - data://garden/climate/2024-09-30/surface_temperature_analysis + - data://garden/climate/2024-09-30/sea_ice_index + - data://garden/climate/2024-09-30/ocean_heat_content + - data://garden/climate/2024-09-30/long_run_ghg_concentration + - data://garden/climate/2024-09-30/ocean_ph_levels + - data://garden/epa/2024-04-17/ocean_heat_content + - data://garden/epa/2024-04-17/mass_balance_us_glaciers + # + # Various sources - Climate change impacts (monthly). + # + data://grapher/climate/2024-09-30/climate_change_impacts_monthly: + - data://garden/climate/2024-09-30/climate_change_impacts + # + # Various sources - Climate change impacts (annual). + # + data://grapher/climate/2024-09-30/climate_change_impacts_annual: + - data://garden/climate/2024-09-30/climate_change_impacts + # + # Met Office Hadley Centre - Near surface temperature anomaly. + # + data://meadow/met_office_hadley_centre/2024-10-22/near_surface_temperature: + - snapshot://met_office_hadley_centre/2024-10-22/near_surface_temperature_northern_hemisphere.csv + - snapshot://met_office_hadley_centre/2024-10-22/near_surface_temperature_southern_hemisphere.csv + - snapshot://met_office_hadley_centre/2024-10-22/near_surface_temperature_global.csv + # + # Met Office Hadley Centre - Near surface temperature anomaly. + # + data://garden/met_office_hadley_centre/2024-10-22/near_surface_temperature: + - data://meadow/met_office_hadley_centre/2024-10-22/near_surface_temperature + # + # NSIDC - Monthly sea ice extent ("country" for years, "year" for month number, one indicator per hemisphere). + # + data://grapher/climate/2024-09-30/sea_ice_extent_by_year: + - data://garden/climate/2024-09-30/sea_ice_index + # + # NSIDC - Monthly sea ice extent ("country" for decades and latest year, "year" for month number, one indicator per hemisphere). + # + data://grapher/climate/2024-09-30/sea_ice_extent_by_decade: + - data://garden/climate/2024-09-30/sea_ice_index + # + # Met Office Hadley Centre - Near surface temperature anomaly. + # + data://grapher/met_office_hadley_centre/2024-10-22/near_surface_temperature: + - data://garden/met_office_hadley_centre/2024-10-22/near_surface_temperature + # + # NSIDC - Arctic sea ice extent. + # + data://meadow/climate/2024-09-30/sea_ice_index: + - snapshot://climate/2024-09-30/sea_ice_index.xlsx + # + # NSIDC - Arctic sea ice extent. + # + data://garden/climate/2024-09-30/sea_ice_index: + - data://meadow/climate/2024-09-30/sea_ice_index + # + # NSIDC - Monthly sea ice anomalies ("country" for month names, "year" for years, one indicator per hemisphere). 
+ # + data://grapher/climate/2024-09-30/sea_ice_anomalies_by_month: + - data://garden/climate/2024-09-30/sea_ice_index diff --git a/dag/archive/demography.yml b/dag/archive/demography.yml index 9c0ecbe8808..8319f1b63d2 100644 --- a/dag/archive/demography.yml +++ b/dag/archive/demography.yml @@ -1,11 +1,4 @@ steps: - # Population density - data://garden/demography/2023-04-14/population_density: - - data://garden/demography/2023-03-31/population - - data://garden/faostat/2023-02-22/faostat_rl - data://grapher/demography/2023-04-14/population_density: - - data://garden/demography/2023-04-14/population_density - # GINI LE inequality data://garden/demography/2023-09-29/gini_le: - data://garden/hmd/2023-09-19/hmd @@ -59,9 +52,69 @@ steps: data-private://grapher/un/2024-07-11/un_wpp_full: - data-private://garden/un/2024-07-11/un_wpp - # Population density + # Population density data://garden/demography/2023-06-12/population_density: - data://garden/demography/2023-03-31/population - data://garden/faostat/2024-03-14/faostat_rl data://grapher/demography/2023-06-12/population_density: - data://garden/demography/2023-06-12/population_density + + # HMD + data://meadow/hmd/2022-12-07/life_tables: + - snapshot://hmd/2022-12-07/hmd.zip + data://garden/hmd/2022-12-07/life_tables: + - data://meadow/hmd/2022-12-07/life_tables + + # Survivorship ages (HMD-derived) + data://garden/demography/2023-09-27/survivor_percentiles: + - data://garden/hmd/2023-09-19/hmd + data://grapher/demography/2023-09-27/survivor_percentiles: + - data://garden/demography/2023-09-27/survivor_percentiles + + # Phi-gender life expectancy inequality + data://garden/demography/2023-10-03/phi_gender_le: + - data://garden/demography/2023-10-03/life_tables + data://grapher/demography/2023-10-03/phi_gender_le: + - data://garden/demography/2023-10-03/phi_gender_le + + # Broken limits of Life Expectancy + data://garden/demography/2023-10-20/broken_limits_le: + - data://garden/demography/2023-10-03/life_tables + - data://garden/hmd/2023-09-19/hmd + data://grapher/demography/2023-10-20/broken_limits_le: + - data://garden/demography/2023-10-20/broken_limits_le + + # Gini Life Expectancy Inequality + data://garden/demography/2023-10-04/gini_le: + - data://garden/demography/2023-10-03/life_tables + data://grapher/demography/2023-10-04/gini_le: + - data://garden/demography/2023-10-04/gini_le + + # HMD + data://meadow/hmd/2023-09-19/hmd: + - snapshot://hmd/2023-09-18/hmd.zip + data://garden/hmd/2023-09-19/hmd: + - data://meadow/hmd/2023-09-19/hmd + data://grapher/hmd/2023-09-19/hmd: + - data://garden/hmd/2023-09-19/hmd + # UN WPP Life Tables + data://meadow/un/2023-10-02/un_wpp_lt: + - snapshot://un/2023-10-02/un_wpp_lt_all.zip + - snapshot://un/2023-10-02/un_wpp_lt_f.zip + - snapshot://un/2023-10-02/un_wpp_lt_m.zip + data://garden/un/2023-10-02/un_wpp_lt: + - data://meadow/un/2023-10-02/un_wpp_lt + # UN WPP + HMD Life Tables + data://garden/demography/2023-10-03/life_tables: + - data://garden/hmd/2023-09-19/hmd + - data://garden/un/2023-10-02/un_wpp_lt + data://grapher/demography/2023-10-04/life_tables: + - data://garden/demography/2023-10-03/life_tables + # OMM: Life Expectancy + data://garden/demography/2023-10-09/life_expectancy: + - data://garden/demography/2023-10-03/life_tables + - data://garden/demography/2023-10-10/zijdeman_et_al_2015 + - data://garden/demography/2023-10-10/riley_2005 + - data://garden/un/2022-07-11/un_wpp + data://grapher/demography/2023-10-10/life_expectancy: + - data://garden/demography/2023-10-09/life_expectancy diff 
--git a/dag/archive/education.yml b/dag/archive/education.yml new file mode 100644 index 00000000000..7fb0ac85e2b --- /dev/null +++ b/dag/archive/education.yml @@ -0,0 +1,12 @@ +steps: + # World Bank EdStats + data://meadow/wb/2023-07-10/education: + - snapshot://wb/2023-07-10/education.csv + + data://garden/wb/2023-07-10/education: + - data://meadow/wb/2023-07-10/education + - data://garden/education/2017-09-30/public_expenditure + - data://garden/education/2018-04-18/literacy_rates + + data://grapher/wb/2023-07-10/education: + - data://garden/wb/2023-07-10/education \ No newline at end of file diff --git a/dag/archive/emissions.yml b/dag/archive/emissions.yml index 742525ba1ef..ce3432d86ac 100644 --- a/dag/archive/emissions.yml +++ b/dag/archive/emissions.yml @@ -1,143 +1,11 @@ steps: # - # GCP - Global Carbon Budget (2023-04-28). - # - data://meadow/gcp/2023-04-28/global_carbon_budget: - - snapshot://gcp/2023-04-28/global_carbon_budget_fossil_co2_emissions.csv - - snapshot://gcp/2023-04-28/global_carbon_budget_global_emissions.xlsx - - snapshot://gcp/2023-04-28/global_carbon_budget_national_emissions.xlsx - - snapshot://gcp/2023-04-28/global_carbon_budget_land_use_change_emissions.xlsx - data://garden/gcp/2023-04-28/global_carbon_budget: - - data://meadow/gcp/2023-04-28/global_carbon_budget - # Loaded to calculate emissions per unit energy. - - data://garden/energy/2023-06-01/primary_energy_consumption - # Loaded to calculate emissions per GDP. - - data://garden/ggdc/2020-10-01/ggdc_maddison - # Loaded to create per-capita variables. - - data://garden/demography/2022-12-08/population - # Loaded to create region aggregates (including income groups). - - data://garden/regions/2023-01-01/regions - - data://garden/wb/2021-07-01/wb_income - data://grapher/gcp/2023-04-28/global_carbon_budget: - - data://garden/gcp/2023-04-28/global_carbon_budget - # - # GCP - Global Carbon Budget (2023-07-10). - # - data://meadow/gcp/2023-07-10/global_carbon_budget: - - snapshot://gcp/2023-04-28/global_carbon_budget_fossil_co2_emissions.csv - - snapshot://gcp/2023-04-28/global_carbon_budget_global_emissions.xlsx - - snapshot://gcp/2023-04-28/global_carbon_budget_national_emissions.xlsx - - snapshot://gcp/2023-04-28/global_carbon_budget_land_use_change_emissions.xlsx - data://garden/gcp/2023-07-10/global_carbon_budget: - - data://meadow/gcp/2023-07-10/global_carbon_budget - # Loaded to calculate emissions per unit energy. - - data://garden/energy/2023-07-10/primary_energy_consumption - # Loaded to calculate emissions per GDP. - - data://garden/ggdc/2020-10-01/ggdc_maddison - # Loaded to create per-capita variables. - - data://garden/demography/2023-03-31/population - # Loaded to create region aggregates (including income groups). - - data://garden/regions/2023-01-01/regions - - data://garden/wb/2023-04-30/income_groups - data://grapher/gcp/2023-07-10/global_carbon_budget: - - data://garden/gcp/2023-07-10/global_carbon_budget - # # Jones et al. (2023) - National contributions to climate change (2023-05-02). 
# data://meadow/emissions/2023-05-02/national_contributions: - snapshot://emissions/2023-05-02/national_contributions_annual_emissions.csv - snapshot://emissions/2023-05-02/national_contributions_cumulative_emissions.csv - snapshot://emissions/2023-05-02/national_contributions_temperature_response.csv - data://garden/emissions/2023-05-02/national_contributions: - - data://meadow/emissions/2023-05-02/national_contributions - - data://garden/regions/2023-01-01/regions - - data://garden/wb/2021-07-01/wb_income - - data://garden/demography/2022-12-08/population - data://grapher/emissions/2023-05-02/national_contributions: - - data://garden/emissions/2023-05-02/national_contributions - # - # Emissions - CO2 dataset (2023-11-08). - # - data://garden/emissions/2023-11-08/owid_co2: - - data://garden/emissions/2023-05-02/national_contributions - - data://garden/gcp/2023-09-28/global_carbon_budget - - data://garden/climate_watch/2023-10-31/emissions_by_sector - - data://garden/energy/2023-07-10/primary_energy_consumption - - data://garden/demography/2023-03-31/population - - data://garden/ggdc/2020-10-01/ggdc_maddison - - data://garden/regions/2023-01-01/regions - # - # Emissions - CO2 dataset (2023-11-23). - # - data://garden/emissions/2023-11-23/owid_co2: - - data://garden/emissions/2023-11-23/national_contributions - - data://garden/gcp/2023-09-28/global_carbon_budget - - data://garden/climate_watch/2023-10-31/emissions_by_sector - - data://garden/energy/2023-07-10/primary_energy_consumption - - data://garden/demography/2023-03-31/population - - data://garden/ggdc/2020-10-01/ggdc_maddison - - data://garden/regions/2023-01-01/regions - # - # GCP - Global Carbon Budget (2023-09-28). - # - data://meadow/gcp/2023-09-28/global_carbon_budget: - - snapshot://gcp/2023-09-28/global_carbon_budget_fossil_co2_emissions.csv - - snapshot://gcp/2023-09-28/global_carbon_budget_global_emissions.xlsx - - snapshot://gcp/2023-09-28/global_carbon_budget_national_emissions.xlsx - - snapshot://gcp/2023-09-28/global_carbon_budget_land_use_change_emissions.xlsx - data://garden/gcp/2023-09-28/global_carbon_budget: - - data://meadow/gcp/2023-09-28/global_carbon_budget - # Loaded to calculate emissions per unit energy. - - data://garden/energy/2023-07-10/primary_energy_consumption - # Loaded to calculate emissions per GDP. - - data://garden/ggdc/2020-10-01/ggdc_maddison - # Loaded to create per-capita variables. - - data://garden/demography/2023-03-31/population - # Loaded to create region aggregates (including income groups). - - data://garden/regions/2023-01-01/regions - - data://garden/wb/2023-04-30/income_groups - data://grapher/gcp/2023-09-28/global_carbon_budget: - - data://garden/gcp/2023-09-28/global_carbon_budget - # - # Decoupling of GDP and CO2 (2023). - # - # The data from the following step will be used in this static chart: - # https://drive.google.com/file/d/1PflfQpr4mceVWRSGEqMP6Gbo1tFQZzOp/view?usp=sharing - data://garden/emissions/2023-10-06/gdp_and_co2_decoupling: - - data://garden/gcp/2023-09-28/global_carbon_budget - - data://garden/worldbank_wdi/2022-05-26/wdi - # - # Emissions - CO2 dataset (2023-12-05). 
- # - data://garden/emissions/2023-12-05/owid_co2: - - data://garden/emissions/2023-11-23/national_contributions - - data://garden/gcp/2023-12-05/global_carbon_budget - - data://garden/climate_watch/2023-10-31/emissions_by_sector - - data://garden/energy/2023-07-10/primary_energy_consumption - - data://garden/demography/2023-03-31/population - - data://garden/ggdc/2020-10-01/ggdc_maddison - - data://garden/regions/2023-01-01/regions - # - # GCP - Global Carbon Budget (2023-12-05). - # - data://meadow/gcp/2023-12-05/global_carbon_budget: - - snapshot://gcp/2023-12-05/global_carbon_budget_fossil_co2_emissions.csv - - snapshot://gcp/2023-12-05/global_carbon_budget_global_emissions.xlsx - - snapshot://gcp/2023-12-05/global_carbon_budget_national_emissions.xlsx - - snapshot://gcp/2023-12-05/global_carbon_budget_land_use_change_emissions.xlsx - data://garden/gcp/2023-12-05/global_carbon_budget: - - data://meadow/gcp/2023-12-05/global_carbon_budget - # Loaded to calculate emissions per unit energy. - - data://garden/energy/2023-07-10/primary_energy_consumption - # Loaded to calculate emissions per GDP. - - data://garden/ggdc/2020-10-01/ggdc_maddison - # Loaded to create per-capita variables. - - data://garden/demography/2023-03-31/population - # Loaded to create region aggregates (including income groups). - - data://garden/regions/2023-01-01/regions - - data://garden/wb/2023-04-30/income_groups - data://grapher/gcp/2023-12-05/global_carbon_budget: - - data://garden/gcp/2023-12-05/global_carbon_budget # # Emissions - CO2 dataset (2023-12-12). # @@ -180,3 +48,98 @@ steps: data://garden/emissions/2024-02-26/gdp_and_co2_decoupling: - data://garden/worldbank_wdi/2024-05-20/wdi - data://garden/gcp/2023-12-12/global_carbon_budget + # + # GCP - Global Carbon Budget (published on 2023-12-05, updated on 2023-12-12 to use the latest primary energy data). + # + data://meadow/gcp/2023-12-12/global_carbon_budget: + - snapshot://gcp/2023-12-12/global_carbon_budget_fossil_co2_emissions.csv + - snapshot://gcp/2023-12-12/global_carbon_budget_national_emissions.xlsx + - snapshot://gcp/2023-12-12/global_carbon_budget_land_use_change_emissions.xlsx + - snapshot://gcp/2023-12-12/global_carbon_budget_global_emissions.xlsx + # + # GCP - Global Carbon Budget (published on 2023-12-05, updated on 2024-06-20 to use the latest primary energy data). + # + data://garden/gcp/2024-06-20/global_carbon_budget: + - data://garden/wb/2024-03-11/income_groups + - data://garden/demography/2023-03-31/population + - data://garden/energy/2024-06-20/primary_energy_consumption + - data://garden/regions/2023-01-01/regions + - data://meadow/gcp/2023-12-12/global_carbon_budget + - data://garden/ggdc/2024-04-26/maddison_project_database + # + # Decoupling of GDP and CO2 (2023). + # + data://garden/emissions/2024-06-20/gdp_and_co2_decoupling: + - data://garden/gcp/2024-06-20/global_carbon_budget + - data://garden/worldbank_wdi/2024-05-20/wdi + # + # GCP - Global Carbon Budget. + # + data://grapher/gcp/2024-06-20/global_carbon_budget: + - data://garden/gcp/2024-06-20/global_carbon_budget + # + # GCP - Global Carbon Budget. + # + data://meadow/gcp/2024-11-13/global_carbon_budget: + - snapshot://gcp/2024-11-13/global_carbon_budget_national_emissions.xlsx + - snapshot://gcp/2024-11-13/global_carbon_budget_fossil_co2_emissions.csv + - snapshot://gcp/2024-11-13/global_carbon_budget_global_emissions.xlsx + - snapshot://gcp/2024-11-13/global_carbon_budget_land_use_change_emissions.xlsx + # + # GCP - Global Carbon Budget. 
+ # + data://garden/gcp/2024-11-13/global_carbon_budget: + - data://garden/ggdc/2024-04-26/maddison_project_database + - data://garden/wb/2024-07-29/income_groups + - data://garden/demography/2024-07-15/population + - data://meadow/gcp/2024-11-13/global_carbon_budget + - data://garden/regions/2023-01-01/regions + - data://garden/energy/2024-06-20/primary_energy_consumption + # + # Emissions - CO2 dataset. + # + data://garden/emissions/2024-11-13/owid_co2: + - data://garden/ggdc/2024-04-26/maddison_project_database + - data://garden/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions + - data://garden/gcp/2024-11-13/global_carbon_budget + - data://garden/emissions/2024-04-08/national_contributions + - data://garden/climate_watch/2023-10-31/emissions_by_sector + - data://garden/energy/2024-06-20/primary_energy_consumption + # + # Decoupling of GDP and CO2 (2023). + # + data://garden/emissions/2024-11-13/gdp_and_co2_decoupling: + - data://garden/gcp/2024-11-13/global_carbon_budget + - data://garden/worldbank_wdi/2024-05-20/wdi + # + # GCP - Global Carbon Budget. + # + data://grapher/gcp/2024-11-13/global_carbon_budget: + - data://garden/gcp/2024-11-13/global_carbon_budget + # + # Climate Watch - Greenhouse gas emissions by sector (2023-10-31). + # + data://meadow/climate_watch/2023-10-31/emissions_by_sector: + - snapshot://climate_watch/2023-10-31/emissions_by_sector.gz + # + # Jones et al. - National contributions to climate change. + # + data://meadow/emissions/2024-04-08/national_contributions: + - snapshot://emissions/2024-04-08/national_contributions_temperature_response.csv + - snapshot://emissions/2024-04-08/national_contributions_annual_emissions.csv + - snapshot://emissions/2024-04-08/national_contributions_cumulative_emissions.csv + data://garden/climate_watch/2023-10-31/emissions_by_sector: + - data://meadow/climate_watch/2023-10-31/emissions_by_sector + - data://garden/wb/2023-04-30/income_groups + - data://garden/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions + data://garden/emissions/2024-04-08/national_contributions: + - data://meadow/emissions/2024-04-08/national_contributions + - data://garden/wb/2024-03-11/income_groups + - data://garden/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions + data://grapher/climate_watch/2023-10-31/emissions_by_sector: + - data://garden/climate_watch/2023-10-31/emissions_by_sector + data://grapher/emissions/2024-04-08/national_contributions: + - data://garden/emissions/2024-04-08/national_contributions diff --git a/dag/archive/energy.yml b/dag/archive/energy.yml index 5af6ba52b36..5444472ffa3 100644 --- a/dag/archive/energy.yml +++ b/dag/archive/energy.yml @@ -1,114 +1,5 @@ steps: # - # BP - Statistical review 2022. - # - data://garden/bp/2022-07-14/statistical_review: - - backport://backport/owid/latest/dataset_5650_statistical_review_of_world_energy__bp__2022 - - data://garden/bp/2022-07-11/statistical_review - - data://garden/owid/latest/key_indicators - - data://garden/wb/2021-07-01/wb_income - - data://garden/regions/2023-01-01/regions - data://grapher/bp/2022-07-14/statistical_review: - - data://garden/bp/2022-07-14/statistical_review - # - # BP - Energy mix 2022. - # - data://grapher/bp/2022-07-14/energy_mix: - - data://garden/bp/2022-07-14/energy_mix - # - # BP - Fossil fuel reserves/production ratio (2022). 
- # - data://garden/bp/2022-09-19/fossil_fuel_reserves_production_ratio: - - data://garden/bp/2022-07-14/statistical_review - data://grapher/bp/2022-09-19/fossil_fuel_reserves_production_ratio: - - data://garden/bp/2022-09-19/fossil_fuel_reserves_production_ratio - # - # Ember yearly electricity 2022. - # - data://meadow/ember/2022-12-13/yearly_electricity: - - snapshot://ember/2022-12-13/yearly_electricity.csv - data://garden/ember/2022-12-13/yearly_electricity: - - data://meadow/ember/2022-12-13/yearly_electricity - - data://garden/owid/latest/key_indicators - - data://garden/regions/2023-01-01/regions - # - # BP - Energy mix 2022. - # - data://garden/bp/2022-12-28/energy_mix: - - data://garden/bp/2022-12-28/statistical_review - - data://garden/owid/latest/key_indicators - - data://garden/wb/2021-07-01/wb_income - data://grapher/bp/2022-12-28/energy_mix: - - data://garden/bp/2022-12-28/energy_mix - # - # Ember - Yearly electricity data 2023. - # - data://meadow/ember/2023-02-20/yearly_electricity: - - snapshot://ember/2023-02-20/yearly_electricity.csv - data://garden/ember/2023-02-20/yearly_electricity: - - data://meadow/ember/2023-02-20/yearly_electricity - - data://garden/demography/2022-12-08/population - - data://garden/wb/2021-07-01/wb_income - - data://garden/regions/2023-01-01/regions - # - # BP - Energy mix (2022). - # - data://garden/bp/2022-07-14/energy_mix: - - data://garden/bp/2022-07-14/statistical_review - - data://garden/owid/latest/key_indicators - - data://garden/wb/2021-07-01/wb_income - # - # BP - Statistical review 2021. - # - # NOTE: This dataset is not used in grapher. It exists only to fill gaps in the 2022 version. - data://garden/bp/2022-07-11/statistical_review: - - backport://backport/owid/latest/dataset_5347_statistical_review_of_world_energy__bp__2021 - - data://garden/owid/latest/key_indicators - - data://garden/wb/2021-07-01/wb_income - - data://garden/regions/2023-01-01/regions - # - # BP - Statistical review 2022. - # - # NOTE: For the moment this is not the full processing (which is still done in importers). - # This garden step loads the dataset and adds region aggregates properly, plus some other minor improvements. - # Here, we also remove some regions that had misleading data (BP regions like "Other *"). - data://garden/bp/2022-12-28/statistical_review: - # The backported 2022 release is the main source of data of this step. - - backport://backport/owid/latest/dataset_5650_statistical_review_of_world_energy__bp__2022 - # The 2021 release is loaded just to fill missing data in the current version (and to get a missing column). - - data://garden/bp/2022-07-11/statistical_review - - data://garden/owid/latest/key_indicators - - data://garden/wb/2021-07-01/wb_income - - data://garden/regions/2023-01-01/regions - data://grapher/bp/2022-12-28/statistical_review: - - data://garden/bp/2022-12-28/statistical_review - # - # BP - Fossil fuel reserves/production ratio (2022). - # - data://garden/bp/2022-12-28/fossil_fuel_reserves_production_ratio: - - data://garden/bp/2022-12-28/statistical_review - data://grapher/bp/2022-12-28/fossil_fuel_reserves_production_ratio: - - data://garden/bp/2022-12-28/fossil_fuel_reserves_production_ratio - # - # Ember - Yearly electricity data 2023. 
- # - data://meadow/ember/2023-06-01/yearly_electricity: - - snapshot://ember/2023-06-01/yearly_electricity.csv - data://garden/ember/2023-06-01/yearly_electricity: - - data://meadow/ember/2023-06-01/yearly_electricity - - data://garden/demography/2023-03-31/population - - data://garden/wb/2023-04-30/income_groups - - data://garden/regions/2023-01-01/regions - # - # BP - Energy mix 2023. - # - data://garden/bp/2023-02-20/energy_mix: - - data://garden/bp/2022-12-28/statistical_review - - data://garden/demography/2022-12-08/population - - data://garden/wb/2021-07-01/wb_income - data://grapher/bp/2023-02-20/energy_mix: - - data://garden/bp/2023-02-20/energy_mix - # # UNEP - Global trends in renewable energy investment (2019). # data://meadow/unep/2023-01-03/renewable_energy_investments: @@ -211,8 +102,6 @@ steps: - data://garden/energy_institute/2023-06-26/statistical_review_of_world_energy - data://garden/demography/2023-03-31/population - data://garden/wb/2023-04-30/income_groups - data://grapher/energy/2023-07-10/energy_mix: - - data://garden/energy/2023-07-10/energy_mix # # Energy - Primary energy consumption 2023. # @@ -285,8 +174,6 @@ steps: - data://garden/energy_institute/2023-12-12/statistical_review_of_world_energy - data://garden/demography/2023-03-31/population - data://garden/wb/2023-04-30/income_groups - data://grapher/energy/2023-12-12/energy_mix: - - data://garden/energy/2023-12-12/energy_mix # # Energy - OWID Energy dataset. # @@ -417,8 +304,6 @@ steps: - data://garden/energy/2024-05-08/primary_energy_consumption data://grapher/energy/2024-05-08/fossil_fuel_production: - data://garden/energy/2024-05-08/fossil_fuel_production - data://grapher/energy/2024-05-08/energy_mix: - - data://garden/energy/2024-05-08/energy_mix data://grapher/energy/2024-05-08/fossil_fuel_reserves_production_ratio: - data://garden/energy/2024-05-08/fossil_fuel_reserves_production_ratio data://grapher/energy/2024-05-08/uk_historical_electricity: @@ -442,3 +327,57 @@ steps: - data://garden/irena/2023-12-12/renewable_power_generation_costs data://grapher/energy/2024-05-08/photovoltaic_cost_and_capacity: - data://garden/energy/2024-05-08/photovoltaic_cost_and_capacity + # + # Energy - Photovoltaic cost and capacity. + # + data://garden/energy/2024-10-30/photovoltaic_cost_and_capacity: + - data://garden/papers/2023-12-12/farmer_lafond_2016 + - data://garden/irena/2023-12-12/renewable_electricity_capacity + - data://garden/papers/2023-12-12/nemet_2009 + - data://garden/irena/2024-10-29/renewable_power_generation_costs + # + # Energy - Photovoltaic cost and capacity. + # + data://grapher/energy/2024-10-30/photovoltaic_cost_and_capacity: + - data://garden/energy/2024-10-30/photovoltaic_cost_and_capacity + # + # IRENA - Renewable electricity capacity. + # + data://meadow/irena/2023-12-12/renewable_electricity_capacity: + - snapshot://irena/2023-12-12/renewable_electricity_capacity_and_generation.xlsm + data://garden/irena/2023-12-12/renewable_electricity_capacity: + - data://meadow/irena/2023-12-12/renewable_electricity_capacity + - data://garden/regions/2023-01-01/regions + - data://garden/wb/2023-04-30/income_groups + data://grapher/irena/2023-12-12/renewable_electricity_capacity_by_technology: + - data://garden/irena/2023-12-12/renewable_electricity_capacity + data://grapher/irena/2023-12-12/renewable_electricity_capacity: + - data://garden/irena/2023-12-12/renewable_electricity_capacity + # + # IRENA - Renewable power generation costs. 
+ # + data://meadow/irena/2024-10-29/renewable_power_generation_costs: + - snapshot://irena/2024-10-29/renewable_power_generation_costs.xlsx + # + # IRENA - Renewable power generation costs. + # + data://garden/irena/2024-10-29/renewable_power_generation_costs: + - data://meadow/irena/2024-10-29/renewable_power_generation_costs + # + # Energy - Photovoltaic cost and capacity. + # + data://garden/energy/2024-11-01/photovoltaic_cost_and_capacity: + - data://garden/irena/2024-11-01/renewable_capacity_statistics + - data://garden/papers/2023-12-12/nemet_2009 + - data://garden/papers/2023-12-12/farmer_lafond_2016 + - data://garden/irena/2024-10-29/renewable_power_generation_costs + # + # IRENA - Renewable power generation costs. + # + data://grapher/irena/2024-10-29/renewable_power_generation_costs: + - data://garden/irena/2024-10-29/renewable_power_generation_costs + # + # Energy - Photovoltaic cost and capacity. + # + data://grapher/energy/2024-11-01/photovoltaic_cost_and_capacity: + - data://garden/energy/2024-11-01/photovoltaic_cost_and_capacity diff --git a/dag/archive/faostat.yml b/dag/archive/faostat.yml index 245d48add7f..56c8b794cdf 100644 --- a/dag/archive/faostat.yml +++ b/dag/archive/faostat.yml @@ -1,319 +1,5 @@ steps: # - # FAOSTAT meadow steps for version 2023-02-22 - # - data://meadow/faostat/2023-02-22/faostat_ef: - - snapshot://faostat/2023-02-22/faostat_ef.zip - data://meadow/faostat/2023-02-22/faostat_ei: - - snapshot://faostat/2023-02-22/faostat_ei.zip - data://meadow/faostat/2023-02-22/faostat_ek: - - snapshot://faostat/2023-02-22/faostat_ek.zip - data://meadow/faostat/2023-02-22/faostat_el: - - snapshot://faostat/2023-02-22/faostat_el.zip - data://meadow/faostat/2023-02-22/faostat_emn: - - snapshot://faostat/2023-02-22/faostat_emn.zip - data://meadow/faostat/2023-02-22/faostat_ep: - - snapshot://faostat/2023-02-22/faostat_ep.zip - data://meadow/faostat/2023-02-22/faostat_esb: - - snapshot://faostat/2023-02-22/faostat_esb.zip - data://meadow/faostat/2023-02-22/faostat_fa: - - snapshot://faostat/2023-02-22/faostat_fa.zip - data://meadow/faostat/2023-02-22/faostat_fbs: - - snapshot://faostat/2023-02-22/faostat_fbs.zip - data://meadow/faostat/2023-02-22/faostat_fbsh: - - snapshot://faostat/2023-02-22/faostat_fbsh.zip - data://meadow/faostat/2023-02-22/faostat_fo: - - snapshot://faostat/2023-02-22/faostat_fo.zip - data://meadow/faostat/2023-02-22/faostat_fs: - - snapshot://faostat/2023-02-22/faostat_fs.zip - data://meadow/faostat/2023-02-22/faostat_gn: - - snapshot://faostat/2023-02-22/faostat_gn.zip - data://meadow/faostat/2023-02-22/faostat_ic: - - snapshot://faostat/2023-02-22/faostat_ic.zip - data://meadow/faostat/2023-02-22/faostat_lc: - - snapshot://faostat/2023-02-22/faostat_lc.zip - data://meadow/faostat/2023-02-22/faostat_metadata: - - snapshot://faostat/2023-02-22/faostat_metadata.json - data://meadow/faostat/2023-02-22/faostat_qcl: - - snapshot://faostat/2023-02-22/faostat_qcl.zip - data://meadow/faostat/2023-02-22/faostat_qi: - - snapshot://faostat/2023-02-22/faostat_qi.zip - data://meadow/faostat/2023-02-22/faostat_qv: - - snapshot://faostat/2023-02-22/faostat_qv.zip - data://meadow/faostat/2023-02-22/faostat_rfb: - - snapshot://faostat/2023-02-22/faostat_rfb.zip - data://meadow/faostat/2023-02-22/faostat_rfn: - - snapshot://faostat/2023-02-22/faostat_rfn.zip - data://meadow/faostat/2023-02-22/faostat_rl: - - snapshot://faostat/2023-02-22/faostat_rl.zip - data://meadow/faostat/2023-02-22/faostat_rp: - - snapshot://faostat/2023-02-22/faostat_rp.zip - 
data://meadow/faostat/2023-02-22/faostat_rt: - - snapshot://faostat/2023-02-22/faostat_rt.zip - data://meadow/faostat/2023-02-22/faostat_scl: - - snapshot://faostat/2023-02-22/faostat_scl.zip - data://meadow/faostat/2023-02-22/faostat_sdgb: - - snapshot://faostat/2023-02-22/faostat_sdgb.zip - data://meadow/faostat/2023-02-22/faostat_tcl: - - snapshot://faostat/2023-02-22/faostat_tcl.zip - data://meadow/faostat/2023-02-22/faostat_ti: - - snapshot://faostat/2023-02-22/faostat_ti.zip - data://meadow/faostat/2023-02-22/faostat_wcad: - - snapshot://faostat/2023-02-22/faostat_wcad.zip - # - # FAOSTAT garden steps for version 2023-02-22 - # - data://garden/faostat/2023-02-22/faostat_ef: - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - - data://meadow/faostat/2023-02-22/faostat_ef - data://garden/faostat/2023-02-22/faostat_ei: - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - - data://meadow/faostat/2023-02-22/faostat_ei - data://garden/faostat/2023-02-22/faostat_ek: - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://meadow/faostat/2023-02-22/faostat_ek - - data://garden/faostat/2023-02-22/faostat_metadata - data://garden/faostat/2023-02-22/faostat_el: - - data://meadow/faostat/2023-02-22/faostat_el - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - data://garden/faostat/2023-02-22/faostat_emn: - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - - data://meadow/faostat/2023-02-22/faostat_emn - data://garden/faostat/2023-02-22/faostat_ep: - - data://meadow/faostat/2023-02-22/faostat_ep - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - data://garden/faostat/2023-02-22/faostat_esb: - - data://meadow/faostat/2023-02-22/faostat_esb - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - data://garden/faostat/2023-02-22/faostat_fa: - - data://garden/wb/2021-07-01/wb_income - - data://garden/faostat/2023-02-22/faostat_metadata - - data://garden/owid/latest/key_indicators - - data://meadow/faostat/2023-02-22/faostat_fa - data://garden/faostat/2023-02-22/faostat_fbsc: - - data://meadow/faostat/2023-02-22/faostat_fbsh - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - - data://meadow/faostat/2023-02-22/faostat_fbs - - data://garden/wb/2021-07-01/wb_income - data://garden/faostat/2023-02-22/faostat_fo: - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - - data://meadow/faostat/2023-02-22/faostat_fo - data://garden/faostat/2023-02-22/faostat_food_explorer: - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - - data://garden/faostat/2023-02-22/faostat_qcl - - data://garden/faostat/2023-02-22/faostat_fbsc - data://garden/faostat/2023-02-22/faostat_fs: - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - - 
data://meadow/faostat/2023-02-22/faostat_fs - data://garden/faostat/2023-02-22/faostat_gn: - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - - data://meadow/faostat/2023-02-22/faostat_gn - data://garden/faostat/2023-02-22/faostat_ic: - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - - data://meadow/faostat/2023-02-22/faostat_ic - data://garden/faostat/2023-02-22/faostat_lc: - - data://garden/wb/2021-07-01/wb_income - - data://meadow/faostat/2023-02-22/faostat_lc - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - data://garden/faostat/2023-02-22/faostat_metadata: - - data://meadow/faostat/2023-02-22/faostat_qcl - - data://meadow/faostat/2023-02-22/faostat_el - - data://meadow/faostat/2023-02-22/faostat_rl - - data://meadow/faostat/2023-02-22/faostat_ti - - data://meadow/faostat/2023-02-22/faostat_lc - - data://meadow/faostat/2023-02-22/faostat_fbs - - data://meadow/faostat/2023-02-22/faostat_ek - - data://meadow/faostat/2023-02-22/faostat_ei - - data://meadow/faostat/2023-02-22/faostat_ef - - data://meadow/faostat/2023-02-22/faostat_tcl - - data://meadow/faostat/2023-02-22/faostat_fbsh - - data://meadow/faostat/2023-02-22/faostat_emn - - data://meadow/faostat/2023-02-22/faostat_rfb - - data://meadow/faostat/2023-02-22/faostat_esb - - data://meadow/faostat/2023-02-22/faostat_metadata - - data://meadow/faostat/2023-02-22/faostat_sdgb - - data://meadow/faostat/2023-02-22/faostat_qi - - data://meadow/faostat/2023-02-22/faostat_ep - - data://meadow/faostat/2023-02-22/faostat_fa - - data://meadow/faostat/2023-02-22/faostat_qv - - data://meadow/faostat/2023-02-22/faostat_rfn - - data://meadow/faostat/2023-02-22/faostat_fo - - data://meadow/faostat/2023-02-22/faostat_rt - - data://meadow/faostat/2023-02-22/faostat_scl - - data://meadow/faostat/2023-02-22/faostat_rp - - data://meadow/faostat/2023-02-22/faostat_fs - - data://meadow/faostat/2023-02-22/faostat_gn - - data://meadow/faostat/2023-02-22/faostat_ic - - data://meadow/faostat/2023-02-22/faostat_wcad - data://garden/faostat/2023-02-22/faostat_qcl: - - data://meadow/faostat/2023-02-22/faostat_qcl - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - data://garden/faostat/2023-02-22/faostat_qi: - - data://meadow/faostat/2023-02-22/faostat_qi - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - data://garden/faostat/2023-02-22/faostat_qv: - - data://garden/wb/2021-07-01/wb_income - - data://meadow/faostat/2023-02-22/faostat_qv - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - data://garden/faostat/2023-02-22/faostat_rfb: - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - - data://meadow/faostat/2023-02-22/faostat_rfb - data://garden/faostat/2023-02-22/faostat_rfn: - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - - data://meadow/faostat/2023-02-22/faostat_rfn - data://garden/faostat/2023-02-22/faostat_rl: - - data://garden/wb/2021-07-01/wb_income - - data://meadow/faostat/2023-02-22/faostat_rl - - 
data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - data://garden/faostat/2023-02-22/faostat_rp: - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - - data://meadow/faostat/2023-02-22/faostat_rp - data://garden/faostat/2023-02-22/faostat_rt: - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - - data://meadow/faostat/2023-02-22/faostat_rt - data://garden/faostat/2023-02-22/faostat_scl: - - data://garden/wb/2021-07-01/wb_income - - data://meadow/faostat/2023-02-22/faostat_scl - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - data://garden/faostat/2023-02-22/faostat_sdgb: - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - - data://meadow/faostat/2023-02-22/faostat_sdgb - data://garden/faostat/2023-02-22/faostat_tcl: - - data://meadow/faostat/2023-02-22/faostat_tcl - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - data://garden/faostat/2023-02-22/faostat_ti: - - data://garden/wb/2021-07-01/wb_income - - data://meadow/faostat/2023-02-22/faostat_ti - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - data://garden/faostat/2023-02-22/faostat_wcad: - - data://garden/wb/2021-07-01/wb_income - - data://garden/owid/latest/key_indicators - - data://garden/faostat/2023-02-22/faostat_metadata - - data://meadow/faostat/2023-02-22/faostat_wcad - # - # FAOSTAT grapher steps for version 2023-02-22 - # - data://grapher/faostat/2023-02-22/faostat_ef: - - data://garden/faostat/2023-02-22/faostat_ef - data://grapher/faostat/2023-02-22/faostat_ei: - - data://garden/faostat/2023-02-22/faostat_ei - data://grapher/faostat/2023-02-22/faostat_ek: - - data://garden/faostat/2023-02-22/faostat_ek - data://grapher/faostat/2023-02-22/faostat_el: - - data://garden/faostat/2023-02-22/faostat_el - data://grapher/faostat/2023-02-22/faostat_emn: - - data://garden/faostat/2023-02-22/faostat_emn - data://grapher/faostat/2023-02-22/faostat_ep: - - data://garden/faostat/2023-02-22/faostat_ep - data://grapher/faostat/2023-02-22/faostat_esb: - - data://garden/faostat/2023-02-22/faostat_esb - data://grapher/faostat/2023-02-22/faostat_fa: - - data://garden/faostat/2023-02-22/faostat_fa - data://grapher/faostat/2023-02-22/faostat_fbsc: - - data://garden/faostat/2023-02-22/faostat_fbsc - data://grapher/faostat/2023-02-22/faostat_fo: - - data://garden/faostat/2023-02-22/faostat_fo - data://grapher/faostat/2023-02-22/faostat_fs: - - data://garden/faostat/2023-02-22/faostat_fs - data://grapher/faostat/2023-02-22/faostat_lc: - - data://garden/faostat/2023-02-22/faostat_lc - data://grapher/faostat/2023-02-22/faostat_qcl: - - data://garden/faostat/2023-02-22/faostat_qcl - data://grapher/faostat/2023-02-22/faostat_qi: - - data://garden/faostat/2023-02-22/faostat_qi - data://grapher/faostat/2023-02-22/faostat_qv: - - data://garden/faostat/2023-02-22/faostat_qv - data://grapher/faostat/2023-02-22/faostat_rfb: - - data://garden/faostat/2023-02-22/faostat_rfb - data://grapher/faostat/2023-02-22/faostat_rfn: - - data://garden/faostat/2023-02-22/faostat_rfn - data://grapher/faostat/2023-02-22/faostat_rl: - - 
data://garden/faostat/2023-02-22/faostat_rl - data://grapher/faostat/2023-02-22/faostat_rp: - - data://garden/faostat/2023-02-22/faostat_rp - data://grapher/faostat/2023-02-22/faostat_rt: - - data://garden/faostat/2023-02-22/faostat_rt - data://grapher/faostat/2023-02-22/faostat_scl: - - data://garden/faostat/2023-02-22/faostat_scl - data://grapher/faostat/2023-02-22/faostat_sdgb: - - data://garden/faostat/2023-02-22/faostat_sdgb - data://grapher/faostat/2023-02-22/faostat_tcl: - - data://garden/faostat/2023-02-22/faostat_tcl - data://grapher/faostat/2023-02-22/faostat_ti: - - data://garden/faostat/2023-02-22/faostat_ti - data://grapher/faostat/2023-02-22/faostat_gn: - - data://garden/faostat/2023-02-22/faostat_gn - data://grapher/faostat/2023-02-22/faostat_ic: - - data://garden/faostat/2023-02-22/faostat_ic - data://grapher/faostat/2023-02-22/faostat_wcad: - - data://garden/faostat/2023-02-22/faostat_wcad - # - # FAOSTAT food explorer step for version 2023-02-22 - # - data://explorers/faostat/2023-02-22/food_explorer: - - data://garden/faostat/2023-02-22/faostat_food_explorer - # - # FAOSTAT garden step for additional variables for version 2023-02-22 - # - data://garden/faostat/2023-02-22/additional_variables: - - data://garden/faostat/2023-02-22/faostat_rl - - data://garden/faostat/2023-02-22/faostat_qi - - data://garden/faostat/2023-02-22/faostat_qcl - - data://garden/faostat/2023-02-22/faostat_sdgb - - data://garden/faostat/2023-02-22/faostat_fbsc - - data://garden/faostat/2023-02-22/faostat_ef - - data://garden/faostat/2023-02-22/faostat_rfn - # - # FAOSTAT grapher step for additional variables for version 2023-02-22 - # - data://grapher/faostat/2023-02-22/additional_variables: - - data://garden/faostat/2023-02-22/additional_variables - # # FAOSTAT meadow steps for version 2023-06-12 # data://meadow/faostat/2023-06-12/faostat_cahd: diff --git a/dag/archive/fasttrack.yml b/dag/archive/fasttrack.yml index f15d9831565..af669e2acdc 100644 --- a/dag/archive/fasttrack.yml +++ b/dag/archive/fasttrack.yml @@ -1,5 +1,5 @@ steps: - data-private://grapher/fasttrack/2023-03-27/global_warming_contributions: - - snapshot-private://fasttrack/2023-03-27/global_warming_contributions.csv - data-private://grapher/fasttrack/2023-01-19/food_expenditures_by_country: - - snapshot-private://fasttrack/2023-01-19/food_expenditures_by_country.csv + data://grapher/fasttrack/latest/democracy_lexical_index: + - snapshot://fasttrack/latest/democracy_lexical_index.csv + data://grapher/fasttrack/latest/mineral_prices_usgs: + - snapshot://fasttrack/latest/mineral_prices_usgs.csv diff --git a/dag/archive/health.yml b/dag/archive/health.yml index 9d9813fc384..2722545fa83 100644 --- a/dag/archive/health.yml +++ b/dag/archive/health.yml @@ -38,7 +38,7 @@ steps: - data://meadow/oecd/2023-08-11/road_accidents - data://garden/oecd/2018-03-11/road_deaths_and_injuries -# GBD 2019 + # GBD 2019 # IHME GBD Leading cause of deaths - to archive data://meadow/ihme_gbd/2023-10-04/cause_hierarchy: @@ -49,7 +49,7 @@ steps: data://grapher/ihme_gbd/2023-10-04/leading_causes_child_deaths: - data://garden/ihme_gbd/2023-10-04/leading_causes_child_deaths -# IHME Global Burden of Disease - Prevalence and incidence + # IHME Global Burden of Disease - Prevalence and incidence data://meadow/ihme_gbd/2019/gbd_prevalence: - walden://ihme_gbd/2019/gbd_prevalence data://garden/ihme_gbd/2019/gbd_prevalence: @@ -118,4 +118,12 @@ steps: data://garden/who/2023-08-04/icd_codes: - data://meadow/who/2023-08-04/icd_codes 
data://grapher/who/2023-08-04/icd_codes: - - data://garden/who/2023-08-04/icd_codes \ No newline at end of file + - data://garden/who/2023-08-04/icd_codes + + # MICROBE - removing some unneeded datasets from grapher + data-private://grapher/antibiotics/2024-12-04/microbe_total_pathogens: + - data-private://garden/antibiotics/2024-12-04/microbe_total_pathogens + data-private://grapher/antibiotics/2024-12-02/total_pathogen_bloodstream: + - data-private://garden/antibiotics/2024-12-02/total_pathogen_bloodstream + data-private://grapher/antibiotics/2024-11-20/pathogen_bloodstream: + - data-private://garden/antibiotics/2024-11-20/pathogen_bloodstream diff --git a/dag/archive/main.yml b/dag/archive/main.yml index b551d5c91af..bf53b69a855 100644 --- a/dag/archive/main.yml +++ b/dag/archive/main.yml @@ -1,41 +1,4 @@ steps: - # Homicide - UNODC - data://meadow/homicide/2023-01-04/unodc: - - snapshot://homicide/2023-01-04/unodc.xlsx - data://garden/homicide/2023-01-04/unodc: - - data://meadow/homicide/2023-01-04/unodc - # - # Met Office Hadley Centre - Near surface temperature anomaly (2023-01-02). - # - data://meadow/met_office_hadley_centre/2023-01-02/near_surface_temperature: - - snapshot://met_office_hadley_centre/2023-01-02/near_surface_temperature_global.csv - - snapshot://met_office_hadley_centre/2023-01-02/near_surface_temperature_northern_hemisphere.csv - - snapshot://met_office_hadley_centre/2023-01-02/near_surface_temperature_southern_hemisphere.csv - data://garden/met_office_hadley_centre/2023-01-02/near_surface_temperature: - - data://meadow/met_office_hadley_centre/2023-01-02/near_surface_temperature - data://grapher/met_office_hadley_centre/2023-01-02/near_surface_temperature: - - data://garden/met_office_hadley_centre/2023-01-02/near_surface_temperature - # - # Met Office Hadley Centre - Near surface temperature anomaly (2023-01-17). - # - data://meadow/met_office_hadley_centre/2023-01-17/near_surface_temperature: - - snapshot://met_office_hadley_centre/2023-01-17/near_surface_temperature_global.csv - - snapshot://met_office_hadley_centre/2023-01-17/near_surface_temperature_northern_hemisphere.csv - - snapshot://met_office_hadley_centre/2023-01-17/near_surface_temperature_southern_hemisphere.csv - data://garden/met_office_hadley_centre/2023-01-17/near_surface_temperature: - - data://meadow/met_office_hadley_centre/2023-01-17/near_surface_temperature - data://grapher/met_office_hadley_centre/2023-01-17/near_surface_temperature: - - data://garden/met_office_hadley_centre/2023-01-17/near_surface_temperature - - - # Homicide - UNODC - to archive - data://meadow/homicide/2023-01-27/unodc: - - snapshot://homicide/2023-01-27/unodc.xlsx - data://garden/homicide/2023-01-27/unodc: - - data://meadow/homicide/2023-01-27/unodc - data://grapher/homicide/2023-01-27/unodc: - - data://garden/homicide/2023-01-27/unodc - # SDG dataset assembled from https://sdg-tracker.org/ # This is commented now to avoid `etl.helpers.LatestVersionOfStepShouldBeActive` error # data://garden/sdg/latest/sdg: @@ -69,28 +32,6 @@ steps: # - backport://backport/owid/latest/dataset_943_sexual_violence__unicef__2017 # - data://garden/worldbank_wdi/2022-05-26/wdi - # - # EM-DAT Natural disasters (2022). - # - data://meadow/emdat/2022-11-24/natural_disasters: - snapshot://emdat/2022-11-24/natural_disasters.xlsx - # The following dataset has a table for yearly data and another for decadal data.
- data://garden/emdat/2022-11-24/natural_disasters: - - data://meadow/emdat/2022-11-24/natural_disasters - - data://garden/owid/latest/key_indicators - - data://garden/wb/2021-07-01/wb_income - - data://garden/regions/2023-01-01/regions - - data://garden/worldbank_wdi/2022-05-26/wdi - # The following dataset has all (yearly and decadal) variables together. - data://grapher/emdat/2022-11-24/natural_disasters: - - data://garden/emdat/2022-11-24/natural_disasters - # The following dataset has only global data, and entity corresponds to the type of disaster. - data://grapher/emdat/2022-11-24/natural_disasters_global_by_type: - - data://garden/emdat/2022-11-24/natural_disasters - # Natural disasters explorer. - data://explorers/emdat/2022-12-07/natural_disasters: - - data://garden/emdat/2022-11-24/natural_disasters - # International Monetary Fund, World Economic Outlook data://meadow/imf/2023-05-02/world_economic_outlook: - snapshot://imf/2023-05-02/world_economic_outlook.xls @@ -316,6 +257,20 @@ steps: data://grapher/itopf/2023-05-18/oil_spills: - data://garden/itopf/2023-05-18/oil_spills + data://garden/countries/2023-09-25/gleditsch: + - data://meadow/countries/2023-09-25/gleditsch + - data://garden/demography/2023-03-31/population + data://grapher/countries/2023-10-01/gleditsch: + - data://garden/countries/2023-09-25/gleditsch + + # International Monetary Fund, World Economic Outlook + data://meadow/imf/2024-05-02/world_economic_outlook: + - snapshot://imf/2024-05-02/world_economic_outlook.xls + data://garden/imf/2024-05-02/world_economic_outlook: + - data://meadow/imf/2024-05-02/world_economic_outlook + data://grapher/imf/2024-05-02/world_economic_outlook: + - data://garden/imf/2024-05-02/world_economic_outlook + include: # Include all active steps plus all archive steps. - dag/main.yml diff --git a/dag/archive/papers.yml b/dag/archive/papers.yml index e7c97acb8c5..a29497fbac4 100644 --- a/dag/archive/papers.yml +++ b/dag/archive/papers.yml @@ -2,22 +2,6 @@ steps: # # Farmer & Lafond (2016). # - data://meadow/papers/2023-01-04/farmer_lafond_2016: - - snapshot://papers/2023-01-04/farmer_lafond_2016.csv - data://garden/papers/2023-01-04/farmer_lafond_2016: - - data://meadow/papers/2023-01-04/farmer_lafond_2016 - data://grapher/papers/2023-01-04/farmer_lafond_2016: - - data://garden/papers/2023-01-04/farmer_lafond_2016 - # - # Nemet (2009). - # - data://meadow/papers/2023-01-04/nemet_2009: - - snapshot://papers/2023-01-04/nemet_2009.csv - data://garden/papers/2023-01-04/nemet_2009: - - data://meadow/papers/2023-01-04/nemet_2009 - # - # Farmer & Lafond (2016). 
- # data://meadow/papers/2023-07-10/farmer_lafond_2016: - snapshot://papers/2023-01-04/farmer_lafond_2016.csv data://garden/papers/2023-07-10/farmer_lafond_2016: diff --git a/dag/archive/poverty_inequality.yml b/dag/archive/poverty_inequality.yml index d35c56ca854..8ca39d6197c 100644 --- a/dag/archive/poverty_inequality.yml +++ b/dag/archive/poverty_inequality.yml @@ -35,3 +35,20 @@ steps: - data://garden/wb/2024-03-27/world_bank_pip data://grapher/wb/2024-03-27/world_bank_pip_2017ppp: - data://garden/wb/2024-03-27/world_bank_pip + + # Multidimensional Poverty Index + data://meadow/ophi/2023-07-05/multidimensional_poverty_index: + - snapshot://ophi/2023-07-05/multidimensional_poverty_index.csv + data://garden/ophi/2023-07-05/multidimensional_poverty_index: + - data://meadow/ophi/2023-07-05/multidimensional_poverty_index + data://grapher/ophi/2023-07-05/multidimensional_poverty_index: + - data://garden/ophi/2023-07-05/multidimensional_poverty_index + + # Poverty projections from the World Bank + data://meadow/wb/2024-06-26/poverty_projections: + - snapshot://wb/2024-06-26/poverty_projections_number_global.csv + - snapshot://wb/2024-06-26/poverty_projections_share_regions.csv + data://garden/wb/2024-06-26/poverty_projections: + - data://meadow/wb/2024-06-26/poverty_projections + data://grapher/wb/2024-06-26/poverty_projections: + - data://garden/wb/2024-06-26/poverty_projections diff --git a/dag/archive/war.yml b/dag/archive/war.yml index 541bc01dcb3..13d2a970e69 100644 --- a/dag/archive/war.yml +++ b/dag/archive/war.yml @@ -114,3 +114,17 @@ steps: - data://garden/countries/2023-09-25/gleditsch data://grapher/war/2023-10-24/ucdp_prio: - data://garden/war/2023-09-21/ucdp_prio + data://grapher/war/2023-09-21/prio_v31: + - data://garden/war/2023-09-21/prio_v31 + + # UCDP (candidate data) + data://meadow/war/2024-10-02/ucdp_ced: + - snapshot://war/2024-10-02/ucdp_ced.csv + data://garden/war/2024-10-02/ucdp_monthly: + - data://garden/demography/2024-07-15/population + - data://meadow/war/2024-08-26/ucdp + - data://garden/countries/2024-08-27/gleditsch + - data://meadow/war/2024-10-02/ucdp_ced + - data://garden/geography/2023-11-28/nat_earth_110 + data://grapher/war/2024-10-02/ucdp_monthly: + - data://garden/war/2024-10-02/ucdp_monthly diff --git a/dag/artificial_intelligence.yml b/dag/artificial_intelligence.yml index e44a62c2241..868a40ad852 100644 --- a/dag/artificial_intelligence.yml +++ b/dag/artificial_intelligence.yml @@ -1,50 +1,51 @@ steps: - +########### UPDATED MONTHLY ############################################################################# # Main EPOCH dataset - data://meadow/artificial_intelligence/2024-10-01/epoch: - - snapshot://artificial_intelligence/2024-10-01/epoch.csv - data://garden/artificial_intelligence/2024-10-01/epoch: - - data://meadow/artificial_intelligence/2024-10-01/epoch - data://grapher/artificial_intelligence/2024-10-01/epoch: - - data://garden/artificial_intelligence/2024-10-01/epoch + data://meadow/artificial_intelligence/2024-12-05/epoch: + - snapshot://artificial_intelligence/2024-12-05/epoch.csv + data://garden/artificial_intelligence/2024-12-05/epoch: + - data://meadow/artificial_intelligence/2024-12-05/epoch + data://grapher/artificial_intelligence/2024-12-05/epoch: + - data://garden/artificial_intelligence/2024-12-05/epoch # Main EPOCH dataset regression lines - data://garden/artificial_intelligence/2024-10-01/epoch_regressions: - - data://garden/artificial_intelligence/2024-10-01/epoch - 
data://grapher/artificial_intelligence/2024-10-01/epoch_regressions: - - data://garden/artificial_intelligence/2024-10-01/epoch_regressions + data://garden/artificial_intelligence/2024-12-05/epoch_regressions: + - data://garden/artificial_intelligence/2024-12-05/epoch + data://grapher/artificial_intelligence/2024-12-05/epoch_regressions: + - data://garden/artificial_intelligence/2024-12-05/epoch_regressions # EPOCH aggregates by domain - data://garden/artificial_intelligence/2024-10-01/epoch_aggregates_domain: - - data://meadow/artificial_intelligence/2024-10-01/epoch - data://grapher/artificial_intelligence/2024-10-01/epoch_aggregates_domain: - - data://garden/artificial_intelligence/2024-10-01/epoch_aggregates_domain + data://garden/artificial_intelligence/2024-12-05/epoch_aggregates_domain: + - data://meadow/artificial_intelligence/2024-12-05/epoch + data://grapher/artificial_intelligence/2024-12-05/epoch_aggregates_domain: + - data://garden/artificial_intelligence/2024-12-05/epoch_aggregates_domain # EPOCH aggregates by researcher affiliation - data://garden/artificial_intelligence/2024-10-01/epoch_aggregates_affiliation: - - data://garden/artificial_intelligence/2024-10-01/epoch - data://grapher/artificial_intelligence/2024-10-01/epoch_aggregates_affiliation: - - data://garden/artificial_intelligence/2024-10-01/epoch_aggregates_affiliation + data://garden/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation: + - data://garden/artificial_intelligence/2024-12-05/epoch + data://grapher/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation: + - data://garden/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation # EPOCH dataset on Compute Intensive AI - data://meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive: - - snapshot://artificial_intelligence/2024-10-01/epoch_compute_intensive.csv - data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive: - - data://meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive - data://grapher/artificial_intelligence/2024-10-01/epoch_compute_intensive: - - data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive + data://meadow/artificial_intelligence/2024-12-05/epoch_compute_intensive: + - snapshot://artificial_intelligence/2024-12-05/epoch_compute_intensive.csv + data://garden/artificial_intelligence/2024-12-05/epoch_compute_intensive: + - data://meadow/artificial_intelligence/2024-12-05/epoch_compute_intensive + + # EPOCH dataset on Compute Intensive AI, aggregates by country - data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_countries: - - data://meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive - data://grapher/artificial_intelligence/2024-10-01/epoch_compute_intensive_countries: - - data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_countries + data://garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries: + - data://meadow/artificial_intelligence/2024-12-05/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries: + - data://garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries # EPOCH dataset on Compute Intensive AI, aggregates by domain - data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_domain: - - data://meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive - data://grapher/artificial_intelligence/2024-10-01/epoch_compute_intensive_domain: - -
data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_domain + data://garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain: + - data://meadow/artificial_intelligence/2024-12-05/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain: + - data://garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain + + ############### OTHERS ##################################################################################### # Large Language Models and Compute (EPOCH) data://garden/artificial_intelligence/2024-02-15/epoch_llms: diff --git a/dag/climate.yml b/dag/climate.yml index cdd55fce79e..84dd5b43073 100644 --- a/dag/climate.yml +++ b/dag/climate.yml @@ -32,7 +32,7 @@ steps: # Copernicus Climate Change Service - Surface temperature. # data://meadow/climate/2023-12-20/surface_temperature: - - snapshot://climate/2024-10-13/surface_temperature.zip + - snapshot://climate/2024-11-05/surface_temperature.zip - snapshot://countries/2023-12-27/world_bank.zip data://garden/climate/2023-12-20/surface_temperature: - data://meadow/climate/2023-12-20/surface_temperature @@ -50,11 +50,22 @@ steps: - data://garden/climate/2023-12-20/surface_temperature data://grapher/climate/2023-12-20/surface_global_monthly_anomaly_all_countries: - data://garden/climate/2023-12-20/surface_temperature + + # + # Copernicus Climate Change Service - Precipitation. + # + data://meadow/climate/2024-11-19/total_precipitation: + - snapshot://climate/2024-11-19/total_precipitation.zip + - snapshot://countries/2023-12-27/world_bank.zip + data://garden/climate/2024-11-19/total_precipitation: + - data://meadow/climate/2024-11-19/total_precipitation + data://grapher/climate/2024-11-19/total_precipitation_annual: + - data://garden/climate/2024-11-19/total_precipitation # # Climate change impacts data explorer. # data://explorers/climate/latest/climate_change_impacts: - - data://garden/climate/2024-09-30/climate_change_impacts + - data://garden/climate/2024-11-18/climate_change_impacts # # Global Wildfire Information System - Monthly burned area. # @@ -122,146 +133,146 @@ steps: data://garden/epa/2024-04-17/mass_balance_us_glaciers: - data://meadow/epa/2024-04-17/mass_balance_us_glaciers # - # Met Office Hadley Centre - Sea surface temperature. - # - data://meadow/climate/2024-09-30/sea_surface_temperature: - - snapshot://climate/2024-09-30/sea_surface_temperature_world.csv - - snapshot://climate/2024-09-30/sea_surface_temperature_northern_hemisphere.csv - - snapshot://climate/2024-09-30/sea_surface_temperature_southern_hemisphere.csv - # - # NOAA National Centers for Environmental Information - Ocean Heat Content. + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). # - data://meadow/climate/2024-09-30/ocean_heat_content: - - snapshot://climate/2024-07-23/ocean_heat_content_monthly_world_700m.csv - - snapshot://climate/2024-07-23/ocean_heat_content_monthly_world_2000m.csv - - snapshot://climate/2024-07-23/ocean_heat_content_annual_world_700m.csv - - snapshot://climate/2024-07-23/ocean_heat_content_annual_world_2000m.csv + data://meadow/climate/2024-11-18/hawaii_ocean_time_series: + - snapshot://climate/2024-11-18/hawaii_ocean_time_series.csv # # NSIDC - Arctic sea ice extent. # - data://meadow/climate/2024-09-30/sea_ice_index: - - snapshot://climate/2024-09-30/sea_ice_index.xlsx - # - # Rutgers University Global Snow Lab - Snow Cover Extent. 
- # - data://meadow/climate/2024-09-30/snow_cover_extent: - - snapshot://climate/2024-09-30/snow_cover_extent_north_america.csv - - snapshot://climate/2024-09-30/snow_cover_extent_northern_hemisphere.csv + data://meadow/climate/2024-11-18/sea_ice_index: + - snapshot://climate/2024-11-18/sea_ice_index.xlsx # # GISS - Surface temperature analysis. # - data://meadow/climate/2024-09-30/surface_temperature_analysis: - - snapshot://climate/2024-07-23/surface_temperature_analysis_world.csv - - snapshot://climate/2024-07-23/surface_temperature_analysis_southern_hemisphere.csv - - snapshot://climate/2024-07-23/surface_temperature_analysis_northern_hemisphere.csv - # - # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). - # - data://meadow/climate/2024-09-30/hawaii_ocean_time_series: - - snapshot://climate/2024-09-30/hawaii_ocean_time_series.csv + data://meadow/climate/2024-11-18/surface_temperature_analysis: + - snapshot://climate/2024-11-18/surface_temperature_analysis_world.csv + - snapshot://climate/2024-11-18/surface_temperature_analysis_southern_hemisphere.csv + - snapshot://climate/2024-11-18/surface_temperature_analysis_northern_hemisphere.csv # # NOAA Global Monitoring Laboratory - GHG concentration. # - data://meadow/climate/2024-09-30/ghg_concentration: - - snapshot://climate/2024-09-30/n2o_concentration_monthly.csv - - snapshot://climate/2024-09-30/co2_concentration_monthly.csv - - snapshot://climate/2024-09-30/ch4_concentration_monthly.csv + data://meadow/climate/2024-11-18/ghg_concentration: + - snapshot://climate/2024-11-18/n2o_concentration_monthly.csv + - snapshot://climate/2024-11-18/co2_concentration_monthly.csv + - snapshot://climate/2024-11-18/ch4_concentration_monthly.csv # # Met Office Hadley Centre - Sea surface temperature. # - data://garden/climate/2024-09-30/sea_surface_temperature: - - data://meadow/climate/2024-09-30/sea_surface_temperature + data://meadow/climate/2024-11-18/sea_surface_temperature: + - snapshot://climate/2024-11-18/sea_surface_temperature_world.csv + - snapshot://climate/2024-11-18/sea_surface_temperature_southern_hemisphere.csv + - snapshot://climate/2024-11-18/sea_surface_temperature_northern_hemisphere.csv # - # NOAA National Centers for Environmental Information - Ocean Heat Content. + # Rutgers University Global Snow Lab - Snow Cover Extent. # - data://garden/climate/2024-09-30/ocean_heat_content: - - data://meadow/climate/2024-09-30/ocean_heat_content + data://meadow/climate/2024-11-18/snow_cover_extent: + - snapshot://climate/2024-11-18/snow_cover_extent_northern_hemisphere.csv + - snapshot://climate/2024-11-18/snow_cover_extent_north_america.csv # - # NSIDC - Arctic sea ice extent. + # NOAA National Centers for Environmental Information - Ocean Heat Content. # - data://garden/climate/2024-09-30/sea_ice_index: - - data://meadow/climate/2024-09-30/sea_ice_index + data://meadow/climate/2024-11-18/ocean_heat_content: + - snapshot://climate/2024-11-18/ocean_heat_content_monthly_world_2000m.csv + - snapshot://climate/2024-11-18/ocean_heat_content_annual_world_700m.csv + - snapshot://climate/2024-11-18/ocean_heat_content_monthly_world_700m.csv + - snapshot://climate/2024-11-18/ocean_heat_content_annual_world_2000m.csv # - # Rutgers University Global Snow Lab - Snow Cover Extent. + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). 
# - data://garden/climate/2024-09-30/snow_cover_extent: - - data://meadow/climate/2024-09-30/snow_cover_extent + data://garden/climate/2024-11-18/ocean_ph_levels: + - data://meadow/climate/2024-11-18/hawaii_ocean_time_series # - # GISS - Surface temperature analysis. + # NSIDC - Arctic sea ice extent. # - data://garden/climate/2024-09-30/surface_temperature_analysis: - - data://meadow/climate/2024-09-30/surface_temperature_analysis + data://garden/climate/2024-11-18/sea_ice_index: + - data://meadow/climate/2024-11-18/sea_ice_index # - # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). + # GISS - Surface temperature analysis. # - data://garden/climate/2024-09-30/ocean_ph_levels: - - data://meadow/climate/2024-09-30/hawaii_ocean_time_series + data://garden/climate/2024-11-18/surface_temperature_analysis: + - data://meadow/climate/2024-11-18/surface_temperature_analysis # # NOAA Global Monitoring Laboratory - GHG concentration. # - data://garden/climate/2024-09-30/ghg_concentration: - - data://meadow/climate/2024-09-30/ghg_concentration + data://garden/climate/2024-11-18/ghg_concentration: + - data://meadow/climate/2024-11-18/ghg_concentration # - # NSIDC - Monthly sea ice anomalies ("country" for month names, "year" for years, one indicator per hemisphere). + # Met Office Hadley Centre - Sea surface temperature. # - data://grapher/climate/2024-09-30/sea_ice_anomalies_by_month: - - data://garden/climate/2024-09-30/sea_ice_index + data://garden/climate/2024-11-18/sea_surface_temperature: + - data://meadow/climate/2024-11-18/sea_surface_temperature # - # NSIDC - Monthly sea ice extent ("country" for years, "year" for month number, one indicator per hemisphere). + # Rutgers University Global Snow Lab - Snow Cover Extent. # - data://grapher/climate/2024-09-30/sea_ice_extent_by_year: - - data://garden/climate/2024-09-30/sea_ice_index + data://garden/climate/2024-11-18/snow_cover_extent: + - data://meadow/climate/2024-11-18/snow_cover_extent # - # NSIDC - Monthly sea ice extent ("country" for decades and latest year, "year" for month number, one indicator per hemisphere). + # NOAA National Centers for Environmental Information - Ocean Heat Content. # - data://grapher/climate/2024-09-30/sea_ice_extent_by_decade: - - data://garden/climate/2024-09-30/sea_ice_index + data://garden/climate/2024-11-18/ocean_heat_content: + - data://meadow/climate/2024-11-18/ocean_heat_content # # Various sources - Long-run greenhouse gas concentration. # - data://garden/climate/2024-09-30/long_run_ghg_concentration: + data://garden/climate/2024-11-18/long_run_ghg_concentration: + - data://garden/climate/2024-11-18/ghg_concentration - data://garden/epa/2024-04-17/ghg_concentration - - data://garden/climate/2024-09-30/ghg_concentration # # Various sources - Climate change impacts. 
# - data://garden/climate/2024-09-30/climate_change_impacts: + data://garden/climate/2024-11-18/climate_change_impacts: + - data://garden/climate/2024-11-18/snow_cover_extent + - data://garden/climate/2024-11-18/sea_surface_temperature + - data://garden/climate/2024-11-18/ghg_concentration + - data://garden/climate/2024-11-18/ocean_ph_levels + - data://garden/climate/2024-11-18/long_run_ghg_concentration + - data://garden/climate/2024-11-18/sea_ice_index + - data://garden/climate/2024-11-18/surface_temperature_analysis - data://garden/epa/2024-04-17/ocean_heat_content - - data://garden/climate/2024-09-30/long_run_ghg_concentration - - data://garden/epa/2024-04-17/ice_sheet_mass_balance - - data://garden/climate/2024-09-30/sea_surface_temperature - data://garden/epa/2024-04-17/mass_balance_us_glaciers - data://garden/climate/2024-01-28/global_sea_level - - data://garden/climate/2024-09-30/ocean_heat_content - - data://garden/climate/2024-09-30/ocean_ph_levels - - data://garden/climate/2024-09-30/snow_cover_extent - - data://garden/climate/2024-09-30/surface_temperature_analysis - - data://garden/climate/2024-09-30/sea_ice_index - - data://garden/climate/2024-09-30/ghg_concentration + - data://garden/epa/2024-04-17/ice_sheet_mass_balance + - data://garden/climate/2024-11-18/ocean_heat_content + # + # Various sources - Climate change impacts (monthly). + # + data://grapher/climate/2024-11-18/climate_change_impacts_monthly: + - data://garden/climate/2024-11-18/climate_change_impacts # # Various sources - Climate change impacts (annual). # - data://grapher/climate/2024-09-30/climate_change_impacts_annual: - - data://garden/climate/2024-09-30/climate_change_impacts + data://grapher/climate/2024-11-18/climate_change_impacts_annual: + - data://garden/climate/2024-11-18/climate_change_impacts # - # Various sources - Climate change impacts (monthly). + # NSIDC - Monthly sea ice extent ("country" for years, "year" for month number, one indicator per hemisphere). + # + data://grapher/climate/2024-11-18/sea_ice_extent_by_year: + - data://garden/climate/2024-11-18/sea_ice_index # - data://grapher/climate/2024-09-30/climate_change_impacts_monthly: - - data://garden/climate/2024-09-30/climate_change_impacts + # NSIDC - Monthly sea ice extent ("country" for decades and latest year, "year" for month number, one indicator per hemisphere). + # + data://grapher/climate/2024-11-18/sea_ice_extent_by_decade: + - data://garden/climate/2024-11-18/sea_ice_index # # Met Office Hadley Centre - Near surface temperature anomaly. # - data://meadow/met_office_hadley_centre/2024-10-22/near_surface_temperature: - - snapshot://met_office_hadley_centre/2024-10-22/near_surface_temperature_northern_hemisphere.csv - - snapshot://met_office_hadley_centre/2024-10-22/near_surface_temperature_southern_hemisphere.csv - - snapshot://met_office_hadley_centre/2024-10-22/near_surface_temperature_global.csv + data://meadow/met_office_hadley_centre/2024-11-18/near_surface_temperature: + - snapshot://met_office_hadley_centre/2024-11-18/near_surface_temperature_global.csv + - snapshot://met_office_hadley_centre/2024-11-18/near_surface_temperature_southern_hemisphere.csv + - snapshot://met_office_hadley_centre/2024-11-18/near_surface_temperature_northern_hemisphere.csv # # Met Office Hadley Centre - Near surface temperature anomaly. 
# - data://garden/met_office_hadley_centre/2024-10-22/near_surface_temperature: - - data://meadow/met_office_hadley_centre/2024-10-22/near_surface_temperature + data://garden/met_office_hadley_centre/2024-11-18/near_surface_temperature: + - data://meadow/met_office_hadley_centre/2024-11-18/near_surface_temperature # # Met Office Hadley Centre - Near surface temperature anomaly. # - data://grapher/met_office_hadley_centre/2024-10-22/near_surface_temperature: - - data://garden/met_office_hadley_centre/2024-10-22/near_surface_temperature + data://grapher/met_office_hadley_centre/2024-11-18/near_surface_temperature: + - data://garden/met_office_hadley_centre/2024-11-18/near_surface_temperature + # + # NSIDC - Monthly sea ice anomalies ("country" for month names, "year" for years, one indicator per hemisphere). + # + data://grapher/climate/2024-11-18/sea_ice_anomalies_by_month: + - data://garden/climate/2024-11-18/sea_ice_index diff --git a/dag/covid.yml b/dag/covid.yml index 23ee826dbdb..d0c1e5a290b 100644 --- a/dag/covid.yml +++ b/dag/covid.yml @@ -27,7 +27,7 @@ steps: - data://garden/regions/2023-01-01/regions # Demography - data://garden/demography/2024-07-15/population - - data://garden/demography/2023-10-09/life_expectancy + - data://garden/demography/2024-12-03/life_expectancy - data://garden/un/2024-07-12/un_wpp # Econ - data://garden/wb/2024-10-07/world_bank_pip @@ -288,6 +288,30 @@ steps: data://grapher/covid/latest/deaths_vax_status: - data://garden/covid/latest/deaths_vax_status + # Countries reporting data + data://meadow/covid/latest/countries_reporting: + - snapshot://covid/2024-11-05/github_stats_vax_reporting.csv + data://garden/covid/latest/countries_reporting: + - data://meadow/covid/latest/vaccinations_global + - data://meadow/covid/latest/countries_reporting + data://grapher/covid/latest/countries_reporting: + - data://garden/covid/latest/countries_reporting + + # Contributions to GitHub + data://meadow/covid/2024-11-05/github_stats: + - snapshot://covid/2024-11-05/github_stats_issues.csv + - snapshot://covid/2024-11-05/github_stats_issues_comments.csv + - snapshot://covid/2024-11-05/github_stats_issues_users.csv + - snapshot://covid/2024-11-05/github_stats_pr.csv + - snapshot://covid/2024-11-05/github_stats_pr_comments.csv + - snapshot://covid/2024-11-05/github_stats_pr_users.csv + - snapshot://covid/2024-11-05/github_stats_commits.csv + - snapshot://covid/2024-11-05/github_stats_commits_users.csv + data://garden/covid/2024-11-05/github_stats: + - data://meadow/covid/2024-11-05/github_stats + data://grapher/covid/2024-11-05/github_stats: + - data://garden/covid/2024-11-05/github_stats + ###################################### # MULTIDIM export://multidim/covid/latest/covid: diff --git a/dag/demography.yml b/dag/demography.yml index 2359aaacdb1..536bbc339ad 100644 --- a/dag/demography.yml +++ b/dag/demography.yml @@ -1,7 +1,7 @@ steps: - #################################### - # Population ####################### - #################################### + ######################################################################## + # Population # + ######################################################################## # Population OMM ############################# # 2022-12-08: HYDE 3.2 + Gapminder + UN WPP 2022 @@ -111,50 +111,21 @@ steps: data://grapher/demography/2023-07-03/world_population_comparison: - data://garden/demography/2023-06-27/world_population_comparison - #################################### - # Life expectancy ################## - 
#################################### - - # HMD - data://meadow/hmd/2023-09-19/hmd: - - snapshot://hmd/2023-09-18/hmd.zip - data://garden/hmd/2023-09-19/hmd: - - data://meadow/hmd/2023-09-19/hmd - data://grapher/hmd/2023-09-19/hmd: - - data://garden/hmd/2023-09-19/hmd - - # Gini Life Expectancy Inequality - data://garden/demography/2023-10-04/gini_le: - - data://garden/demography/2023-10-03/life_tables - data://grapher/demography/2023-10-04/gini_le: - - data://garden/demography/2023-10-04/gini_le - - # Survivorship ages (HMD-derived) - data://garden/demography/2023-09-27/survivor_percentiles: - - data://garden/hmd/2023-09-19/hmd - data://grapher/demography/2023-09-27/survivor_percentiles: - - data://garden/demography/2023-09-27/survivor_percentiles - - # Phi-gender life expectancy inequality - data://garden/demography/2023-10-03/phi_gender_le: - - data://garden/demography/2023-10-03/life_tables - data://grapher/demography/2023-10-03/phi_gender_le: - - data://garden/demography/2023-10-03/phi_gender_le + # Maddison working paper (2022) + data://meadow/ggdc/2024-01-19/maddison_federico_paper: + - snapshot://ggdc/2024-01-19/maddison_federico_paper.xlsx + data://garden/ggdc/2024-01-19/maddison_federico_paper: + - data://meadow/ggdc/2024-01-19/maddison_federico_paper - # UN WPP Life Tables - data://meadow/un/2023-10-02/un_wpp_lt: - - snapshot://un/2023-10-02/un_wpp_lt_all.zip - - snapshot://un/2023-10-02/un_wpp_lt_f.zip - - snapshot://un/2023-10-02/un_wpp_lt_m.zip - data://garden/un/2023-10-02/un_wpp_lt: - - data://meadow/un/2023-10-02/un_wpp_lt + # UN WPP largest age-group per country + data://garden/un/2024-03-14/un_wpp_most: + - data://garden/un/2024-07-12/un_wpp + data://grapher/un/2024-03-14/un_wpp_most: + - data://garden/un/2024-03-14/un_wpp_most - # UN WPP + HMD Life Tables - data://garden/demography/2023-10-03/life_tables: - - data://garden/hmd/2023-09-19/hmd - - data://garden/un/2023-10-02/un_wpp_lt - data://grapher/demography/2023-10-04/life_tables: - - data://garden/demography/2023-10-03/life_tables + ######################################################################## + # Life expectancy # + ######################################################################## # Zijdeman et al data://meadow/demography/2023-10-10/zijdeman_et_al_2015: @@ -168,14 +139,62 @@ steps: data://garden/demography/2023-10-10/riley_2005: - data://meadow/demography/2023-10-10/riley_2005 + # Human Mortality Database + data://meadow/hmd/2024-12-01/hmd: + - snapshot://hmd/2024-11-27/hmd.zip + data://garden/hmd/2024-12-01/hmd: + - data://meadow/hmd/2024-12-01/hmd + data://grapher/hmd/2024-12-01/hmd: + - data://garden/hmd/2024-12-01/hmd + + # UN WPP Life Tables + data://meadow/un/2024-12-02/un_wpp_lt: + - snapshot://un/2024-12-02/un_wpp_lt_m.csv + - snapshot://un/2024-12-02/un_wpp_lt_all.csv + - snapshot://un/2024-12-02/un_wpp_lt_f.csv + data://garden/un/2024-12-02/un_wpp_lt: + - data://meadow/un/2024-12-02/un_wpp_lt + + # Survivorship ages (HMD-derived) + data://garden/demography/2024-12-02/survivor_percentiles: + - data://garden/hmd/2024-12-01/hmd + data://grapher/demography/2024-12-02/survivor_percentiles: + - data://garden/demography/2024-12-02/survivor_percentiles + + # UN WPP + HMD Life Tables + data://garden/demography/2024-12-03/life_tables: + - data://garden/hmd/2024-12-01/hmd + - data://garden/un/2024-12-02/un_wpp_lt + data://grapher/demography/2024-12-03/life_tables: + - data://garden/demography/2024-12-03/life_tables + # OMM: Life Expectancy - data://garden/demography/2023-10-09/life_expectancy: - - 
data://garden/demography/2023-10-03/life_tables - - data://garden/demography/2023-10-10/zijdeman_et_al_2015 + data://garden/demography/2024-12-03/life_expectancy: - data://garden/demography/2023-10-10/riley_2005 - - data://garden/un/2022-07-11/un_wpp - data://grapher/demography/2023-10-10/life_expectancy: - - data://garden/demography/2023-10-09/life_expectancy + - data://garden/demography/2023-10-10/zijdeman_et_al_2015 + - data://garden/demography/2024-12-03/life_tables + - data://garden/un/2024-07-12/un_wpp + data://grapher/demography/2024-12-03/life_expectancy: + - data://garden/demography/2024-12-03/life_expectancy + + # Broken limits of Life Expectancy + data://garden/demography/2024-12-03/broken_limits_le: + - data://garden/hmd/2024-12-01/hmd + - data://garden/demography/2024-12-03/life_tables + data://grapher/demography/2024-12-03/broken_limits_le: + - data://garden/demography/2024-12-03/broken_limits_le + + # Phi-gender life expectancy inequality + data://garden/demography/2024-12-03/phi_gender_le: + - data://garden/demography/2024-12-03/life_tables + data://grapher/demography/2024-12-03/phi_gender_le: + - data://garden/demography/2024-12-03/phi_gender_le + + # Gini Life Expectancy Inequality + data://garden/demography/2024-12-03/gini_le: + - data://garden/demography/2024-12-03/life_tables + data://grapher/demography/2024-12-03/gini_le: + - data://garden/demography/2024-12-03/gini_le # Life Expectancy OECD data://meadow/oecd/2023-10-11/life_expectancy_birth: @@ -185,13 +204,6 @@ steps: data://grapher/oecd/2023-10-11/life_expectancy_birth: - data://garden/oecd/2023-10-11/life_expectancy_birth - # Broken limits of Life Expectancy - data://garden/demography/2023-10-20/broken_limits_le: - - data://garden/demography/2023-10-03/life_tables - - data://garden/hmd/2023-09-19/hmd - data://grapher/demography/2023-10-20/broken_limits_le: - - data://garden/demography/2023-10-20/broken_limits_le - # Contribution to sex gap in Life Expectancy data://meadow/demography/2023-11-08/le_sex_gap_age_contribution: - snapshot://demography/2023-11-08/le_sex_gap_age_contribution.zip @@ -208,64 +220,53 @@ steps: data://grapher/demography/2023-11-08/modal_age_death: - data://garden/demography/2023-11-08/modal_age_death - # Maddison working paper (2022) - data://meadow/ggdc/2024-01-19/maddison_federico_paper: - - snapshot://ggdc/2024-01-19/maddison_federico_paper.xlsx - data://garden/ggdc/2024-01-19/maddison_federico_paper: - - data://meadow/ggdc/2024-01-19/maddison_federico_paper - - # UN WPP experiments - data://garden/un/2024-03-14/un_wpp_most: - - data://garden/un/2022-07-11/un_wpp - data://grapher/un/2024-03-14/un_wpp_most: - - data://garden/un/2024-03-14/un_wpp_most - - # Migration UN DESA - data://meadow/un/2024-07-16/migrant_stock: - - snapshot://un/2024-07-16/migrant_stock_dest_origin.xlsx - - snapshot://un/2024-07-16/migrant_stock_origin.xlsx - - snapshot://un/2024-07-16/migrant_stock_dest.xlsx - - snapshot://un/2024-07-16/migrant_stock_age_sex.xlsx - data://garden/un/2024-07-16/migrant_stock: - - data://meadow/un/2024-07-16/migrant_stock - data://grapher/un/2024-07-16/migrant_stock: - - data://garden/un/2024-07-16/migrant_stock - - # Internal displacement monitoring centre - data://meadow/idmc/2024-08-02/internal_displacement: - - snapshot://idmc/2024-08-02/internal_displacement.xlsx - - data://garden/demography/2024-07-15/population - data://grapher/idmc/2024-08-02/internal_displacement: - - data://meadow/idmc/2024-08-02/internal_displacement - - # UNHCR refugee data - 
data://meadow/un/2024-07-25/refugee_data: - - snapshot://un/2024-07-25/refugee_data.zip - data://garden/un/2024-07-25/refugee_data: - - data://meadow/un/2024-07-25/refugee_data - - data://garden/demography/2024-07-15/population - - data://garden/un/2024-07-25/resettlement - data://grapher/un/2024-07-25/refugee_data: - - data://garden/un/2024-07-25/refugee_data - - # UNHCR resettlement data - data://meadow/un/2024-07-25/resettlement: - - snapshot://un/2024-07-25/resettlement.zip - data://garden/un/2024-07-25/resettlement: - - data://meadow/un/2024-07-25/resettlement - - data://garden/demography/2024-07-15/population - - # Child migration (UNICEF) - data://meadow/unicef/2024-07-30/child_migration: - - snapshot://unicef/2024-07-30/child_migration.csv - data://garden/unicef/2024-07-30/child_migration: - - data://meadow/unicef/2024-07-30/child_migration - - data://garden/demography/2024-07-15/population - data://grapher/unicef/2024-07-30/child_migration: - - data://garden/unicef/2024-07-30/child_migration + ######################################################################## + # Fertility # + ######################################################################## # Mothers by decadal age-group data://garden/un/2024-10-01/births_by_age: - data://garden/un/2024-07-12/un_wpp data://grapher/un/2024-10-01/births_by_age: - data://garden/un/2024-10-01/births_by_age + + # Human Fertility Database + data://meadow/hmd/2024-11-19/hfd: + - snapshot://hmd/2024-11-19/hfd.zip + data://garden/hmd/2024-11-19/hfd: + - data://meadow/hmd/2024-11-19/hfd + data://grapher/hmd/2024-11-19/hfd: + - data://garden/hmd/2024-11-19/hfd + + # Multiple births + data://meadow/demography/2024-11-26/multiple_births: + - snapshot://demography/2024-11-26/multiple_births.7z + data://garden/demography/2024-11-26/multiple_births: + - data://meadow/demography/2024-11-26/multiple_births + data://grapher/demography/2024-11-26/multiple_births: + - data://garden/demography/2024-11-26/multiple_births + + # OMM: Fertility Rate -- HFD + UN WPP + data://garden/demography/2024-12-03/fertility_rate: + - data://garden/hmd/2024-11-19/hfd + - data://garden/un/2024-07-12/un_wpp + data://grapher/demography/2024-12-03/fertility_rate: + - data://garden/demography/2024-12-03/fertility_rate + + # OMM: Birth rate -- HFD + UN WPP + data://garden/demography/2024-12-03/birth_rate: + - data://garden/hmd/2024-12-01/hmd + - data://garden/un/2024-07-12/un_wpp + data://grapher/demography/2024-12-03/birth_rate: + - data://garden/demography/2024-12-03/birth_rate + + # HMD country data + data://meadow/hmd/2024-12-03/hmd_country: + - snapshot://hmd/2024-12-01/hmd_country.zip + + # HMD - Birth rate by month + data://garden/hmd/2024-12-03/hmd_country: + - data://meadow/hmd/2024-12-03/hmd_country + - data://garden/hmd/2024-12-01/hmd + data://grapher/hmd/2024-12-03/hmd_country: + - data://garden/hmd/2024-12-03/hmd_country diff --git a/dag/education.yml b/dag/education.yml index 781f94d2b82..a52b02d95cc 100644 --- a/dag/education.yml +++ b/dag/education.yml @@ -1,17 +1,4 @@ steps: - # World Bank EdStats - data://meadow/wb/2023-07-10/education: - - snapshot://wb/2023-07-10/education.csv - - data://garden/wb/2023-07-10/education: - - data://meadow/wb/2023-07-10/education - - data://garden/education/2017-09-30/public_expenditure - - data://garden/education/2018-04-18/literacy_rates - - data://grapher/wb/2023-07-10/education: - - data://garden/wb/2023-07-10/education - - # Barro and Lee projections dataset 
data://meadow/education/2023-07-17/education_barro_lee_projections: - snapshot://education/2023-07-17/education_barro_lee_projections.csv @@ -32,8 +19,9 @@ steps: data://garden/education/2023-07-17/education_lee_lee: - data://meadow/education/2023-07-17/education_lee_lee - data://garden/regions/2023-01-01/regions - - data://garden/worldbank_wdi/2024-05-20/wdi - data://garden/demography/2022-12-08/population + - data://garden/worldbank_wdi/2024-05-20/wdi + - data://garden/unesco/2024-11-21/enrolment_rates data://grapher/education/2023-07-17/education_lee_lee: - data://garden/education/2023-07-17/education_lee_lee @@ -54,7 +42,7 @@ steps: - snapshot://education/2023-08-14/oecd_education.csv data://garden/education/2023-08-14/oecd_education: - data://meadow/education/2023-08-14/oecd_education - - data://garden/wb/2023-07-10/education + - data://garden/wb/2024-11-04/edstats data://grapher/education/2023-08-14/oecd_education: - data://garden/education/2023-08-14/oecd_education @@ -119,3 +107,24 @@ steps: - snapshot://wb/2024-06-18/edstats_metadata.xls data://grapher/unesco/2024-06-25/education_sdgs: - data://garden/unesco/2024-06-25/education_sdgs + + # UNESCO data on other policy related education indicators + data://meadow/unesco/2024-11-21/enrolment_rates: + - snapshot://unesco/2024-11-21/enrolment_rates.csv + data://garden/unesco/2024-11-21/enrolment_rates: + - data://meadow/unesco/2024-11-21/enrolment_rates + - snapshot://wb/2024-06-18/edstats_metadata.xls + data://grapher/unesco/2024-11-21/enrolment_rates: + - data://garden/unesco/2024-11-21/enrolment_rates + + + # World Bank EdStats + data://meadow/wb/2024-11-04/edstats: + - snapshot://wb/2024-11-04/edstats.csv + data://garden/wb/2024-11-04/edstats: + - data://meadow/wb/2024-11-04/edstats + - data://garden/education/2017-09-30/public_expenditure + - data://garden/education/2018-04-18/literacy_rates + + data://grapher/wb/2024-11-04/edstats: + - data://garden/wb/2024-11-04/edstats diff --git a/dag/emissions.yml b/dag/emissions.yml index 22ea091dde3..e1a32f77169 100644 --- a/dag/emissions.yml +++ b/dag/emissions.yml @@ -1,17 +1,5 @@ steps: # - # Climate Watch - Greenhouse gas emissions by sector (2023-10-31). - # - data://meadow/climate_watch/2023-10-31/emissions_by_sector: - - snapshot://climate_watch/2023-10-31/emissions_by_sector.gz - data://garden/climate_watch/2023-10-31/emissions_by_sector: - - data://meadow/climate_watch/2023-10-31/emissions_by_sector - - data://garden/regions/2023-01-01/regions - - data://garden/demography/2023-03-31/population - - data://garden/wb/2023-04-30/income_groups - data://grapher/climate_watch/2023-10-31/emissions_by_sector: - - data://garden/climate_watch/2023-10-31/emissions_by_sector - # # Andrew - CO2 mitigation curves (2019). # data://meadow/andrew/2019-12-03/co2_mitigation_curves: @@ -24,14 +12,6 @@ steps: data://grapher/andrew/2019-12-03/co2_mitigation_curves_2celsius: - data://garden/andrew/2019-12-03/co2_mitigation_curves # - # GCP - Global Carbon Budget (published on 2023-12-05, updated on 2023-12-12 to use the latest primary energy data). - # - data://meadow/gcp/2023-12-12/global_carbon_budget: - - snapshot://gcp/2023-12-12/global_carbon_budget_fossil_co2_emissions.csv - - snapshot://gcp/2023-12-12/global_carbon_budget_global_emissions.xlsx - - snapshot://gcp/2023-12-12/global_carbon_budget_national_emissions.xlsx - - snapshot://gcp/2023-12-12/global_carbon_budget_land_use_change_emissions.xlsx - # # RFF - World Carbon Pricing (2022-09-14). 
# data://meadow/rff/2023-10-19/world_carbon_pricing: @@ -79,50 +59,90 @@ steps: data://grapher/emissions/2023-11-06/global_warming_potential_factors: - data://garden/emissions/2023-11-06/global_warming_potential_factors # - # Jones et al. - National contributions to climate change. + # GCP - Global Carbon Budget. + # + data://meadow/gcp/2024-11-21/global_carbon_budget: + - snapshot://gcp/2024-11-21/global_carbon_budget_fossil_co2_emissions.csv + - snapshot://gcp/2024-11-21/global_carbon_budget_land_use_change_emissions.xlsx + - snapshot://gcp/2024-11-21/global_carbon_budget_global_emissions.xlsx + - snapshot://gcp/2024-11-21/global_carbon_budget_national_emissions.xlsx # - data://meadow/emissions/2024-04-08/national_contributions: - - snapshot://emissions/2024-04-08/national_contributions_temperature_response.csv - - snapshot://emissions/2024-04-08/national_contributions_cumulative_emissions.csv - - snapshot://emissions/2024-04-08/national_contributions_annual_emissions.csv - data://garden/emissions/2024-04-08/national_contributions: - - data://meadow/emissions/2024-04-08/national_contributions - - data://garden/demography/2023-03-31/population - - data://garden/wb/2024-03-11/income_groups + # GCP - Global Carbon Budget. + # + data://garden/gcp/2024-11-21/global_carbon_budget: + - data://garden/demography/2024-07-15/population + - data://meadow/gcp/2024-11-21/global_carbon_budget - data://garden/regions/2023-01-01/regions - data://grapher/emissions/2024-04-08/national_contributions: - - data://garden/emissions/2024-04-08/national_contributions + - data://garden/wb/2024-07-29/income_groups + - data://garden/energy/2024-06-20/primary_energy_consumption + - data://garden/ggdc/2024-04-26/maddison_project_database + # + # Decoupling of GDP and CO2 (2023). + # + data://garden/emissions/2024-11-21/gdp_and_co2_decoupling: + - data://garden/worldbank_wdi/2024-05-20/wdi + - data://garden/gcp/2024-11-21/global_carbon_budget + # + # GCP - Global Carbon Budget. + # + data://grapher/gcp/2024-11-21/global_carbon_budget: + - data://garden/gcp/2024-11-21/global_carbon_budget # # Emissions - CO2 dataset. # - export://github/co2_data/latest/owid_co2: - - data://garden/ggdc/2024-04-26/maddison_project_database - - data://garden/demography/2023-03-31/population + data://garden/emissions/2024-11-21/owid_co2: + - data://garden/demography/2024-07-15/population + - data://garden/emissions/2024-11-21/national_contributions + - data://garden/gcp/2024-11-21/global_carbon_budget - data://garden/regions/2023-01-01/regions - - data://garden/emissions/2024-04-08/national_contributions - - data://garden/gcp/2024-06-20/global_carbon_budget - - data://garden/climate_watch/2023-10-31/emissions_by_sector - data://garden/energy/2024-06-20/primary_energy_consumption + # - data://garden/climate_watch/2024-11-21/emissions_by_sector + - data://garden/ggdc/2024-04-26/maddison_project_database # - # GCP - Global Carbon Budget (published on 2023-12-05, updated on 2024-06-20 to use the latest primary energy data). + # Emissions - CO2 dataset. # - data://garden/gcp/2024-06-20/global_carbon_budget: - - data://garden/ggdc/2024-04-26/maddison_project_database - - data://meadow/gcp/2023-12-12/global_carbon_budget - - data://garden/demography/2023-03-31/population + export://github/co2_data/latest/owid_co2: + - data://garden/emissions/2024-11-21/owid_co2 + # + # Emissions - CO2 dataset. + # + export://s3/co2_data/latest/owid_co2: + - data://garden/emissions/2024-11-21/owid_co2 + # + # Climate Watch - Greenhouse gas emissions by sector. 
+ # + data://meadow/climate_watch/2024-11-21/emissions_by_sector: + - snapshot://climate_watch/2024-11-21/emissions_by_sector.gz + # + # Climate Watch - Greenhouse gas emissions by sector. + # + data://garden/climate_watch/2024-11-21/emissions_by_sector: + - data://meadow/climate_watch/2024-11-21/emissions_by_sector - data://garden/regions/2023-01-01/regions - - data://garden/wb/2024-03-11/income_groups - - data://garden/energy/2024-06-20/primary_energy_consumption + - data://garden/wb/2024-07-29/income_groups + - data://garden/demography/2024-07-15/population # - # GCP - Global Carbon Budget. + # Climate Watch - Greenhouse gas emissions by sector. # - data://grapher/gcp/2024-06-20/global_carbon_budget: - - data://garden/gcp/2024-06-20/global_carbon_budget + data://grapher/climate_watch/2024-11-21/emissions_by_sector: + - data://garden/climate_watch/2024-11-21/emissions_by_sector # - # Decoupling of GDP and CO2 (2023). + # Jones et al. - National contributions to climate change. # - # The data from the following step will be used in this static chart: - # https://drive.google.com/file/d/1PflfQpr4mceVWRSGEqMP6Gbo1tFQZzOp/view?usp=sharing - data://garden/emissions/2024-06-20/gdp_and_co2_decoupling: - - data://garden/worldbank_wdi/2024-05-20/wdi - - data://garden/gcp/2024-06-20/global_carbon_budget + data://meadow/emissions/2024-11-21/national_contributions: + - snapshot://emissions/2024-11-21/national_contributions_temperature_response.csv + - snapshot://emissions/2024-11-21/national_contributions_cumulative_emissions.csv + - snapshot://emissions/2024-11-21/national_contributions_annual_emissions.csv + # + # Jones et al. - National contributions to climate change. + # + data://garden/emissions/2024-11-21/national_contributions: + - data://meadow/emissions/2024-11-21/national_contributions + - data://garden/demography/2024-07-15/population + - data://garden/wb/2024-07-29/income_groups + - data://garden/regions/2023-01-01/regions + # + # Jones et al. - National contributions to climate change. + # + data://grapher/emissions/2024-11-21/national_contributions: + - data://garden/emissions/2024-11-21/national_contributions diff --git a/dag/energy.yml b/dag/energy.yml index d7e02417a1f..f9d129f15d5 100644 --- a/dag/energy.yml +++ b/dag/energy.yml @@ -7,19 +7,6 @@ steps: data://garden/papers/2023-12-12/smil_2017: - data://meadow/papers/2023-12-12/smil_2017 # - # IRENA - Renewable electricity capacity. - # - data://meadow/irena/2023-12-12/renewable_electricity_capacity: - - snapshot://irena/2023-12-12/renewable_electricity_capacity_and_generation.xlsm - data://garden/irena/2023-12-12/renewable_electricity_capacity: - - data://meadow/irena/2023-12-12/renewable_electricity_capacity - - data://garden/regions/2023-01-01/regions - - data://garden/wb/2023-04-30/income_groups - data://grapher/irena/2023-12-12/renewable_electricity_capacity_by_technology: - - data://garden/irena/2023-12-12/renewable_electricity_capacity - data://grapher/irena/2023-12-12/renewable_electricity_capacity: - - data://garden/irena/2023-12-12/renewable_electricity_capacity - # # IRENA - Renewable energy patents. # data://meadow/irena/2023-12-12/renewable_energy_patents: @@ -192,30 +179,103 @@ steps: export://multidim/energy/latest/energy: - grapher://grapher/energy/2024-06-20/primary_energy_consumption # - # IRENA - Renewable power generation costs. + # IRENA - Renewable electricity capacity. 
+ # + data://meadow/irena/2024-11-01/renewable_capacity_statistics: + - snapshot://irena/2024-11-01/renewable_capacity_statistics.xlsx + # + # IRENA - Renewable electricity capacity. # - data://meadow/irena/2024-10-29/renewable_power_generation_costs: - - snapshot://irena/2024-10-29/renewable_power_generation_costs.xlsx + data://garden/irena/2024-11-01/renewable_capacity_statistics: + - data://meadow/irena/2024-11-01/renewable_capacity_statistics + - data://garden/regions/2023-01-01/regions + - data://garden/wb/2024-07-29/income_groups + # + # IRENA - Renewable electricity capacity. + # + data://grapher/irena/2024-11-01/renewable_capacity_statistics_by_technology: + - data://garden/irena/2024-11-01/renewable_capacity_statistics + # + # IRENA - Renewable electricity capacity. + # + data://grapher/irena/2024-11-01/renewable_capacity_statistics: + - data://garden/irena/2024-11-01/renewable_capacity_statistics # # IRENA - Renewable power generation costs. # - data://garden/irena/2024-10-29/renewable_power_generation_costs: - - data://meadow/irena/2024-10-29/renewable_power_generation_costs + data://meadow/irena/2024-11-15/renewable_power_generation_costs: + - snapshot://irena/2024-11-15/renewable_power_generation_costs.xlsx # # IRENA - Renewable power generation costs. # - data://grapher/irena/2024-10-29/renewable_power_generation_costs: - - data://garden/irena/2024-10-29/renewable_power_generation_costs + data://garden/irena/2024-11-15/renewable_power_generation_costs: + - data://meadow/irena/2024-11-15/renewable_power_generation_costs # # Energy - Photovoltaic cost and capacity. # - data://garden/energy/2024-10-30/photovoltaic_cost_and_capacity: + data://garden/energy/2024-11-15/photovoltaic_cost_and_capacity: - data://garden/papers/2023-12-12/farmer_lafond_2016 - - data://garden/irena/2023-12-12/renewable_electricity_capacity - data://garden/papers/2023-12-12/nemet_2009 - - data://garden/irena/2024-10-29/renewable_power_generation_costs + - data://garden/irena/2024-11-15/renewable_power_generation_costs + - data://garden/irena/2024-11-01/renewable_capacity_statistics + # + # IRENA - Renewable power generation costs. + # + data://grapher/irena/2024-11-15/renewable_power_generation_costs: + - data://garden/irena/2024-11-15/renewable_power_generation_costs # # Energy - Photovoltaic cost and capacity. 
# - data://grapher/energy/2024-10-30/photovoltaic_cost_and_capacity: - - data://garden/energy/2024-10-30/photovoltaic_cost_and_capacity + data://grapher/energy/2024-11-15/photovoltaic_cost_and_capacity: + - data://garden/energy/2024-11-15/photovoltaic_cost_and_capacity + # + # Eurostat - Energy statistics, prices of natural gas and electricity + # + data://meadow/eurostat/2024-11-05/gas_and_electricity_prices: + - snapshot://eurostat/2024-11-05/gas_and_electricity_prices.zip + # + # Eurostat - Energy statistics, prices of natural gas and electricity + # + data://garden/eurostat/2024-11-05/gas_and_electricity_prices: + - data://meadow/eurostat/2024-11-05/gas_and_electricity_prices + # + # Ember - European wholesale electricity prices + # + data://meadow/ember/2024-11-20/european_wholesale_electricity_prices: + - snapshot://ember/2024-11-20/european_wholesale_electricity_prices.csv + # + # Ember - European wholesale electricity prices + # + data://garden/ember/2024-11-20/european_wholesale_electricity_prices: + - data://meadow/ember/2024-11-20/european_wholesale_electricity_prices + # + # IEA - Fossil fuel subsidies + # + data://meadow/iea/2024-11-20/fossil_fuel_subsidies: + - snapshot://iea/2024-11-20/fossil_fuel_subsidies.xlsx + # + # IEA - Fossil fuel subsidies + # + data://garden/iea/2024-11-20/fossil_fuel_subsidies: + - data://meadow/iea/2024-11-20/fossil_fuel_subsidies + # + # IEA - Fossil fuel subsidies + # + data://grapher/iea/2024-11-20/fossil_fuel_subsidies: + - data://garden/iea/2024-11-20/fossil_fuel_subsidies + # + # Energy prices + # + data://garden/energy/2024-11-20/energy_prices: + - data://garden/eurostat/2024-11-05/gas_and_electricity_prices + - data://garden/ember/2024-11-20/european_wholesale_electricity_prices + # + # Energy prices + # + data://grapher/energy/2024-11-20/energy_prices: + - data://garden/energy/2024-11-20/energy_prices + # + # Energy prices explorer + # + export://multidim/energy/latest/energy_prices: + - data://grapher/energy/2024-11-20/energy_prices diff --git a/dag/fasttrack.yml b/dag/fasttrack.yml index d7837ded158..2c0f8c90876 100644 --- a/dag/fasttrack.yml +++ b/dag/fasttrack.yml @@ -66,8 +66,6 @@ steps: - snapshot://fasttrack/2023-06-16/guinea_worm.csv data://grapher/fasttrack/2023-06-28/guinea_worm: - snapshot://fasttrack/2023-06-28/guinea_worm.csv - data://grapher/fasttrack/2023-06-19/world_population_comparison: - - snapshot://fasttrack/2023-06-19/world_population_comparison.csv data-private://grapher/fasttrack/latest/fiscal_top1_shares_country_standardized: - snapshot-private://fasttrack/latest/fiscal_top1_shares_country_standardized.csv data://grapher/fasttrack/2023-04-30/paratz: @@ -76,8 +74,6 @@ steps: - snapshot-private://fasttrack/latest/pain_hours_hen_systems.csv data-private://grapher/fasttrack/latest/antibiotic_usage_livestock: - snapshot-private://fasttrack/latest/antibiotic_usage_livestock.csv - data-private://grapher/fasttrack/latest/antimicrobial_usage_livestock: - - snapshot-private://fasttrack/latest/antimicrobial_usage_livestock.csv data://grapher/fasttrack/2023-08-07/pain_hours_days_hen_systems: - snapshot://fasttrack/2023-08-07/pain_hours_days_hen_systems.csv data-private://grapher/fasttrack/latest/historical_france_mortality_cause: @@ -96,8 +92,6 @@ steps: - snapshot://fasttrack/latest/transport_co2_emissions_modes.csv data://grapher/fasttrack/latest/democracy_freedom_house: - snapshot://fasttrack/latest/democracy_freedom_house.csv - data://grapher/fasttrack/latest/democracy_lexical_index: - - 
snapshot://fasttrack/latest/democracy_lexical_index.csv data://grapher/fasttrack/latest/global_maternal_offspring_loss: - snapshot://fasttrack/latest/global_maternal_offspring_loss.csv data://grapher/fasttrack/latest/under_five_mortality_lmics: @@ -240,3 +234,9 @@ steps: - snapshot://fasttrack/latest/pepfar_patients_receiving_art.csv data://grapher/fasttrack/latest/simon_ehrlich_wager: - snapshot://fasttrack/latest/simon_ehrlich_wager.csv + data://grapher/fasttrack/latest/useful_energy_cost_way: + - snapshot://fasttrack/latest/useful_energy_cost_way.csv + data://grapher/fasttrack/2023-06-19/world_population_comparison: + - snapshot://fasttrack/2023-06-19/world_population_comparison.csv + data://grapher/fasttrack/latest/antimicrobial_usage_livestock: + - snapshot://fasttrack/latest/antimicrobial_usage_livestock.csv diff --git a/dag/health.yml b/dag/health.yml index a843ce82182..2e47cb03825 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -39,7 +39,6 @@ steps: data://grapher/who/2023-03-09/gho_suicides: - data://garden/who/2023-03-09/gho_suicides - # IHME Global Burden of Disease - Risk factors data://meadow/ihme_gbd/2019/gbd_risk: - walden://ihme_gbd/2019/gbd_risk @@ -48,8 +47,6 @@ steps: data://grapher/ihme_gbd/2019/gbd_risk: - data://garden/ihme_gbd/2019/gbd_risk - - # IHME GBD Leading cause of deaths - update data-private://meadow/ihme_gbd/2024-06-10/cause_hierarchy: - snapshot-private://ihme_gbd/2024-06-10/cause_hierarchy.csv @@ -192,7 +189,6 @@ steps: data://grapher/un/2023-08-10/comtrade_pandemics: - data://garden/un/2023-08-02/comtrade_pandemics - # UNAIDS data://meadow/health/2023-08-09/unaids: - snapshot://health/2023-08-09/unaids.csv @@ -232,7 +228,6 @@ steps: data://grapher/oecd/2024-07-01/road_accidents: - data://garden/oecd/2024-07-01/road_accidents - # Kucharski data://meadow/health/2023-08-14/avian_influenza_h5n1_kucharski: - snapshot://health/2023-08-14/avian_influenza_h5n1_kucharski.xlsx @@ -577,7 +572,6 @@ steps: data-private://grapher/ihme_gbd/2024-05-20/gbd_mental_health_burden_dalys: - data-private://garden/ihme_gbd/2024-05-20/gbd_mental_health_burden - # GBD 2021 - GBD Risk Factors data-private://meadow/ihme_gbd/2024-05-20/gbd_risk: - snapshot-private://ihme_gbd/2024-05-20/gbd_risk.feather @@ -598,7 +592,6 @@ steps: data-private://grapher/ihme_gbd/2024-05-20/gbd_drug_risk: - data-private://garden/ihme_gbd/2024-05-20/gbd_drug_risk - # GBD 2021 - GBD Child Mortality data-private://meadow/ihme_gbd/2024-05-20/gbd_child_mortality: - snapshot-private://ihme_gbd/2024-05-20/gbd_child_mortality.feather @@ -609,7 +602,6 @@ steps: data-private://grapher/ihme_gbd/2024-05-20/gbd_child_mortality: - data-private://garden/ihme_gbd/2024-05-20/gbd_child_mortality - # GBD 2021 - GBD Health-adjusted Life Expectancy and Life Expectancy data-private://meadow/ihme_gbd/2024-07-02/gbd_life_expectancy: - snapshot-private://ihme_gbd/2024-07-02/gbd_life_expectancy.zip @@ -715,7 +707,6 @@ steps: data://grapher/who/2024-08-06/mortality_database_cancer_most_common: - data://garden/who/2024-08-06/mortality_database_cancer_most_common - data://meadow/who/latest/monkeypox: - snapshot://who/latest/monkeypox.csv data://garden/who/latest/monkeypox: @@ -729,7 +720,7 @@ steps: - data://garden/who/latest/monkeypox export://github/who/latest/monkeypox: - data://garden/who/latest/monkeypox -# Mpox - Global.health + # Mpox - Global.health data://meadow/health/latest/global_health_mpox: - snapshot://health/latest/global_health_mpox.csv data://garden/health/latest/global_health_mpox: @@ -737,7 +728,7 @@ steps: 
# Eurostat cancer - # Eurostat Cancer Screening + # Eurostat Cancer Screening data://meadow/health/2024-08-23/eurostat_cancer: - snapshot://health/2024-08-23/eurostat_cancer.csv data://garden/health/2024-08-23/eurostat_cancer: @@ -745,12 +736,10 @@ steps: data://grapher/health/2024-08-23/eurostat_cancer: - data://garden/health/2024-08-23/eurostat_cancer - # Multi-dim indicators export://multidim/health/latest/causes_of_death: - grapher://grapher/ihme_gbd/2024-05-20/gbd_cause - # GBD 2021 - GBD Risk Factors cancer specific data-private://meadow/ihme_gbd/2024-08-26/gbd_risk_cancer: - snapshot-private://ihme_gbd/2024-08-26/gbd_risk_cancer.feather @@ -775,7 +764,6 @@ steps: data://grapher/health/2024-09-05/seattle_pathogens: - data://garden/health/2024-09-05/seattle_pathogens - # International Agency for Research on Cancer data://meadow/cancer/2024-08-30/gco_alcohol: - snapshot://cancer/2024-08-30/gco_alcohol.csv @@ -791,13 +779,13 @@ steps: data://grapher/cancer/2024-09-06/gco_infections: - data://garden/cancer/2024-09-06/gco_infections -# Flu testing data + # Flu testing data data://garden/who/2024-09-09/flu_test: - data://meadow/who/latest/flunet data://grapher/who/2024-09-09/flu_test: - data://garden/who/2024-09-09/flu_test -# Cancer diagnosis routes and survival rates + # Cancer diagnosis routes and survival rates data://meadow/cancer/2024-09-13/diagnosis_routes_by_route: - snapshot://cancer/2024-09-13/diagnosis_routes_by_route.csv data://garden/cancer/2024-09-13/diagnosis_routes_by_route: @@ -824,6 +812,8 @@ steps: - snapshot://antibiotics/2024-10-09/gram.csv data://garden/antibiotics/2024-10-09/gram: - data://meadow/antibiotics/2024-10-09/gram + - data://garden/demography/2024-07-15/population + - data://garden/regions/2023-01-01/regions data://grapher/antibiotics/2024-10-09/gram: - data://garden/antibiotics/2024-10-09/gram @@ -832,6 +822,8 @@ steps: - snapshot://antibiotics/2024-10-09/gram_level.csv data://garden/antibiotics/2024-10-09/gram_level: - data://meadow/antibiotics/2024-10-09/gram_level + - data://garden/demography/2024-07-15/population + - data://garden/regions/2023-01-01/regions data://grapher/antibiotics/2024-10-09/gram_level: - data://garden/antibiotics/2024-10-09/gram_level @@ -843,7 +835,6 @@ steps: data://grapher/antibiotics/2024-10-09/gram_children: - data://garden/antibiotics/2024-10-09/gram_children - # Cervical cancer incidence rates GCO - Cancer Over Time data://meadow/cancer/2024-10-13/gco_cancer_over_time_cervical: - snapshot://cancer/2024-10-13/gco_cancer_over_time_cervical.csv @@ -860,6 +851,21 @@ steps: data://grapher/cancer/2024-10-13/gco_cancer_today_cervical: - data://garden/cancer/2024-10-13/gco_cancer_today_cervical + # Antibiotic resistance data - WHO GLASS + data://meadow/antibiotics/2024-10-18/who_glass: + - snapshot://antibiotics/2024-10-18/who_glass.zip + data://garden/antibiotics/2024-10-18/who_glass: + - data://meadow/antibiotics/2024-10-18/who_glass + data://grapher/antibiotics/2024-10-18/who_glass: + - data://garden/antibiotics/2024-10-18/who_glass + + # Antibiotic resistance data - WHO GLASS + data://meadow/antibiotics/2024-10-18/who_glass_by_antibiotic: + - snapshot://antibiotics/2024-10-18/who_glass_by_antibiotic.zip + data://garden/antibiotics/2024-10-18/who_glass_by_antibiotic: + - data://meadow/antibiotics/2024-10-18/who_glass_by_antibiotic + data://grapher/antibiotics/2024-10-18/who_glass_by_antibiotic: + - data://garden/antibiotics/2024-10-18/who_glass_by_antibiotic # ANIMUSE - antibiotic use in animals 
data://meadow/antibiotics/2024-10-23/animuse_year: @@ -869,7 +875,6 @@ steps: data://grapher/antibiotics/2024-10-23/animuse_year: - data://garden/antibiotics/2024-10-23/animuse_year - # ESVAC antimicrobial use in animals data://meadow/antibiotics/2024-10-25/esvac_sales: - snapshot://antibiotics/2024-10-25/esvac_sales.zip @@ -893,3 +898,108 @@ steps: - data://meadow/antibiotics/2024-10-23/tracss data://grapher/antibiotics/2024-10-23/tracss: - data://garden/antibiotics/2024-10-23/tracss + # WHO Antimicrobial usage + data://meadow/antibiotics/2024-11-12/antimicrobial_usage: + - snapshot://antibiotics/2024-11-12/antimicrobial_usage.xlsx + data://garden/antibiotics/2024-11-12/antimicrobial_usage: + - data://meadow/antibiotics/2024-11-12/antimicrobial_usage + data://grapher/antibiotics/2024-11-12/antimicrobial_usage: + - data://garden/antibiotics/2024-11-12/antimicrobial_usage + # WHO Antibiotic testing coverage + data://meadow/antibiotics/2024-11-15/testing_coverage: + - snapshot://antibiotics/2024-11-15/testing_coverage.zip + data://garden/antibiotics/2024-11-15/testing_coverage: + - data://meadow/antibiotics/2024-11-15/testing_coverage + data://grapher/antibiotics/2024-11-15/testing_coverage: + - data://garden/antibiotics/2024-11-15/testing_coverage + + # IHME Neonatal all infectious syndromes + data-private://meadow/antibiotics/2024-11-20/microbe: + - snapshot-private://antibiotics/2024-11-20/microbe.zip + data-private://garden/antibiotics/2024-11-20/microbe: + - data-private://meadow/antibiotics/2024-11-20/microbe + data-private://grapher/antibiotics/2024-11-20/microbe: + - data-private://garden/antibiotics/2024-11-20/microbe + # IHME Neonatal bloodstream infections by pathogen + data-private://meadow/antibiotics/2024-11-20/pathogen_bloodstream: + - snapshot-private://antibiotics/2024-11-20/pathogen_bloodstream.csv + data-private://garden/antibiotics/2024-11-20/pathogen_bloodstream: + - data-private://meadow/antibiotics/2024-11-20/pathogen_bloodstream + + # IHME Neonatal infections by syndrome + data-private://meadow/antibiotics/2024-11-24/total_syndrome: + - snapshot-private://antibiotics/2024-11-24/total_syndrome.csv + # IHME Neonatal infections by syndrome and amr resistance + data-private://meadow/antibiotics/2024-12-02/microbe_amr: + - snapshot-private://antibiotics/2024-12-02/microbe_amr.csv + data-private://garden/antibiotics/2024-12-02/microbe_amr: + - data-private://meadow/antibiotics/2024-12-02/microbe_amr + - data-private://meadow/antibiotics/2024-11-24/total_syndrome + data-private://grapher/antibiotics/2024-12-02/microbe_amr: + - data-private://garden/antibiotics/2024-12-02/microbe_amr + # IHME Neonatal infections and amr resistance + data-private://meadow/antibiotics/2024-12-02/microbe_neonatal_amr: + - snapshot-private://antibiotics/2024-12-02/microbe_neonatal_amr.csv + data-private://garden/antibiotics/2024-12-02/microbe_neonatal_amr: + - data-private://meadow/antibiotics/2024-12-02/microbe_neonatal_amr + - data-private://meadow/antibiotics/2024-11-20/microbe + data-private://grapher/antibiotics/2024-12-02/microbe_neonatal_amr: + - data-private://garden/antibiotics/2024-12-02/microbe_neonatal_amr + + # MICROBE - total deaths by pathogen + data-private://meadow/antibiotics/2024-12-02/total_pathogen_bloodstream: + - snapshot-private://antibiotics/2024-12-02/total_pathogen_bloodstream.csv + data-private://garden/antibiotics/2024-12-02/total_pathogen_bloodstream: + - data-private://meadow/antibiotics/2024-12-02/total_pathogen_bloodstream + + # MICROBE - total deaths by pathogen 
and amr resistance + data-private://meadow/antibiotics/2024-12-02/total_pathogen_bloodstream_amr: + - snapshot-private://antibiotics/2024-12-02/total_pathogen_bloodstream_amr.csv + data-private://garden/antibiotics/2024-12-02/total_pathogen_bloodstream_amr: + - data-private://garden/antibiotics/2024-12-02/total_pathogen_bloodstream + - data-private://meadow/antibiotics/2024-12-02/total_pathogen_bloodstream_amr + data-private://grapher/antibiotics/2024-12-02/total_pathogen_bloodstream_amr: + - data-private://garden/antibiotics/2024-12-02/total_pathogen_bloodstream_amr + # WHO GLASS Enrolment + data://meadow/antibiotics/2024-12-03/glass_enrolment: + - snapshot://antibiotics/2024-12-03/glass_enrolment.xlsx + data://garden/antibiotics/2024-12-03/glass_enrolment: + - data://meadow/antibiotics/2024-12-03/glass_enrolment + data://grapher/antibiotics/2024-12-03/glass_enrolment: + - data://garden/antibiotics/2024-12-03/glass_enrolment + # MICROBE - total deaths by pathogen + data-private://meadow/antibiotics/2024-12-04/microbe_total_pathogens: + - snapshot-private://antibiotics/2024-12-04/microbe_total_pathogens.csv + data-private://garden/antibiotics/2024-12-04/microbe_total_pathogens: + - data-private://meadow/antibiotics/2024-12-04/microbe_total_pathogens + # MICROBE - total deaths by pathogen and amr resistance + data-private://meadow/antibiotics/2024-12-04/microbe_total_pathogens_amr: + - snapshot-private://antibiotics/2024-12-04/microbe_total_pathogens_amr.csv + data-private://garden/antibiotics/2024-12-04/microbe_total_pathogens_amr: + - data-private://meadow/antibiotics/2024-12-04/microbe_total_pathogens_amr + - data-private://garden/antibiotics/2024-12-04/microbe_total_pathogens + data-private://grapher/antibiotics/2024-12-04/microbe_total_pathogens_amr: + - data-private://garden/antibiotics/2024-12-04/microbe_total_pathogens_amr + + # MICROBE - neonatal deaths by pathogen and amr resistance + data-private://meadow/antibiotics/2024-12-05/microbe_neonatal_total_amr: + - snapshot-private://antibiotics/2024-12-05/microbe_neonatal_total_amr.csv + data-private://garden/antibiotics/2024-12-05/microbe_neonatal_total_amr: + - data-private://meadow/antibiotics/2024-12-05/microbe_neonatal_total_amr + - data-private://garden/antibiotics/2024-11-20/microbe + data-private://grapher/antibiotics/2024-12-05/microbe_neonatal_total_amr: + - data-private://garden/antibiotics/2024-12-05/microbe_neonatal_total_amr + # MICROBE - total deaths by syndrome + data-private://meadow/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome: + - snapshot-private://antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.csv + data-private://garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome: + - data-private://meadow/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome + + # MICROBE - total deaths by syndrome and amr resistance + data-private://meadow/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr: + - snapshot-private://antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.csv + data-private://garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr: + - data-private://meadow/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr + - data-private://garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome + data-private://grapher/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr: + - data-private://garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr diff --git a/dag/main.yml b/dag/main.yml index 49fe7781e0b..99a5ec05481 100644 --- 
a/dag/main.yml +++ b/dag/main.yml @@ -83,7 +83,7 @@ steps: data://grapher/worldbank_wdi/2022-05-26/wdi: - data://garden/worldbank_wdi/2022-05-26/wdi - # Homicide - UNODC - current + # Homicide - UNODC - to archive data://meadow/homicide/2023-07-04/unodc: - snapshot://homicide/2023-07-04/unodc.xlsx data://garden/homicide/2023-07-04/unodc: @@ -91,6 +91,15 @@ steps: data://grapher/homicide/2023-07-04/unodc: - data://garden/homicide/2023-07-04/unodc + # Homicide - UNODC - update + data://meadow/homicide/2024-10-30/unodc: + - snapshot://homicide/2024-10-30/unodc.xlsx + data://garden/homicide/2024-10-30/unodc: + - data://meadow/homicide/2024-10-30/unodc + - data://garden/demography/2023-03-31/population + data://grapher/homicide/2024-10-30/unodc: + - data://garden/homicide/2024-10-30/unodc + # # UN data://grapher/un/2022-07-11/un_wpp: @@ -121,11 +130,6 @@ steps: - data://garden/regions/2023-01-01/regions data://grapher/technology/2022/internet: - data://garden/technology/2022/internet - # HMD - data://meadow/hmd/2022-12-07/life_tables: - - snapshot://hmd/2022-12-07/hmd.zip - data://garden/hmd/2022-12-07/life_tables: - - data://meadow/hmd/2022-12-07/life_tables # UNDP data://meadow/un/2024-04-09/undp_hdr: @@ -270,14 +274,6 @@ steps: data://grapher/eth/2023-03-15/ethnic_power_relations: - data://garden/eth/2023-03-15/ethnic_power_relations - # International Monetary Fund, World Economic Outlook - data://meadow/imf/2024-05-02/world_economic_outlook: - - snapshot://imf/2024-05-02/world_economic_outlook.xls - data://garden/imf/2024-05-02/world_economic_outlook: - - data://meadow/imf/2024-05-02/world_economic_outlook - data://grapher/imf/2024-05-02/world_economic_outlook: - - data://garden/imf/2024-05-02/world_economic_outlook - # Patents & journal articles (World Bank, United Nations) data://garden/research_development/2024-05-20/patents_articles: - data://garden/worldbank_wdi/2024-05-20/wdi @@ -319,8 +315,6 @@ steps: data://grapher/terrorism/2023-07-20/global_terrorism_database: - data://garden/terrorism/2023-07-20/global_terrorism_database - - # Colonial Dates Dataset (COLDAT) data://meadow/harvard/2023-09-18/colonial_dates_dataset: - snapshot://harvard/2023-09-18/colonial_dates_dataset.csv @@ -353,7 +347,6 @@ steps: data://garden/gapminder/2023-09-21/under_five_mortality: - data://meadow/gapminder/2023-09-21/under_five_mortality - # Older vintage UN IGME (with longer time-series) data://meadow/un/2018/igme: - snapshot://un/2018/igme.csv @@ -379,11 +372,6 @@ steps: - data://garden/countries/2024-08-27/gleditsch # Outdated gleditsch - data://garden/countries/2023-09-25/gleditsch: - - data://meadow/countries/2023-09-25/gleditsch - - data://garden/demography/2023-03-31/population - data://grapher/countries/2023-10-01/gleditsch: - - data://garden/countries/2023-09-25/gleditsch data://garden/countries/2024-01-08/gleditsch_no_population: - data://meadow/countries/2023-09-25/gleditsch @@ -486,8 +474,6 @@ steps: - data://garden/regions/2023-01-01/regions data://garden/ess/2023-08-02/ess_trust: - data://meadow/ess/2023-08-02/ess_trust - data://grapher/ess/2023-08-02/ess_trust: - - data://garden/ess/2023-08-02/ess_trust # Latinobarómetro survey - Trust data://meadow/survey/2023-08-04/latinobarometro_trust: @@ -653,6 +639,8 @@ steps: - snapshot://worldbank_wdi/2024-05-20/wdi.zip - data://meadow/worldbank_wdi/2024-05-20/wdi - data://garden/demography/2024-07-15/population + - data://garden/regions/2023-01-01/regions + - data://garden/wb/2024-07-29/income_groups data://grapher/worldbank_wdi/2024-05-20/wdi: - 
data://garden/worldbank_wdi/2024-05-20/wdi @@ -766,17 +754,18 @@ steps: data-private://grapher/language/2024-07-17/ethnologue: - data-private://garden/language/2024-07-17/ethnologue -# Child Mortality Estimates - UN IGME + # Child Mortality Estimates - UN IGME data://meadow/un/2024-09-11/igme: - snapshot://un/2024-09-11/igme.zip data://garden/un/2024-09-11/igme: - data://meadow/un/2024-09-11/igme - data://garden/un/2018/igme + - data://garden/regions/2023-01-01/regions + - data://garden/demography/2024-07-15/population data://grapher/un/2024-09-11/igme: - data://garden/un/2024-09-11/igme - -# Long-run child mortality, Gapminder + UN IGME + # Long-run child mortality, Gapminder + UN IGME data://garden/un/2024-09-16/long_run_child_mortality: - data://garden/un/2024-09-11/igme - data://garden/gapminder/2023-09-18/under_five_mortality @@ -784,8 +773,7 @@ steps: data://grapher/un/2024-09-16/long_run_child_mortality: - data://garden/un/2024-09-16/long_run_child_mortality - -# UN SDG (2024) + # UN SDG (2024) data://meadow/un/2024-08-27/un_sdg: - snapshot://un/2024-08-27/un_sdg.feather data://garden/un/2024-08-27/un_sdg: @@ -795,7 +783,7 @@ steps: data://grapher/un/2024-08-27/un_sdg: - data://garden/un/2024-08-27/un_sdg -# OECD Official Development Assistance (ODA) + # OECD Official Development Assistance (ODA) data://meadow/oecd/2024-08-21/official_development_assistance: - snapshot://oecd/2024-08-21/official_development_assistance_dac1.zip - snapshot://oecd/2024-08-21/official_development_assistance_dac2a.zip @@ -807,7 +795,7 @@ steps: data://grapher/oecd/2024-08-21/official_development_assistance: - data://garden/oecd/2024-08-21/official_development_assistance -# Oil Spills + # Oil Spills data://meadow/itopf/2024-10-16/oil_spills: - snapshot://itopf/2024-10-16/oil_spills.pdf data://garden/itopf/2024-10-16/oil_spills: @@ -815,7 +803,15 @@ steps: data://grapher/itopf/2024-10-16/oil_spills: - data://garden/itopf/2024-10-16/oil_spills -# World Peace Foundation - Famines + # UN SD census data + data://meadow/un/2024-10-21/census_dates: + - snapshot://un/2024-10-21/census_dates.csv + data://garden/un/2024-10-21/census_dates: + - data://meadow/un/2024-10-21/census_dates + data://grapher/un/2024-10-21/census_dates: + - data://garden/un/2024-10-21/census_dates + + # World Peace Foundation - Famines data://meadow/wpf/2024-10-03/famines: - snapshot://wpf/2024-10-03/famines.xlsx data://garden/wpf/2024-10-03/famines: @@ -847,6 +843,38 @@ steps: data://grapher/wpf/2024-10-03/famines_by_place: - data://garden/wpf/2024-10-03/famines_by_place + data-private://meadow/owid/latest/ig_countries: + - snapshot-private://owid/latest/ig_countries.csv + data-private://garden/owid/latest/ig_countries: + - data-private://meadow/owid/latest/ig_countries + data-private://grapher/owid/latest/ig_countries: + - data-private://garden/owid/latest/ig_countries + +# Migration distances + data://garden/migration/2024-11-19/migration_distance: + - data://garden/un/2024-07-16/migrant_stock + - data://garden/geography/2023-11-28/nat_earth_110 + data://grapher/migration/2024-11-19/migration_distance: + - data://garden/migration/2024-11-19/migration_distance + + # Migration between regions, based on UN DESA flows + data://garden/migration/2024-11-18/migration_between_regions: + - data://garden/un/2024-07-16/migrant_stock + - data://garden/regions/2023-01-01/regions + - data://garden/wb/2024-07-29/income_groups + + data://grapher/migration/2024-11-18/migration_between_regions: + - 
data://garden/migration/2024-11-18/migration_between_regions + + # IMF World Economic Outlook + data://meadow/imf/2024-11-25/world_economic_outlook: + - snapshot://imf/2024-11-25/world_economic_outlook.xls + data://garden/imf/2024-11-25/world_economic_outlook: + - data://meadow/imf/2024-11-25/world_economic_outlook + data://grapher/imf/2024-11-25/world_economic_outlook: + - data://garden/imf/2024-11-25/world_economic_outlook + + + include: - dag/open_numbers.yml - dag/faostat.yml @@ -882,3 +910,4 @@ include: - dag/chartbook.yml - dag/minerals.yml - dag/tourism.yml + - dag/migration.yml diff --git a/dag/migration.yml b/dag/migration.yml new file mode 100644 index 00000000000..3167519dffa --- /dev/null +++ b/dag/migration.yml @@ -0,0 +1,44 @@ +steps: + # Migration UN DESA + data://meadow/un/2024-07-16/migrant_stock: + - snapshot://un/2024-07-16/migrant_stock_dest_origin.xlsx + - snapshot://un/2024-07-16/migrant_stock_origin.xlsx + - snapshot://un/2024-07-16/migrant_stock_dest.xlsx + - snapshot://un/2024-07-16/migrant_stock_age_sex.xlsx + data://garden/un/2024-07-16/migrant_stock: + - data://meadow/un/2024-07-16/migrant_stock + data://grapher/un/2024-07-16/migrant_stock: + - data://garden/un/2024-07-16/migrant_stock + + # Internal displacement monitoring centre + data://meadow/idmc/2024-08-02/internal_displacement: + - snapshot://idmc/2024-08-02/internal_displacement.xlsx + - data://garden/demography/2024-07-15/population + data://grapher/idmc/2024-08-02/internal_displacement: + - data://meadow/idmc/2024-08-02/internal_displacement + + # UNHCR refugee data + data://meadow/un/2024-07-25/refugee_data: + - snapshot://un/2024-07-25/refugee_data.zip + data://garden/un/2024-07-25/refugee_data: + - data://meadow/un/2024-07-25/refugee_data + - data://garden/demography/2024-07-15/population + - data://garden/un/2024-07-25/resettlement + data://grapher/un/2024-07-25/refugee_data: + - data://garden/un/2024-07-25/refugee_data + + # UNHCR resettlement data + data://meadow/un/2024-07-25/resettlement: + - snapshot://un/2024-07-25/resettlement.zip + data://garden/un/2024-07-25/resettlement: + - data://meadow/un/2024-07-25/resettlement + - data://garden/demography/2024-07-15/population + + # Child migration (UNICEF) + data://meadow/unicef/2024-07-30/child_migration: + - snapshot://unicef/2024-07-30/child_migration.csv + data://garden/unicef/2024-07-30/child_migration: + - data://meadow/unicef/2024-07-30/child_migration + - data://garden/demography/2024-07-15/population + data://grapher/unicef/2024-07-30/child_migration: + - data://garden/unicef/2024-07-30/child_migration diff --git a/dag/minerals.yml b/dag/minerals.yml index 82a8d9ec567..85df4de7416 100644 --- a/dag/minerals.yml +++ b/dag/minerals.yml @@ -68,6 +68,11 @@ steps: data://grapher/minerals/2024-07-15/minerals: - data://garden/minerals/2024-07-15/minerals # + # Minerals - Global mine production by mineral. + # + data://grapher/minerals/2024-07-15/global_mine_production_by_mineral: + - data://garden/minerals/2024-07-15/minerals # # Minerals - Minerals explorer.
# export://explorers/minerals/latest/minerals: diff --git a/dag/poverty_inequality.yml b/dag/poverty_inequality.yml index aa961d1ba89..25a1c80a547 100644 --- a/dag/poverty_inequality.yml +++ b/dag/poverty_inequality.yml @@ -60,13 +60,14 @@ steps: data://explorers/lis/latest/luxembourg_income_study: - data://garden/lis/2024-06-13/luxembourg_income_study - # Multidimensional Poverty Index - data://meadow/ophi/2023-07-05/multidimensional_poverty_index: - - snapshot://ophi/2023-07-05/multidimensional_poverty_index.csv - data://garden/ophi/2023-07-05/multidimensional_poverty_index: - - data://meadow/ophi/2023-07-05/multidimensional_poverty_index - data://grapher/ophi/2023-07-05/multidimensional_poverty_index: - - data://garden/ophi/2023-07-05/multidimensional_poverty_index + # Global Multidimensional Poverty Index + data://meadow/ophi/2024-10-28/multidimensional_poverty_index: + - snapshot://ophi/2024-10-28/multidimensional_poverty_index_cme.csv + - snapshot://ophi/2024-10-28/multidimensional_poverty_index_hot.csv + data://garden/ophi/2024-10-28/multidimensional_poverty_index: + - data://meadow/ophi/2024-10-28/multidimensional_poverty_index + data://grapher/ophi/2024-10-28/multidimensional_poverty_index: + - data://garden/ophi/2024-10-28/multidimensional_poverty_index # # OECD Income Distribution Database data://meadow/oecd/2024-04-10/income_distribution_database: @@ -113,15 +114,6 @@ steps: data://grapher/oecd/2024-04-30/affordable_housing_database: - data://garden/oecd/2024-04-30/affordable_housing_database - # Poverty projections from the World Bank - data://meadow/wb/2024-06-26/poverty_projections: - - snapshot://wb/2024-06-26/poverty_projections_number_global.csv - - snapshot://wb/2024-06-26/poverty_projections_share_regions.csv - data://garden/wb/2024-06-26/poverty_projections: - - data://meadow/wb/2024-06-26/poverty_projections - data://grapher/wb/2024-06-26/poverty_projections: - - data://garden/wb/2024-06-26/poverty_projections - # Institute of Global Homelessness - Better Data Project data://meadow/igh/2024-07-05/better_data_homelessness: - snapshot://igh/2024-07-05/better_data_homelessness.xlsx @@ -129,3 +121,11 @@ steps: - data://meadow/igh/2024-07-05/better_data_homelessness data://grapher/igh/2024-07-05/better_data_homelessness: - data://garden/igh/2024-07-05/better_data_homelessness + + # Poverty projections from the Poverty, Prosperity and Planet Report 2024 + data://meadow/wb/2024-12-03/poverty_projections: + - snapshot://wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.zip + data://garden/wb/2024-12-03/poverty_projections: + - data://meadow/wb/2024-12-03/poverty_projections + data://grapher/wb/2024-12-03/poverty_projections: + - data://garden/wb/2024-12-03/poverty_projections diff --git a/dag/urbanization.yml b/dag/urbanization.yml index c4c4fdc49f7..ecff7c1f0a8 100644 --- a/dag/urbanization.yml +++ b/dag/urbanization.yml @@ -45,17 +45,6 @@ steps: data://grapher/un/2024-01-17/urban_agglomerations_definition_count: - data://garden/un/2024-01-17/urban_agglomerations_definition_count # - # GHSL degree of urbanization. 
- # - data://meadow/urbanization/2024-01-26/ghsl_degree_of_urbanisation: - - snapshot://urbanization/2024-01-26/ghsl_degree_of_urbanisation.zip - data://garden/urbanization/2024-01-26/ghsl_degree_of_urbanisation: - - data://meadow/urbanization/2024-01-26/ghsl_degree_of_urbanisation - - data://garden/wb/2023-04-30/income_groups - - data://garden/regions/2023-01-01/regions - data://grapher/urbanization/2024-01-26/ghsl_degree_of_urbanisation: - - data://garden/urbanization/2024-01-26/ghsl_degree_of_urbanisation - # # UN SDG indicators related to urbanization. # data://meadow/un/2024-02-14/sdgs_urbanization: @@ -76,3 +65,13 @@ steps: - data://garden/regions/2023-01-01/regions data://grapher/urbanization/2024-10-14/ghsl_degree_of_urbanisation: - data://garden/urbanization/2024-10-14/ghsl_degree_of_urbanisation + + # GHSL urban centers. + data://meadow/urbanization/2024-12-02/ghsl_urban_centers: + - snapshot://urbanization/2024-12-02/ghsl_urban_centers.xlsx + data://garden/urbanization/2024-12-02/ghsl_urban_centers: + - data://meadow/urbanization/2024-12-02/ghsl_urban_centers + - data://garden/wb/2024-07-29/income_groups + - data://garden/regions/2023-01-01/regions + data://grapher/urbanization/2024-12-02/ghsl_urban_centers: + - data://garden/urbanization/2024-12-02/ghsl_urban_centers diff --git a/dag/war.yml b/dag/war.yml index b7a6d1970d0..ba73f455f24 100644 --- a/dag/war.yml +++ b/dag/war.yml @@ -47,24 +47,25 @@ steps: data://grapher/war/2024-08-26/ucdp: - data://garden/war/2024-08-26/ucdp - # UCDP (candidate data) - data://meadow/war/2024-10-02/ucdp_ced: - - snapshot://war/2024-10-02/ucdp_ced.csv - data://garden/war/2024-10-02/ucdp_monthly: - - data://garden/countries/2024-08-27/gleditsch + # UCDP (with candidate data, latest available) + data://meadow/war/2024-11-22/ucdp_ced: + - snapshot://war/2024-11-22/ucdp_ced_v24_0_10.csv + - snapshot://war/2024-11-22/ucdp_ced_v24_01_24_09.csv + data://garden/war/2024-11-22/ucdp_preview: + - data://meadow/war/2024-11-22/ucdp_ced - data://garden/demography/2024-07-15/population - data://garden/geography/2023-11-28/nat_earth_110 - data://meadow/war/2024-08-26/ucdp - - data://meadow/war/2024-10-02/ucdp_ced - data://grapher/war/2024-10-02/ucdp_monthly: - - data://garden/war/2024-10-02/ucdp_monthly + - data://garden/countries/2024-08-27/gleditsch + data://grapher/war/2024-11-22/ucdp_preview: + - data://garden/war/2024-11-22/ucdp_preview # PRIO v3.1 data://meadow/war/2023-09-21/prio_v31: - snapshot://war/2023-09-21/prio_v31.xls data://garden/war/2023-09-21/prio_v31: - data://meadow/war/2023-09-21/prio_v31 - - data://garden/countries/2023-09-25/gleditsch + - data://garden/countries/2024-08-27/gleditsch data://grapher/war/2023-09-21/prio_v31: - data://garden/war/2023-09-21/prio_v31 @@ -226,7 +227,7 @@ steps: data://garden/harvard/2024-07-22/global_military_spending_dataset: - data://meadow/harvard/2024-07-22/global_military_spending_dataset - data://garden/demography/2024-07-15/population - - data://garden/countries/2023-09-25/gleditsch + - data://garden/countries/2024-08-27/gleditsch - data://garden/cow/2024-07-26/national_material_capabilities data://grapher/harvard/2024-07-22/global_military_spending_dataset: - data://garden/harvard/2024-07-22/global_military_spending_dataset diff --git a/default.mk b/default.mk index 91b0bdf76f7..ba3fb4919f8 100644 --- a/default.mk +++ b/default.mk @@ -38,20 +38,27 @@ install-uv-default: @echo '==> Installing packages' @if [ -n "$(PYTHON_VERSION)" ]; then \ echo '==> Using Python version $(PYTHON_VERSION)'; \ - export 
UV_PYTHON=$(PYTHON_VERSION); \ + [ -f $$HOME/.cargo/env ] && . $$HOME/.cargo/env || true && UV_PYTHON=$(PYTHON_VERSION) uv sync --all-extras; \ + else \ + [ -f $$HOME/.cargo/env ] && . $$HOME/.cargo/env || true && uv sync --all-extras; \ fi - [ -f $$HOME/.cargo/env ] && . $$HOME/.cargo/env || true && uv sync --all-extras check-default: @echo '==> Lint & Format & Typecheck changed files' @git fetch -q origin master @RELATIVE_PATH=$$(pwd | sed "s|^$$(git rev-parse --show-toplevel)/||"); \ CHANGED_PY_FILES=$$(git diff --name-only origin/master HEAD -- . && git diff --name-only && git ls-files --others --exclude-standard | grep '\.py'); \ - CHANGED_PY_FILES=$$(echo "$$CHANGED_PY_FILES" | sed "s|^$$RELATIVE_PATH/||" | grep '\.py' | xargs -I {} sh -c 'test -f {} && echo {}'); \ - if [ -n "$$CHANGED_PY_FILES" ]; then \ + CHANGED_PY_FILES=$$(echo "$$CHANGED_PY_FILES" | sed "s|^$$RELATIVE_PATH/||" | grep '\.py' | xargs -I {} sh -c 'test -f {} && echo {}' | grep -v '{}'); \ + FILE_COUNT=$$(echo "$$CHANGED_PY_FILES" | wc -l); \ + if [ "$$FILE_COUNT" -le 1 ] && [ "$$FILE_COUNT" -gt 0 ]; then \ echo "$$CHANGED_PY_FILES" | xargs ruff check --fix; \ echo "$$CHANGED_PY_FILES" | xargs ruff format; \ echo "$$CHANGED_PY_FILES" | xargs pyright; \ + else \ + echo "Too many files, checking all files instead."; \ + make lint; \ + make format; \ + make check-typing; \ fi lint-default: .venv diff --git a/docs/api/index.md b/docs/api/index.md index 004d360835e..5e8fa626f3e 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -26,7 +26,7 @@ The following options can be specified for all of these endpoints: **csvType** - `full` (default): Get the full data, i.e. all time points and all entities -- `filtered`: Get only the data needed to display the visible chart. For a map chart this will be only data for a single year but all countries, for a line chart it will be the selected time range and visible entities, ... +- `filtered`: Get only the data needed to display the visible chart. Different chart types return different subsets of the full data. For a map this will download data for only a single year but all countries, for a line chart it will be the selected time range and visible entities and so on for other chart types. Note that if you use `filtered`, the other query parameters in the URL will change what is downloaded. E.g. if you navigate to our life-expectancy chart and then visually select the country "Italy" and change the time range to 1950-2000 you will see that the URL in the browser is modified to include `?time=1980..2000&country=~ITA`. When you make a request to any of the endpoints above you can include any of these modifications to get exactly that data: @@ -40,7 +40,7 @@ https://ourworldindata.org/grapher/life-expectancy.csv?csvType=filtered&time=198 - `true`: Column names are short and don't use whitespace - e.g. `life_expectancy_0__sex_all__age_0` ``` -https://ourworldindata.org/grapher/life-expectancy.csv?useShortNames=true +https://ourworldindata.org/grapher/life-expectancy.csv?useColumnShortNames=true ``` ## Example notebooks @@ -60,7 +60,7 @@ Afghanistan,AFG,1950,27.7275 Afghanistan,AFG,1951,27.9634 ``` -The first two columns in the CSV file are "Entity" and "Code." "Entity" is the name of the entity, typically a country, such as "United States." "Code" is the OWID internal entity code used for countries or regions. 
For standard countries, this matches the [ISO alpha-3 code](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3) (e.g., "USA"); for non-standard or historical countries, we use custom codes. Country and region codes are standardized across all Our World in Data datasets, allowing you to join multiple datasets using either of these columns. +The first two columns in the CSV file are "Entity" and "Code." "Entity" is the name of the entity, typically a country, such as "United States." "Code" is the OWID internal entity code used for countries or regions. For standard countries, this matches the [ISO alpha-3 code](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3) (e.g., "USA"); for non-standard or historical countries, we use custom codes. Country and region names are standardized across all Our World in Data datasets, allowing you to join multiple datasets using either of these columns. The third column is either "Year" or "Day". If the data is annual, this is "Year" and contains only the year as an integer. If the column is "Day", the column contains a date string in the form "YYYY-MM-DD". @@ -68,7 +68,7 @@ The final columns are the data columns, which are the time series that powers th ## Metadata structure -The `.metadata.json` file contains metadata about the data package. The "charts" key contains information to recreate the chart, like the title, subtitle etc.. The "columns" key contains information about each of the columns in the csv, like the unit, timespan covered, citation for the data etc.. Here is a (slightly shortened) example of the metadata for the life-expectancy chart: +The `.metadata.json` file contains metadata about the data package. The "charts" key contains information to recreate the chart, like the title, subtitle etc. The "columns" key contains information about each of the columns in the csv, like the unit, timespan covered, citation for the data etc. Here is a (slightly shortened) example of the metadata for the life-expectancy chart: ```json { diff --git a/docs/architecture/metadata/structuring-yaml.md b/docs/architecture/metadata/structuring-yaml.md index bb8a16b2862..f0c4091c5da 100644 --- a/docs/architecture/metadata/structuring-yaml.md +++ b/docs/architecture/metadata/structuring-yaml.md @@ -233,4 +233,41 @@ tables: {definitions.conflict_type_estimate} ``` -Be cautious with line breaks and trailing whitespace when utilizing templates. Despite using good defaults, you might end up experimenting a lot to get the desired result. +Line breaks and whitespaces can be tricky when using Jinja templates. We use reasonable defaults and strip whitespaces, so in most cases you should be fine with using `<%` and `%>`, but in more complex cases, you might have to experiment with +more fine-grained [whitespace control](https://jinja.palletsprojects.com/en/stable/templates/#whitespace-control) using tags `<%-` and `-%>`. This is most often used in if-else blocks like this: + +```yaml +age: |- + <% if age_group == "ALLAges" %> + ... + <%- elif age_group == "Age-standardized" %> + ... + <%- else %> + ... + <%- endif %> +``` + +The most straightforward way to check your metadata is in Admin, although that means waiting for your step to finish. There's a faster way to check your YAML file directly.
Create a `playground.ipynb` notebook in the same folder as your YAML file and copy this to the first cell: + +```python +from etl import grapher_helpers as gh +dim_dict = { + "age_group": "YEARS0-4", "sex": "Male", "cause": "Drug use disorders" +} +d = gh.render_yaml_file("ghe.meta.yml", dim_dict=dim_dict) +d["tables"]["ghe"]["variables"]["death_count"] +``` + +An alternative is examining `VariableMeta`: + +```python +from owid.catalog import Dataset + +from etl import grapher_helpers as gh +from etl import paths + +tb = Dataset(paths.DATA_DIR / "garden/who/2024-07-30/ghe")['ghe'] + +# Sample a random row to get the dimension values +dim_dict = dict(zip(tb.index.names, tb.sample(1).index[0])) + +gh.render_variable_meta(tb.death_count.m, dim_dict=dim_dict) +``` diff --git a/docs/architecture/workflow/index.md index 6fb2c7f3b16..66bfd66fe0a 100644 --- a/docs/architecture/workflow/index.md +++ b/docs/architecture/workflow/index.md @@ -1,7 +1,3 @@ ---- -status: new ---- - Our World in Data has a whole team dedicated to data management that takes data from publicly available sources (e.g. the _UN Food and Agriculture Organisation_), and makes it available to our researchers to analyse and create visualisations for their articles. ## Five stages @@ -9,18 +5,17 @@ Our World in Data has a whole team dedicated to data management that takes data The ETL project provides an opinionated data management workflow, which separates a data manager's work into five stages: ```mermaid -graph TB +graph LR -snapshot --> format --> harmonise --> import --> publish +snapshot --> format --> harmonize/process --> import --> publish ``` The design of the ETL involves steps that mirror the stages above, which help us to meet several design goals of the project: -1. [Snapshot step](#snapshot): Take a **snapshot** of the upstream data product and store it on our end. - -- [Meadow step](#meadow): Bring the data into a **common format**. -- [Garden step](#garden): **Harmonise** the names of countries, genders and any other columns we may want to join on. Also do the necessary data processing to make the dataset usable for our needs. -- [Grapher step](#grapher): **Import** the data to our internal MySQL database. +1. [Snapshot step](#snapshot): Take a **snapshot** of the upstream data product and store it. +2. [Meadow step](#meadow): Bring the data into a **common format**. +3. [Garden step](#garden): **Harmonise** the names of countries, genders and any other columns we may want to join on. Also do the necessary **data processing** to make the dataset usable for our needs. +4. [Grapher step](#grapher): **Import** the data to our internal MySQL database. A data manager must implement all these steps to make something chartable on the Our World in Data site. @@ -32,9 +27,7 @@ A data manager must implement all these steps to make something chartable on the ## Snapshot -The initial step in importing data from an upstream source involves **transferring an external file directly into our platform**. This process is essential to ensure both reliable and secure access to the file. - -It's important to recognize that an external source might remove the file at any time. Furthermore, this method supports the reproducibility of all Extract, Transform, Load (ETL) processes. This is crucial because the content of the file at the source may undergo changes, such as the removal or addition of datapoints, or alterations in field names.
+The initial step consists of **transferring an external file from an upstream provider into our platform**. This ensures that the source data is always accessible. This is because the upstream provider might remove the file at any time, or change it. The accompanying diagram illustrates the process of importing various versions of the same dataset into our snapshot catalog, depicted over time. Imagine that the vertical axis represents time. @@ -60,9 +53,9 @@ flowchart LR The snapshot step typically consists of a DVC file and a script that downloads the upstream data and saves it to our snapshot catalog. Snapshot files are located in the [`snapshots/`](https://github.com/owid/etl/tree/master/snapshots) directory of the project. -Note that we need a DVC file per upstream data file; hence, in some instances, if the source publishes a datset using multiple files, we need multiple DVC files. +Note that we need a DVC file per upstream data file; hence, in some instances, if the source publishes a dataset using multiple files, we need multiple DVC files. -### Metadata +### Snapshot metadata A Snapshot is a picture of a data product (e.g. a data CSV file) provided by an upstream data provider at a particular point in time. It is the entrypoint to ETL and where we define metadata attributes of that picture. This is fundamental to ensure that the data is properly documented and that the metadata is propagated to the rest of the system. The metadata in Snapshot consists mainly of one object: `meta.origin`. !!! info "Learn more in our [metadata reference](../metadata/reference#origin)." -This metadata is captured in a DVC file (similar to a yaml file), which contains all the snapshot metadata fields as key-value pairs. + This metadata is captured in a DVC file (similar to a YAML file), which contains all the snapshot metadata fields as key-value pairs. ??? example "Example of [`snapshots/animal_welfare/2023-10-24/fur_laws.xlsx.dvc`](https://github.com/owid/etl/blob/master/snapshots/animal_welfare/2023-10-24/fur_laws.xlsx.dvc)" @@ -99,49 +92,17 @@ This metadata is captured in a DVC file (similar to a yaml file), which contains ## Meadow -The meadow step is the first Transform step of our ETL. - -In a meadow step, we load a `snapshot` and adapt it to be in a convenient format. A convenient format means creating an instance of a [`Dataset`](../../design/common-format/#datasets-owidcatalogdataset), with the appropriate data as a table (or tables). +The meadow step is the first Transform step of our ETL. In it, we load a [`Snapshot`](../../architecture/design/phases/#snapshot) and adapt it to be in a convenient format. A convenient format means creating an instance of a [`Dataset`](../../architecture/design/phases/#datasets), with the appropriate data as a `Table` (or tables). In this step, you can add and define metadata, but we rarely do this. Instead, we propagate the metadata defined in the Snapshot step and leave it to the Garden step to enhance the metadata. Meadow steps should only have `snapshot` (or `walden`) dependencies and ー by definition ー should not depend on `garden` steps.
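+A minimal sketch of what a meadow step script could look like (the snapshot name is illustrative, and `PathFinder`/`create_dataset` are the helpers commonly used in ETL steps; treat this as a sketch rather than a template for any specific dataset):
+
+```python
+from etl.helpers import PathFinder, create_dataset
+
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    # Load the snapshot this step depends on and read it into a Table.
+    snap = paths.load_snapshot("fur_laws.xlsx")
+    tb = snap.read()
+
+    # Bring the table into the common format (e.g. set a country/year index).
+    tb = tb.format(["country", "year"])
+
+    # Save a new meadow dataset, propagating the snapshot metadata.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata)
+    ds_meadow.save()
+```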
-A typical flow up to the Meadow step could look like: - -```mermaid -flowchart LR - - upstream1((____)):::node -.->|copy| snapshot1((____)):::node - snapshot1((____)):::node -->|format| meadow1((____)):::node - - subgraph id0 [Upstream] - upstream1 - end - - subgraph id1 [Snapshot] - snapshot1 - end - - subgraph id2 [Meadow] - meadow1 - end - - - subgraph id [ETL] - id1 - id2 - end - - classDef node fill:#002147,color:#002147 - classDef node_ss fill:#002147,color:#fff -``` - ## Garden The Garden step is where most of the work falls in. This is where the data manager needs to carefully look at the data, filter outliers, harmonize labels (e.g. country names), improve the dataset metadata, etc. -Garden steps typically depend on meadow steps. For instance, the Garden step `data://garden/un/2022-07-11/un_wpp`, which generates the dataset _World Population Prospects (UN, 2022)_, depends on this same process but in Meadow (i.e. `data://meadow/un/2022-07-11/un_wpp`). After some pre-liminary work (mostly re-formating table, and some minor cleaning), we can now focus on more complex processing steps in Garden. +Garden steps typically depend on meadow steps. For instance, the Garden step `data://garden/un/2024-07-12/un_wpp`, which generates the dataset _World Population Prospects (UN, 2024)_, depends on this same process but in Meadow (i.e. `data://meadow/un/2024-07-12/un_wpp`). After some preliminary work (mostly re-formatting tables, and some minor cleaning), we can now focus on more complex processing steps in Garden. A typical flow up to the Garden step could look like: ```mermaid flowchart LR upstream1((____)):::node -.->|copy| snapshot1((____)):::node snapshot1((____)):::node -->|format| meadow1((____)):::node - meadow1((____)):::node -->|harmonize| garden1((____)):::node + meadow1((____)):::node -->|process| garden1((____)):::node subgraph id0 [Upstream] upstream1 end @@ -178,7 +139,7 @@ flowchart LR classDef node_ss fill:#002147,color:#fff ``` -However, garden steps could also depend on other garden steps. This is often the case for datasets containing _long-run indicators_, where different `garden` datasets are combined. +However, Garden steps could also depend on other garden steps. This is often the case for datasets containing _long-run indicators_, or just in general when different `garden` datasets are combined. !!! info "Long-run indicators" ```yaml data://garden/demography/2023-03-31/population: - - data://garden/hyde/2017/baseline - - data://garden/gapminder/2023-03-31/population - - data://garden/un/2022-07-11/un_wpp - - data://open_numbers/open_numbers/latest/gapminder__systema_globalis + - data://garden/hyde/2017/baseline + - data://garden/gapminder/2023-03-31/population + - data://garden/un/2022-07-11/un_wpp + - data://open_numbers/open_numbers/latest/gapminder__systema_globalis ``` -An important processing step in Garden is to standardise (or harmonise) the country names. You can learn more about this in our [country standardisation guide](../../guides/harmonize-countries). +An important processing step in Garden is to standardize (or harmonize) the country names. You can learn more about this in our [country harmonization guide](../../guides/harmonize-countries). -### Metadata +### Garden metadata After adapting and processing the origin's data, we have a curated dataset.
This dataset contains indicators (maybe not present in the origin) that we need to properly document. -The metadata in Garden consists mainly of two objects: `dataset` and `tables`. The metadata comes as a YAML file next to the processing scripts. +The metadata in Garden consists mainly of two objects: [`Dataset`](../../architecture/metadata/reference/#dataset) and [`Table`](../../architecture/metadata/reference/#table) (list). The metadata comes as a YAML file next to the processing scripts. !!! info "Learn more in our [dataset reference](../metadata/reference/#dataset), [tables reference](../metadata/reference/#table) and [indicator reference](../metadata/reference/#variable)." ## Grapher -In the `grapher` step the work should be minimal. Here, we create a `grapher` view by adapting our Garden dataset to fit the Grapher requirements. +In the Grapher step the work should be minimal. Here, we create a `Grapher` view by adapting our Garden dataset to adhere to the Grapher requirements. -Grapher views are still normal datasets, but they adapt the data to the way it must look when being inserted to MySQL. For each grapher view, there is a corresponding matching `grapher://` step automatically generated which does the actual insert to MySQL, if MySQL credentials have been configured. +Grapher views are still normal datasets, but they adapt the data to the way it must look when being inserted into MySQL. For each grapher view, there is a corresponding matching `grapher://` step automatically generated which does the actual insert to MySQL, if MySQL credentials have been configured. A typical flow up to the Grapher step could look like: @@ -258,159 +219,6 @@ flowchart LR classDef node_ss fill:#002147,color:#fff ``` -In principle, a grapher step only loads a single garden step. +In principle, a Grapher step only loads a single garden step. -Note that the diagram shows a final step outside of the ETL. This is when the `grapher://` step is executed, and takes data from the ETL (from the etl `garden` step) and imports it to oure database. - -!!! bug "TODO: Add an example of code" - -## Export - -Sometimes we want to perform an action instead of creating a dataset. For instance, we might want to create a TSV file for an explorer, commit a CSV to a GitHub repository, or create a config for a multi-dimensional indicator. This is where the `export` step comes in. - -Export steps are defined in `etl/steps/export` directory and have similar structure to regular steps. They are run with the `--export` flag. - -```bash -etlr export://explorers/minerals/latest/minerals --export -``` - -The `def run(dest_dir):` function doesn't save a dataset, but calls a method that performs the action. For instance `create_explorer(...)` or `gh.commit_file_to_github(...)`. Once the step is executed successfully, it won't be run again unless its code or dependencies change (it won't be "dirty"). - -### Creating explorers - -TSV files for explorers are created using the `create_explorer` function, usually from a configuration YAML file - -```python -# Create a new explorers dataset and tsv file. -ds_explorer = create_explorer(dest_dir=dest_dir, config=config, df_graphers=df_graphers) -ds_explorer.save() -``` - -### Creating multi-dimensional indicators - -Multi-dimensional indicators are powered by a configuration that is typically created from a YAML file.
The structure of the YAML file looks like this: - -```yaml title="etl/steps/export/multidim/covid/latest/covid.deaths.yaml" -definitions: - table: {definitions.table} - -title: - title: COVID-19 deaths - titleVariant: by interval -defaultSelection: - - World - - Europe - - Asia -topicTags: - - COVID-19 - -dimensions: - - slug: interval - name: Interval - choices: - - slug: weekly - name: Weekly - description: null - - slug: biweekly - name: Biweekly - description: null - - - slug: metric - name: Metric - choices: - - slug: absolute - name: Absolute - description: null - - slug: per_capita - name: Per million people - description: null - - slug: change - name: Change from previous interval - description: null - -views: - - dimensions: - interval: weekly - metric: absolute - indicators: - y: "{definitions.table}#weekly_deaths" - - dimensions: - interval: weekly - metric: per_capita - indicators: - y: "{definitions.table}#weekly_deaths_per_million" - - dimensions: - interval: weekly - metric: change - indicators: - y: "{definitions.table}#weekly_pct_growth_deaths" - - - dimensions: - interval: biweekly - metric: absolute - indicators: - y: "{definitions.table}#biweekly_deaths" - - dimensions: - interval: biweekly - metric: per_capita - indicators: - y: "{definitions.table}#biweekly_deaths_per_million" - - dimensions: - interval: biweekly - metric: change - indicators: - y: "{definitions.table}#biweekly_pct_growth_deaths" -``` - -The `dimensions` field specifies selectors, and the `views` field defines views for the selection. Since there are numerous possible configurations, `views` are usually generated programmatically. However, it's a good idea to create a few of them manually to start. - -You can also combine manually defined views with generated ones. See the `etl.multidim` module for available helper functions or refer to examples from `etl/steps/export/multidim/`. Feel free to add or modify the helper functions as needed. - -The export step loads the YAML file, adds `views` to the config, and then calls the function. - -```python title="etl/steps/export/multidim/covid/latest/covid.py" -def run(dest_dir: str) -> None: - engine = get_engine() - - # Load YAML file - config = paths.load_mdim_config("covid.deaths.yaml") - - multidim.upsert_multidim_data_page("mdd-energy", config, engine) -``` - -To see the multi-dimensional indicator in Admin, run - -```bash -etlr export://multidim/energy/latest/energy --export -``` - -and check out the preview at http://staging-site-my-branch/admin/grapher/mdd-name. - - -### Exporting data to GitHub - -One common use case for the `export` step is to commit a dataset to a GitHub repository. This is useful when we want to make a dataset available to the public. The pattern for this looks like this: - -```python -if os.environ.get("CO2_BRANCH"): - dry_run = False - branch = os.environ["CO2_BRANCH"] -else: - dry_run = True - branch = "master" - -gh.commit_file_to_github( - combined.to_csv(), - repo_name="co2-data", - file_path="owid-co2-data.csv", - commit_message=":bar_chart: Automated update", - branch=branch, - dry_run=dry_run, -) -``` - -This code will commit the dataset to the `co2-data` repository on GitHub if you specify the `CO2_BRANCH` environment variable, i.e. - -```bash -CO2_BRANCH=main etlr export://co2/latest/co2 --export -``` +Note that the diagram shows a final step outside of the ETL. This is when the `grapher://` step is executed, and takes data from the ETL (from the etl `garden` step) and imports it to our database. 
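+A minimal sketch of what such a grapher step could look like (the dataset name is illustrative, and `PathFinder`/`create_dataset` are the helpers commonly used in ETL steps):
+
+```python
+from etl.helpers import PathFinder, create_dataset
+
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    # Load the single garden dataset this step depends on, and its main table.
+    ds_garden = paths.load_dataset("un_wpp")
+    tb = ds_garden["un_wpp"]
+
+    # Create the grapher dataset, reusing the garden metadata, and save it.
+    ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata)
+    ds_grapher.save()
+```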
diff --git a/docs/architecture/workflow/other-steps.md b/docs/architecture/workflow/other-steps.md index e738bb318c2..bdd1bf186ea 100644 --- a/docs/architecture/workflow/other-steps.md +++ b/docs/architecture/workflow/other-steps.md @@ -1,11 +1,35 @@ +--- +status: new +--- + So far you have learned about the standard steps. These should cover most of the cases. However, there are some other steps worth mentioning. -## Explorers +## Export steps + +Sometimes we want to perform an action instead of creating a dataset. For instance, we might want to create a TSV file for an explorer, commit a CSV to a GitHub repository, or create a config for a multi-dimensional indicator. This is where the `Export` step comes in. + +Export steps are used to perform an action on an already created dataset. This action typically involves making the data available to other parts of the system. There are different types of export steps: + +- **Explorers**: Create a TSV file for a data explorer. +- **Multi-dimensional indicators**: Create a configuration for a multi-dimensional indicator. +- **Export to GitHub**: Commit a dataset to a GitHub repository. + +Export steps should be used after the data has been processed and is ready to be used (post-Garden). + +!!! note "Learn more about [export steps](../../guides/data-work/export-data.md)" -Data explorers are Grapher charts expanded with additional functionalities to facilitate exploration, such as dynamic entity filters or customizable menus. They are powered by CSV files generated by ETL [served from S3](https://dash.cloudflare.com/078fcdfed9955087315dd86792e71a7e/r2/default/buckets/owid-catalog). Explorers data step in ETL is responsible for generating these CSV files. It works in the same way as e.g. garden step, but the transformations made there are meant to get the data ready for the data explorer (and not be consumed by users of catalog). +### Explorers + +Data explorers are Grapher charts expanded with additional functionalities to facilitate exploration, such as dynamic entity filters or customizable menus. They are usually powered by indicators from OWID's Grapher database. !!! info "Learn more about creating Data explorers [on Notion :octicons-arrow-right-24:](https://www.notion.so/owid/Creating-Data-Explorers-cf47a5ef90f14c1fba8fc243aba79be7)." +!!! note "Legacy explorers" + + In the past, Explorers were manually defined in our Admin. Data was sourced from CSV files generated by ETL [served from S3](https://dash.cloudflare.com/078fcdfed9955087315dd86792e71a7e/r2/default/buckets/owid-catalog), or on GitHub. + + We have slowly transitioned into a new system where explorers are generated from the ETL pipeline. This is a more scalable and maintainable solution. + ## Backport Datasets from our production grapher database can be backported to the ETL catalog. @@ -42,9 +66,6 @@ flowchart LR classDef node_ss fill:#002147,color:#fff ``` -## Open Numbers - -!!! warning "TO BE DONE"
## ETag diff --git a/docs/assets/annotations-chart.png b/docs/assets/annotations-chart.png new file mode 100644 index 00000000000..6d4383983f2 Binary files /dev/null and b/docs/assets/annotations-chart.png differ diff --git a/docs/assets/pr-1.png b/docs/assets/pr-1.png new file mode 100644 index 00000000000..a4dd30c215b Binary files /dev/null and b/docs/assets/pr-1.png differ diff --git a/docs/assets/pr-2.png b/docs/assets/pr-2.png new file mode 100644 index 00000000000..33f2c9bd9d8 Binary files /dev/null and b/docs/assets/pr-2.png differ diff --git a/docs/guides/auto-regular-updates.md b/docs/guides/auto-regular-updates.md index 50912419fbf..f82957e6809 100644 --- a/docs/guides/auto-regular-updates.md +++ b/docs/guides/auto-regular-updates.md @@ -1,7 +1,6 @@ --- tags: - 👷 Staff -status: new --- !!! warning "This is a work in progress" diff --git a/docs/guides/data-work/add-data.md b/docs/guides/data-work/add-data.md index fa83c3c5dc6..e42cf58169c 100644 --- a/docs/guides/data-work/add-data.md +++ b/docs/guides/data-work/add-data.md @@ -35,6 +35,16 @@ There are different ways you can add data to the catalog, depending on your tech +## Create your new environment +Before starting to add a new dataset, make sure to create your new environment. This means creating a new branch, its corresponding pull request and staging server. This can all be done with one command: + +```bash +etl pr "{short_name}: new dataset" data +``` + +This will create a new git branch in your local repository with an empty commit, which will be pushed to remote. It will also create a draft pull request on GitHub, and a staging server. Wait for a notification from [@owidbot](https://github.com/owidbot). It should take a few minutes, and will inform you that the staging server `http://staging-site-data-{short_name}` has been created. + + ## Using Wizard (recommended) !!! info diff --git a/docs/guides/data-work/export-data.md b/docs/guides/data-work/export-data.md new file mode 100644 index 00000000000..8dbe49b9ae2 --- /dev/null +++ b/docs/guides/data-work/export-data.md @@ -0,0 +1,149 @@ +--- +status: new +--- + +!!! warning "Export steps are a work in progress" + +Export steps are defined in the `etl/steps/export` directory and have a similar structure to regular steps. They are run with the `--export` flag: + +```bash +etlr export://explorers/minerals/latest/minerals --export +``` + +The `def run(dest_dir):` function doesn't save a dataset, but calls a method that performs the action. For instance `create_explorer(...)` or `gh.commit_file_to_github(...)`. Once the step is executed successfully, it won't be run again unless its code or dependencies change (it won't be "dirty"). + +## Creating explorers + +TSV files for explorers are created using the `create_explorer` function, usually from a configuration YAML file: + +```py +# Create a new explorers dataset and tsv file. +ds_explorer = create_explorer(dest_dir=dest_dir, config=config, df_graphers=df_graphers) +ds_explorer.save() +``` + +!!! info "Creating explorers on staging servers" + + Explorers can be created or edited on staging servers and then manually migrated to production. Each staging server creates a branch in the `owid-content` repository. Editing explorers in Admin or running the `create_explorer` function pushes changes to that branch. Once the PR is merged, the branch gets pushed to the `owid-content` repository (not to the `master` branch, but to its own branch).
You then need to manually create a PR from that branch and merge it into `master`. + + +## Creating multi-dimensional indicators + +Multi-dimensional indicators are powered by a configuration that is typically created from a YAML file. The structure of the YAML file looks like this: + +```yaml title="etl/steps/export/multidim/energy/latest/energy_prices.yaml" +title: + title: "Energy prices" + titleVariant: "by energy source" +defaultSelection: + - "European Union (27)" +topicTags: + - "Energy" +dimensions: + - slug: "frequency" + name: "Frequency" + choices: + - slug: "annual" + name: "Annual" + description: "Annual data" + - slug: "monthly" + name: "Monthly" + description: "Monthly data" + - slug: "source" + name: "Energy source" + choices: + - slug: "electricity" + name: "Electricity" + - slug: "gas" + name: "Gas" + - slug: "unit" + name: "Unit" + choices: + - slug: "euro" + name: "Euro" + description: "Price in euros" + - slug: "pps" + name: "PPS" + description: "Price in Purchasing Power Standard" +views: + # Views will be filled out programmatically. + [] + +``` + +The `dimensions` field specifies selectors, and the `views` field defines views for the selection. Since there are numerous possible configurations, `views` are usually generated programmatically (using function `etl.multidim.generate_views_for_dimensions`). + +You can also combine manually defined views with generated ones. See the `etl.multidim` module for available helper functions or refer to examples from `etl/steps/export/multidim/`. Feel free to add or modify the helper functions as needed. + +The export step loads the data dependencies and the config YAML file, adds `views` to the config, and then pushes the configuration to the database. + +```python title="etl/steps/export/multidim/energy/latest/energy_prices.py" +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load data on energy prices. + ds_grapher = paths.load_dataset("energy_prices") + + # Read table of prices in euros. + tb_annual = ds_grapher.read("energy_prices_annual") + tb_monthly = ds_grapher.read("energy_prices_monthly") + + # + # Process data. + # + # Load configuration from adjacent yaml file. + config = paths.load_mdim_config() + + # Create views. + config["views"] = multidim.generate_views_for_dimensions( + dimensions=config["dimensions"], + tables=[tb_annual, tb_monthly], + dimensions_order_in_slug=("frequency", "source", "unit"), + warn_on_missing_combinations=False, + additional_config={"chartTypes": ["LineChart"], "hasMapTab": True, "tab": "map"}, + ) + + # + # Save outputs. + # + multidim.upsert_multidim_data_page(slug="mdd-energy-prices", config=config, engine=get_engine()) + +``` + +To see the multi-dimensional indicator in Admin, run + +```bash +etlr export://multidim/energy/latest/energy_prices --export +``` + +and check out the preview at: http://staging-site-my-branch/admin/grapher/mdd-energy-prices + + +## Exporting data to GitHub + +One common use case for the `export` step is to commit a dataset to a GitHub repository. This is useful when we want to make a dataset available to the public. 
The pattern for this looks like this: + +```python +if os.environ.get("CO2_BRANCH"): + dry_run = False + branch = os.environ["CO2_BRANCH"] +else: + dry_run = True + branch = "master" + +gh.commit_file_to_github( + combined.to_csv(), + repo_name="co2-data", + file_path="owid-co2-data.csv", + commit_message=":bar_chart: Automated update", + branch=branch, + dry_run=dry_run, +) +``` + +This code will commit the dataset to the `co2-data` repository on GitHub if you specify the `CO2_BRANCH` environment variable, i.e. + +```bash +CO2_BRANCH=main etlr export://co2/latest/co2 --export +``` diff --git a/docs/guides/data-work/index.md b/docs/guides/data-work/index.md index a4730cde44b..21ea8aab625 100644 --- a/docs/guides/data-work/index.md +++ b/docs/guides/data-work/index.md @@ -3,8 +3,6 @@ tags: - 👷 Staff --- -# Data work - Adding and updating datasets in ETL is part of our routine work. To this end, we've simplified the process as much as possible. Find below the list of the steps involved in the workflow. Click on each step to learn more about it. ```mermaid diff --git a/docs/guides/data-work/update-data.md b/docs/guides/data-work/update-data.md index e2a898db898..c0a66de4366 100644 --- a/docs/guides/data-work/update-data.md +++ b/docs/guides/data-work/update-data.md @@ -12,9 +12,9 @@ This guide explains the general workflow to update a dataset that already exists In a nutshell, these are the steps to follow: - Switch to `master` branch (`git switch master`), and ensure it's up-to-date (`git pull`). - - Create a new branch and a draft pull request (PR) with a staging server: + - Create a new branch (name is auto-generated) and a draft pull request (PR) with a staging server: ```bash - etl pr update-{short_name} --title "Update {short_name}" --category data + etl pr "{short_name}: update" data ``` - Use the ETL Dashboard to create new versions of the steps (this will duplicate the code of the old steps). - Execute the newly created snapshot scripts, if any. @@ -47,12 +47,12 @@ This guide assumes you have already a [working installation of `etl`](../../../g - **Create a draft PR and a temporary staging server** - Create a PR with the following command (replace `{short_name}` with the short name of the dataset, e.g. `temperature-anomaly`): ```bash - etl pr update-{short_name} --title "Update {short_name}" --category data + etl pr "{short_name}: update" data ``` - This will create a new git branch in your local repository with an empty commit, which will be pushed to remote. - It will also create a draft pull request in github, and a staging server. - - Wait for a notification from `owidbot`. It should take a few minutes, and will inform you that the staging server [http://staging-site-update-temperature-anomaly/admin](http://staging-site-update-temperature-anomaly/admin) has been created. + This will create a new git branch in your local repository with an empty commit, which will be pushed to remote. It will also create a draft pull request on GitHub, and a staging server. + + - Wait for a notification from [@owidbot](https://github.com/owidbot). It should take a few minutes, and will inform you that the staging server `http://staging-site-update-{short_name}` has been created. - **Update steps using the ETL Dashboard**: - Start the ETL Wizard, by running: diff --git a/docs/guides/etl-tips.md b/docs/guides/etl-tips.md new file mode 100644 index 00000000000..c96fa33e03f --- /dev/null +++ b/docs/guides/etl-tips.md @@ -0,0 +1,170 @@ +--- +status: new +--- + +!!! warning "This is a work in progress"
+ + This page has been created to collect some practices when working with ETL that can be helpful for all the team to know. + + Please contribute by adding some of your tricks and tips. [Learn how to edit the documentation](../../dev/docs/). + + The content and structure of this page may change in the future. + + +## Interpolate values +Sometimes, you may have empty values in your dataset. In general, a solution for these cases is to use interpolation to fill those gaps based on previous and following values. In the `data_helpers.misc` module, you will find the function `interpolate_table` that can be used to interpolate values in a table. + +!!! note "Assumptions on the structure of `tb`" + + The function assumes that the input table has an entity column (typically for country) and a time column (year or date). + +A simple call can be done as follows: + +```python +from etl.data_helpers.misc import interpolate_table + +tb = interpolate_table( + tb, + entity_col="country", + time_col="year", +) +``` + +This will interpolate all the columns in the table `tb` for each country and year. It will use all years between the minimum and maximum years present in `tb`. It will use "linear" interpolation. + +You can adapt the function to your needs, and perform very different kinds of interpolations. + +=== "Other interpolations" + + You can use any [method from pandas](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.interpolate.html). + + ```python + tb = interpolate_table( + tb, + entity_col="country", + time_col="year", + method="quadratic" + ) + ``` + +=== "Interpolate within the year range of each country" + + Sometimes, you may have different time ranges for each country. You can interpolate within the year range of each country. That is, if one country has data from 2000 to 2010, and another from 2005 to 2015, the interpolation will be done within those ranges for each country. + + ```python + tb = interpolate_table( + tb, + entity_col="country", + time_col="year", + mode="full_range_entity", + ) + ``` + +## Expand a timeseries for all years +Sometimes, you may need to expand a timeseries to include all years within a specific range, even if some years are missing in the original data. The `expand_time_column` function in the `data_helpers.misc` module can help you achieve this. + + +A simple call can be done as follows: + +```python +from etl.data_helpers.misc import expand_time_column + +tb = expand_time_column( + tb, + entity_col="country", + time_col="year", +) +``` + +This will expand the table `tb` to include all years between the minimum and maximum years present in `tb` for each country. Missing years will be filled with NaN values. + +You can adapt the function to your needs, and perform different kinds of expansions. + +=== "Expand to full range for each entity" + + Expand the timeseries to include all years within the minimum and maximum years present in the data for each entity (e.g., country). Missing years will be filled with NaN values. + + ```python + # Expand timeseries + tb = expand_time_column( + tb, + entity_col="country", + time_col="year", + method="full_range_entity" + ) + ``` + +=== "Expand to a specific range for all entities" + + Expand the timeseries to include all years from 2000 to 2020 for all entities. Missing years will be filled with NaN values.
+ + ```python + tb = expand_time_column( + tb, + entity_col="country", + time_col="year", + method="full_range", + since_time=2000, + until_time=2020 + ) + ``` + +=== "Expand with Custom Fill Value" + + Expand the timeseries to include all years within the minimum and maximum years present in the data for each entity, and fill missing years with a custom value (e.g., 0). + + ```python + tb = expand_time_column( + tb, + entity_col="country", + time_col="year", + method="full_range_entity", + fillna_value=0 + ) + ``` + +=== "Expand to Observed Years" + + Expand the timeseries to include all years that appear in the data for any entity. This ensures that all entities have rows for all observed years. + + ```python + tb = expand_time_column( + tb, + entity_col="country", + time_col="year", + method="observed" + ) + ``` + +## Deprecate code +Our codebase has lots of code. Some of it may no longer be maintained or used. To avoid confusion, it is a good practice to slowly deprecate code. This can be done by adding a deprecation warning to the code, and then removing it after a certain period of time: + +```python +from deprecated import deprecated + +@deprecated("This function is deprecated and will be removed in the future. Please use this other function.") +def my_old_function():  # The decorator is applied right above the function being deprecated (illustrative name). + ... +``` + +Make sure to point users to an alternative function or method that they can use instead. + +Please deprecate functions with care, and make sure to check if the function is widely used, and communicate the deprecation to the team. + +## Add entity annotations to your dataset +Just add the field `display.entityAnnotationsMap` to the desired indicator. + +```yaml +display: + entityAnnotationsMap: |- + Spain: Some annotation + France: Another annotation +``` + +!!! note "Space is limited" + + The space for annotations in charts is limited. Please be mindful and keep the annotations short and to the point. 2-4 words is usually enough, ideally 2. +
+ OWID chart with annotations +
Example chart with entity annotations. Note that the space for annotations is limited.
+
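If you prefer to set this from a step's code instead of the YAML metadata file, the equivalent is to write the `display` dictionary on the indicator's metadata. A minimal sketch (the table `tb` and the column name `my_indicator` are illustrative):

```python
# Equivalent of the YAML snippet above, set programmatically in a garden step.
# The annotations map is a newline-separated "Entity: annotation" string.
tb["my_indicator"].metadata.display = {
    "entityAnnotationsMap": "Spain: Some annotation\nFrance: Another annotation",
}
```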
diff --git a/docs/guides/private-import.md b/docs/guides/private-import.md index 120ce5849e2..8da97007592 100644 --- a/docs/guides/private-import.md +++ b/docs/guides/private-import.md @@ -3,11 +3,10 @@ tags: - 👷 Staff --- -While most of the data at OWID is publicly available, some datasets are added to our catalogue with some restrictions. These include datasets that are not redistributable, or that are not meant to be shared with the public. This can happen due to a strict license by the data provider, or because the data is still in a draft stage and not ready for public consumption. +While most of the data at OWID is publicly available, some datasets are added to our catalog with some restrictions. These include datasets that are not redistributable, or that are not meant to be shared with the public. This can happen due to a strict license by the data provider, or because the data is still in a draft stage and not ready for public consumption. Various privacy configurations are available: -- Skip re-publishing to GitHub. - Disable data downloading options on Grapher. - Disable public access to the original file (snapshot). - Hide the dataset from our public catalog (accessible via `owid-catalog-py`). @@ -16,6 +15,12 @@ In the following, we explain how to create private steps in the ETL pipeline and ## Create a private step + +!!! tip "Make your dataset completely private" + + - **Snapshot**: Set `meta.is_public` to `false` in the snapshot DVC file. + - **Meadow, Garden, Grapher**: Use `data-private://` prefix in the step name in the DAG. Set `dataset.non_redistributable` to `true` in the dataset garden metadata. + ### Snapshot To create a private snapshot step, set the `meta.is_public` property in the snapshot .dvc file to false: @@ -34,7 +39,7 @@ This will prevent the file from being publicly accessible without the appropriate credentials. ### Meadow, Garden, Grapher -Creating a private data step means that the data will not be listed in the public catalog, and therefore will not be accessible via `owid-catalog-py`. In addition, private datasets will not be re-published to GitHub. +Creating a private data step means that the data will not be listed in the public catalog, and therefore will not be accessible via `owid-catalog-py`. To create a private data step (meadow, garden or grapher) simply use the `data-private` prefix in the step name in the DAG. For example, the step `grapher/ihme_gbd/2024-06-10/leading_causes_deaths` (this is from [health.yml](https://github.com/owid/etl/blob/master/dag/health.yml)) is private: @@ -70,8 +75,8 @@ etl run [step-name] --private If you want to make a private step public simply follow the steps below: -- **In the DAG:** Replace `data-private/` prefix with `data/`. -- **In the snapshot DVC file**: Set `meta.is_public` to `true` (or simply remove `is_public` property). -- (Optional) **Allow for Grapher downloads**: Set `dataset.non_redistributable` to `false` in the dataset garden metadata (or simply remove the property from the metadata). +- **In the DAG:** Replace `data-private://` prefix with `data://`. +- **In the snapshot DVC file**: Set `meta.is_public` to `true` (or simply remove this property). +- (Optional) **Allow for Grapher downloads**: Set `dataset.non_redistributable` to `false` in the dataset garden metadata (or simply remove this property). After this, re-run the snapshot step and commit your changes.
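For reference, a minimal sketch of the two pieces involved when making a dataset fully private, based on the instructions above (all paths and step names are illustrative placeholders):

```yaml
# snapshots/<namespace>/<version>/my_data.csv.dvc (illustrative path)
meta:
  is_public: false

# DAG entry (illustrative): private steps use the data-private:// prefix
data-private://garden/<namespace>/<version>/my_data:
  - data-private://meadow/<namespace>/<version>/my_data
```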
diff --git a/docs/guides/pull-requests.md b/docs/guides/pull-requests.md new file mode 100644 index 00000000000..e2152923f13 --- /dev/null +++ b/docs/guides/pull-requests.md @@ -0,0 +1,42 @@ +--- +tags: + - 👷 Staff +status: new +--- + +We use pull requests (PRs) to propose changes to the codebase; they are also the best way to get feedback on your work. + +Every PR in the ETL repository has an associated [staging server](staging-servers) created for it. To smooth this process, we have automated it with the `etl pr` command. + +!!! tip "[Learn more about how to use the `etl pr` command](../etl-cli/#etl-pr)" + +## PR work summary +Once you've created a PR, the automation user [@owidbot](https://github.com/owidbot) will add a comment to the PR summarizing your work and providing links to relevant resources. This comment includes the following information: + +- Quick links: Links to the site and tooling using the changes introduced by the PR. This includes the admin site, public site, Wizard, and documentation. +- Login: Instructions on how to ssh into the staging server. +- chart-diff: Wizard app showing chart changes (if any) compared to PRODUCTION. Owidbot will complain if there are chart changes pending review. +- data-diff: Changes introduced in the data compared to PRODUCTION. + +
+ Chart Upgrader +
PR and [comment by @owidbot](https://github.com/owid/etl/pull/3563#issuecomment-2485397175), as of 19th November 2024
+
+ + +## Scheduling a PR merge + +You can schedule a PR merge by using the command `/schedule` at the end of your PR description. This is useful whenever you want to merge your PR at a specific time, e.g. nightly if it could trigger a long deployment process in the main branch. + +You have multiple options to schedule a PR merge: + +- `/schedule`: The PR will be merged at the next sharp hour (e.g., 13:00, 14:00), based on the current UTC time. +- `/schedule 2024-11-19`: The PR will be merged at midnight (00:00 UTC) on the specified date. +- `/schedule 2024-11-19T12:50:00.000Z`: The PR will be merged at the next sharp hour immediately following the specified timestamp (e.g., if scheduled for 12:50 UTC, it will merge at 13:00 UTC). + +You can find an example [here](https://github.com/owid/etl/pull/3563). + +
+ Chart Upgrader +
[GitHub action comment](https://github.com/owid/etl/pull/3563#issuecomment-2485414940), as of 19th November 2024
+
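As an illustration, a PR description could end like this (the description text itself is hypothetical):

```
Update the WDI dataset to the latest release.

/schedule 2024-11-19T21:40:00.000Z
```

With this timestamp, the PR would be merged at the next sharp hour, i.e. 22:00 UTC on 19 November 2024.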
diff --git a/docs/guides/sharing-external.md b/docs/guides/sharing-external.md index 5e30cd059cc..f93b47e4376 100644 --- a/docs/guides/sharing-external.md +++ b/docs/guides/sharing-external.md @@ -4,20 +4,28 @@ tags: --- ## Sharing work with external people -Sometimes it's useful to share our work with external people to get feedback before publishing it to the public. Staging servers can be made available to public by creating a branch with `-public` suffix. This will make the staging site available at **https://staging-site-my-branch.tail6e23.ts.net**. +Sometimes it's useful to share our work with external people to get feedback before publishing it to the public. All staging servers are by default available to the public on `https://<branch>.owid.pages.dev/`. + + ### Sharing explorers To share explorers with the public, follow these steps: -1. Create a branch with `-public` suffix (thus creating staging server). +1. Set `isPublished` to `true` in your explorer configuration. +2. Commit to trigger a deploy (could be an empty commit with `--allow-empty`). +3. Share your explorer with the public at e.g. https://<branch>.owid.pages.dev/explorers/my-explorer. + + + diff --git a/docs/guides/staging-servers.md b/docs/guides/staging-servers.md index fa0c03e641b..6f63e0748c2 100644 --- a/docs/guides/staging-servers.md +++ b/docs/guides/staging-servers.md @@ -9,10 +9,12 @@ Dedicated staging servers are automatically created from every ETL pull request. !!! note "PR staging servers URLs" - You can visit your PR staging server at `http://staging-site-<branch>`. Note that `<branch>` might differ from the exact branch name, for example `feature/123` will be `feature-123` (all symbols are changed to dashes, and the maximum length is 50 characters). + You can visit your PR staging server at `http://staging-site-<branch>` or `https://<branch>.owid.pages.dev/`. Note that `<branch>` might differ from the exact branch name, for example `feature/123` will be `feature-123` (all symbols are changed to dashes, and the maximum length is 50 characters). For more details, refer to the [python code](https://github.com/owid/etl/blob/master/apps/chart_sync/cli.py#L284) generating `<branch>` from the branch name. +The OWID site on staging servers is **public** by default. If you want to keep the work private (e.g. for embargoed data), use the `-private` suffix in the branch name. This will make it available only on `http://staging-site-<branch>`. + Once the PR is ready and the data manager merges it into master, ETL will deploy the changes and automatically run `chart-sync`, which syncs approved charts to production. The staging server is then stopped and destroyed after 3 days. diff --git a/docs/guides/types-tables.md b/docs/guides/types-tables.md new file mode 100644 index 00000000000..43ffedfa82e --- /dev/null +++ b/docs/guides/types-tables.md @@ -0,0 +1,58 @@ +--- +status: new +--- + +In ETL, we work with the Table object, which is derived from pandas.DataFrame and adjusted to our needs. Setting the types of your columns is crucial for performance and memory optimization. In this guide, we’ll cover the types we use in our ETL pipeline and how to set them. + +As a general summary, we use nullable data types, and therefore recommend using the `Float64`, `Int64` and `string[pyarrow]` types. We also avoid using `np.nan` and prefer `pd.NA`.
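As a quick illustration of why nullable dtypes matter, a minimal sketch of the behaviour with nullable integers:

```python
import pandas as pd

# Nullable dtypes keep missing values as pd.NA instead of coercing the column to float.
s = pd.Series([1, 2, pd.NA], dtype="Int64")
assert s.dtype == "Int64"  # stays an integer column despite the missing value

# With the old NumPy-backed dtype, the same data would silently become float64 with np.nan:
assert pd.Series([1, 2, None]).dtype == "float64"
```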
+ +### Loading tables +The preferred way to load a table from a dataset is: + +=== "✅ New" + ```python + tb = ds_meadow.read("my_table") + ``` +=== "❌ Old" + ```python + tb = ds_meadow["my_table"] + ``` + +This process automatically converts all columns to the recommended types: + +* `float64` or `float32` -> `Float64` +* `int32`, `uint32`, or `int64` -> `Int64` +* `category` -> `string[pyarrow]` + +To disable this conversion, use `.read("my_table", safe_types=False)`. This is especially useful when working with large tables where conversion to string type would significantly increase memory usage. + + +### Repacking datasets +We use a "repacking process" to reduce the size of the dataset before saving it to disk (`dataset.save()`). This process also converts the data to the recommended types, even if you have converted the data to old NumPy types (e.g. `.astype(float)`). + +### String dtypes +To convert a column to a string type, use: + +=== "✅ New" + ```python + # Option 1 + tb["my_column"] = tb["my_column"].astype("string") + # Option 2 + tb["my_column"] = tb["my_column"].astype("string[pyarrow]") + ``` + +=== "❌ Old" + ```python + tb["my_column"] = tb["my_column"].astype(str) + ``` + +However, if you don’t use the new method, repack will handle this conversion when saving. + + +!!! info "Difference between `string` and `string[pyarrow]`" + + Both types are very similar, but `string[pyarrow]` is more efficient and will be the default in Pandas 3.0. In practice, you won't notice a difference. + + +### `NaN` values +Avoid using `np.nan`! Always use `pd.NA`. diff --git a/docs/guides/upgrade-python-version.md b/docs/guides/upgrade-python-version.md new file mode 100644 index 00000000000..1a6242b2691 --- /dev/null +++ b/docs/guides/upgrade-python-version.md @@ -0,0 +1,16 @@ +--- +status: new +--- + +To upgrade the version of Python used in the ETL pipeline, follow these steps in the ETL terminal: + +1. Remove the current environment configuration: + ``` + rm -rf .venv + ``` +2. Rebuild the environment with the new Python version (replace `xx.x` with the desired version): + ``` + PYTHON_VERSION=3.xx.x make .venv + ``` + +The ETL currently supports versions 3.9 to 3.12. diff --git a/docs/guides/wizard.md b/docs/guides/wizard.md index b7d7713b7c2..894260cf7cf 100644 --- a/docs/guides/wizard.md +++ b/docs/guides/wizard.md @@ -72,23 +72,30 @@ Additionally, Expert can also help out create Datasette queries!
Asking the Expert to generate a Datasette query to get the charts with most views.
-### Data tools +### Data curation Pages to help us improve our charts (e.g. keeping them up to date). The current pages are: - [**Indicator Upgrader**](update-charts/#indicator-upgrader): Upgrade old indicators with their corresponding new versions to keep the charts up to date. You will need to (mostly) manually map "old indicators" to "new indicators". Then, the tool will update all affected charts with the new indicators. These modified charts can be reviewed with **Chart diff**. +- **Anomalist**: Presents all anomalies detected in a selected dataset. By default, it attempts to load anomalies from the newly added datasets (and, if applicable, it also compares the new indicators with their old counterparts). - [**Chart diff**](update-charts/#chart-diff): Shows all charts in your environment (e.g. staging server) that have been modified compared to production. This is useful to review the changes before they are pushed to production. - [**Harmonizer**](harmonize-countries): Harmonize the entity names of a table. !!! info "Learn more in the [updating charts](data-work/update-charts.md) section" -### Monitoring +### Data monitoring - **Dashboard**: Monitor all our datasets and update them quickly! - **Dataset Explorer**: A tool to explore the datasets in the ETL catalog. You can check the step dependencies and metadata. If it is a Garden step, you can also perform some actions with it. +### Explorers + +- **Map Bracketer**: Define map brackets for explorers smartly. +- **ID to Path**: Convert an explorer that is ID-based to a path-based explorer. + ### Research +- **Insight search**: Browse insights from our archive by semantic similarity. This is helpful if you want to check whether there are any insights related to a specific topic. - **Insighter**: Generate insights from a chart using LLMs. ### Misc diff --git a/docs/ignore/generate_dynamic_docs.py b/docs/ignore/generate_dynamic_docs.py index 5a6a96c0355..7cdd20532f4 100644 --- a/docs/ignore/generate_dynamic_docs.py +++ b/docs/ignore/generate_dynamic_docs.py @@ -15,7 +15,7 @@ - __[Indicator](#variable)__ (variable) - __[Origin](#origin)__ -- __[Table](#tables)__ +- __[Table](#table)__ - __[Dataset](#dataset)__ diff --git a/docs/overrides/main_aux.html b/docs/overrides/main_aux.html new file mode 100644 index 00000000000..e70aa10c879 --- /dev/null +++ b/docs/overrides/main_aux.html @@ -0,0 +1,13 @@ +{% extends "base.html" %} + +{% block content %} +{{ super() }} + +{% if git_page_authors %} +
+ + Authors: {{ git_page_authors | default('enable mkdocs-git-authors-plugin') }} + +
+{% endif %} +{% endblock %} diff --git a/etl/command.py b/etl/command.py index 1c419901c2e..c70b8ec046c 100644 --- a/etl/command.py +++ b/etl/command.py @@ -38,6 +38,12 @@ config.enable_bugsnag() +# NOTE: I tried enabling this, but ran into weird errors with unit tests and inconsistencies +# with owid libraries. It's better to wait for an official pandas 3.0 release and update +# it all at once. +# Use string[pyarrow] by default, this will become True in pandas 3.0 +# pd.options.future.infer_string = True + # if the number of open files allowed is less than this, increase it LIMIT_NOFILE = 4096 @@ -394,7 +400,13 @@ def exec_steps(steps: List[Step], strict: Optional[bool] = None) -> None: with strictness_level(strict): # Execute the step and measure the time taken - time_taken = timed_run(lambda: step.run()) + try: + time_taken = timed_run(lambda: step.run()) + except Exception: + # log which step failed and re-raise the exception, otherwise it gets lost + # in logs and we don't know which step failed + log.error("step_failed", step=str(step)) + raise execution_times[str(step)] = time_taken click.echo(f"{click.style('OK', fg='blue')}{_create_expected_time_message(time_taken)}") @@ -524,7 +536,11 @@ def _exec_step_job( step = parse_step(step_name, dag) strict = _detect_strictness_level(step, strict) with strictness_level(strict): - execution_times[step_name] = timed_run(lambda: step.run()) + try: + execution_times[step_name] = timed_run(lambda: step.run()) + except Exception: + log.error("step_failed", step=step_name) + raise print(f"--- Finished {step_name} ({execution_times[step_name]:.1f}s)") diff --git a/etl/config.py b/etl/config.py index d9b055ddedc..5706ce503b7 100644 --- a/etl/config.py +++ b/etl/config.py @@ -222,7 +222,7 @@ def variable_metadata_url(variable_id): TLS_VERIFY = bool(int(env.get("TLS_VERIFY", 1))) # Default schema for presentation.grapher_config in metadata. Try to keep it up to date with the latest schema. -DEFAULT_GRAPHER_SCHEMA = "https://files.ourworldindata.org/schemas/grapher-schema.005.json" +DEFAULT_GRAPHER_SCHEMA = "https://files.ourworldindata.org/schemas/grapher-schema.006.json" def enable_bugsnag() -> None: diff --git a/etl/data_helpers/geo.py b/etl/data_helpers/geo.py index 476ba569d8f..e9a85671b63 100644 --- a/etl/data_helpers/geo.py +++ b/etl/data_helpers/geo.py @@ -10,6 +10,7 @@ import numpy as np import owid.catalog.processing as pr import pandas as pd +from deprecated import deprecated from owid.catalog import Dataset, Table, Variable from owid.datautils.common import ExceptionFromDocstring, warn_on_list_of_entities from owid.datautils.dataframes import groupby_agg, map_series @@ -601,7 +602,7 @@ def _add_population_to_dataframe( # Load population data. if ds_population is not None: - population = ds_population.read_table("population") + population = ds_population.read("population", safe_types=False) else: population = _load_population() population = population.rename( @@ -653,6 +654,7 @@ def _add_population_to_dataframe( return cast(TableOrDataFrame, df_with_population) +@deprecated("This function is deprecated. Use `etl.data_helpers.misc.interpolate_table` instead.") def interpolate_table( df: TableOrDataFrame, country_col: str, @@ -1363,7 +1365,7 @@ def make_table_population_daily(ds_population: Dataset, year_min: int, year_max: Uses linear interpolation. 
""" # Load population table - population = ds_population.read_table("population") + population = ds_population.read("population", safe_types=False) # Filter only years of interest population = population[(population["year"] >= year_min) & (population["year"] <= year_max)] # Create date column diff --git a/etl/data_helpers/misc.py b/etl/data_helpers/misc.py index 310cdfef71a..e5354d04c1a 100644 --- a/etl/data_helpers/misc.py +++ b/etl/data_helpers/misc.py @@ -12,7 +12,7 @@ """ import math -from datetime import datetime +from datetime import date, datetime from typing import Any, Iterable, List, Literal, Optional, Set, TypeVar, Union, cast import owid.catalog.processing as pr @@ -23,6 +23,7 @@ from tqdm.auto import tqdm TableOrDataFrame = TypeVar("TableOrDataFrame", pd.DataFrame, Table) +DIMENSION_COL_NONE = "temporary" def check_known_columns(df: pd.DataFrame, known_cols: list) -> None: @@ -73,7 +74,7 @@ def interpolate_table( time_col: str Name of the column with years. mode: str - How to compelte time series. 'full_range' for complete range, 'full_range_entity' for complete range within an entity, 'reduced' for only time values appearing in the data. Use 'none' to interpolate with existing values. + How to complete time series. 'full_range' for complete range, 'full_range_entity' for complete range within an entity, 'reduced' for only time values appearing in the data. Use 'none' to interpolate with existing values. """ SINGLE_ENTITY = isinstance(entity_col, str) MULTIPLE_ENTITY = isinstance(entity_col, list) @@ -85,7 +86,12 @@ def interpolate_table( if time_mode != "none": # Expand time - df = expand_time_column(df, entity_col, time_col, method=time_mode) + df = expand_time_column( + df, + dimension_col=entity_col, + time_col=time_col, + method=time_mode, + ) # Set index df = cast(TableOrDataFrame, df.set_index(index).sort_index()) @@ -102,8 +108,8 @@ def interpolate_table( def expand_time_column( df: TableOrDataFrame, - dimension_col: str | Iterable[str], time_col: str, + dimension_col: Optional[str | Iterable[str]] = None, method: Literal["full_range", "full_range_entity", "observed", "none"] = "full_range", until_time: Optional[int | datetime] = None, since_time: Optional[int | datetime] = None, @@ -151,6 +157,12 @@ def expand_time_column( # Sanity check assert isinstance(time_col, str), "`time_col` must be a string!" + # TODO: This is temporary hack + if dimension_col is None: + dimension_col = DIMENSION_COL_NONE + df[DIMENSION_COL_NONE] = "" + df[DIMENSION_COL_NONE] = df[DIMENSION_COL_NONE].astype("string") + # Determine if we have a single or multiple dimensiosn (will affect how groupbys are done) SINGLE_DIMENSION = isinstance(dimension_col, str) MULTIPLE_DIMENSION = isinstance(dimension_col, list) @@ -171,6 +183,8 @@ def _get_complete_date_range(ds): date_max = ds.max() if isinstance(date_max, datetime): return pd.date_range(start=date_min, end=date_max) + if isinstance(date_max, date): + return pd.date_range(start=date_min, end=date_max).date else: return range(int(date_min), int(date_max) + 1) @@ -326,6 +340,10 @@ def _fillna(df: Any, method: Any): df = df.astype(dtypes) except pd.errors.IntCastingNaNError: pass + + if dimension_col == DIMENSION_COL_NONE: + df = df.drop(columns=dimension_col) + return df @@ -391,7 +409,7 @@ def explode_rows_by_time_range( ######################################################################################################################## -# TODO: Remote this temporary function once WDI has origins. 
+# TODO: Remove this temporary function once WDI has origins. def add_origins_to_mortality_database(tb_who: Table) -> Table: tb_who = tb_who.copy() @@ -423,7 +441,7 @@ def add_origins_to_mortality_database(tb_who: Table) -> Table: ################################################################################## -# TODO: Remote this temporary function once WDI has origins. +# TODO: Remove this temporary function once WDI has origins. def add_origins_to_global_burden_of_disease(tb_gbd: Table) -> Table: tb_gbd = tb_gbd.copy() diff --git a/etl/data_helpers/population.py b/etl/data_helpers/population.py index 204332a26ef..fe1b2e68e3e 100644 --- a/etl/data_helpers/population.py +++ b/etl/data_helpers/population.py @@ -76,7 +76,7 @@ def add_population( log.warning(f"Dataset {ds_un_wpp_path} is silently being loaded.") # Load granular population dataset ds_un_wpp = Dataset(ds_un_wpp_path) - pop = ds_un_wpp.read_table("population_granular") # type: ignore + pop = ds_un_wpp.read("population_granular", safe_types=False) # type: ignore # Keep only variant='medium' pop = pop[pop["variant"] == "medium"].drop(columns=["variant"]) # Keep only metric='population' diff --git a/etl/datadiff.py b/etl/datadiff.py index 0a3a0d9db4f..cbe37ae3fb1 100644 --- a/etl/datadiff.py +++ b/etl/datadiff.py @@ -40,11 +40,13 @@ def __init__( ds_b: Optional[Dataset], verbose: bool = False, cols: Optional[str] = None, + tables: Optional[str] = None, print: Callable = rich.print, snippet: bool = False, ): """ :param cols: Only compare columns matching pattern + :param tables: Only compare tables matching pattern :param print: Function to print the diff summary. Defaults to rich.print. :param snippet: Print snippet for loading both tables """ @@ -54,6 +56,7 @@ def __init__( self.p = print self.verbose = verbose self.cols = cols + self.tables = tables self.snippet = snippet def _diff_datasets(self, ds_a: Optional[Dataset], ds_b: Optional[Dataset]): @@ -257,6 +260,8 @@ def summary(self): if self.ds_a and self.ds_b: for table_name in set(self.ds_a.table_names) | set(self.ds_b.table_names): + if self.tables and not re.search(self.tables, table_name): + continue self._diff_tables(self.ds_a, self.ds_b, table_name) @@ -311,6 +316,11 @@ def __getitem__(self, name: str) -> Table: type=str, help="Compare only columns matching pattern.", ) +@click.option( + "--tables", + type=str, + help="Compare only tables matching pattern.", +) @click.option( "--exclude", "-e", @@ -341,6 +351,7 @@ def cli( channel: Iterable[CHANNEL], include: Optional[str], cols: Optional[str], + tables: Optional[str], exclude: Optional[str], verbose: bool, snippet: bool, @@ -417,7 +428,13 @@ def cli( def func(ds_a, ds_b): lines = [] differ = DatasetDiff( - ds_a, ds_b, cols=cols, print=lambda x: lines.append(x), verbose=verbose, snippet=snippet + ds_a, + ds_b, + cols=cols, + tables=tables, + print=lambda x: lines.append(x), + verbose=verbose, + snippet=snippet, ) differ.summary() return lines @@ -451,7 +468,9 @@ def _append_and_print(x): console.print(x) try: - differ = DatasetDiff(ds_a, ds_b, cols=cols, print=_append_and_print, verbose=verbose, snippet=snippet) + differ = DatasetDiff( + ds_a, ds_b, tables=tables, cols=cols, print=_append_and_print, verbose=verbose, snippet=snippet + ) differ.summary() except DatasetError as e: # soft fail and continue with another dataset diff --git a/etl/explorer.py b/etl/explorer.py index b6ae5c20fce..a250811ff21 100644 --- a/etl/explorer.py +++ b/etl/explorer.py @@ -15,7 +15,7 @@ from structlog import get_logger from etl import 
config -from etl.files import upload_file_to_server +from etl.files import download_file_from_server, run_command_on_server, upload_file_to_server from etl.grapher_io import get_variables_data from etl.paths import EXPLORERS_DIR @@ -194,6 +194,10 @@ def from_owid_content(cls, name: str) -> "Explorer": """ path = (Path(EXPLORERS_DIR) / name).with_suffix(".explorer.tsv") + # If working on staging server, pull the file from there and replace the local owid-content version + if cls._on_staging(): + download_file_from_server(path, f"owid@{config.DB_HOST}:~/owid-content/explorers/{name}.explorer.tsv") + # Build explorer from file explorer = cls.from_file(str(path), name=name) @@ -208,6 +212,10 @@ def export(self, path: Union[str, Path]): # Write parsed content to file. path.write_text(self.content) + @staticmethod + def _on_staging() -> bool: + return config.STAGING and "staging-site" in config.DB_HOST and "staging-site-master" not in config.DB_HOST # type: ignore + def to_owid_content(self, path: Optional[Union[str, Path]] = None): """Save your config in owid-content and push to server if applicable. @@ -221,10 +229,14 @@ def to_owid_content(self, path: Optional[Union[str, Path]] = None): self.export(path) # Upload it to staging server. - if config.STAGING: - if isinstance(path, str): - path = Path(path) - upload_file_to_server(path, f"owid@{config.DB_HOST}:~/owid-content/explorers/") + if self._on_staging(): + upload_file_to_server(Path(path), f"owid@{config.DB_HOST}:~/owid-content/explorers/") + + # Commit on the staging server + run_command_on_server( + f"owid@{config.DB_HOST}", + "cd owid-content && git add . && git diff-index --quiet HEAD || git commit -m ':robot: Update explorer from ETL'", + ) def save(self, path: Optional[Union[str, Path]] = None) -> None: """See docs for `to_owid_content`.""" diff --git a/etl/files.py b/etl/files.py index a5fe2d6911f..03ccfe83d1b 100644 --- a/etl/files.py +++ b/etl/files.py @@ -16,12 +16,15 @@ import pandas as pd import ruamel.yaml +import structlog import yaml from ruamel.yaml import YAML from yaml.dumper import Dumper from etl.paths import BASE_DIR +log = structlog.get_logger() + class RuntimeCache: """Runtime cache, we need locks because we usually run it in threads.""" @@ -315,7 +318,7 @@ def upload_file_to_server(local_file_path: Path, target: str) -> None: # Execute the command subprocess.run(scp_command, check=True, text=True, capture_output=True) - print(f"File {local_file_path} successfully uploaded to {target}") + log.info("file.uploaded", target=target, path=local_file_path) except subprocess.CalledProcessError as e: raise RuntimeError(f"Failed to upload file {local_file_path} to {target}") from e @@ -326,3 +329,62 @@ def create_folder(folder_path: str | Path) -> None: folder_path = Path(folder_path) if not folder_path.exists(): folder_path.mkdir(parents=True, exist_ok=True) + + +def download_file_from_server( + local_file_path: Path, + target: str, +) -> None: + """ + Download a remote file from a server to a local path using scp. + + :param target: The source file on the remote server in the format 'user@host:/remote/path'. + Example: 'user@example.com:/remote/path/to/file.txt' + :param local_file_path: Path where the downloaded file will be saved locally. + """ + # Validate the target format (basic check) + if "@" not in target or ":" not in target: + raise ValueError(f"The target '{target}' is not properly formatted. 
Expected format: 'user@host:/remote/path'.") + + # Ensure the parent directory of the local file path exists + if not local_file_path.parent.exists(): + local_file_path.parent.mkdir(parents=True, exist_ok=True) + + try: + # Construct the scp command + scp_command = ["scp", target, str(local_file_path)] + + # Execute the command + subprocess.run(scp_command, check=True, text=True, capture_output=True) + log.info("file.downloaded", target=target, path=local_file_path) + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to download file {target} to {local_file_path}") from e + + +def run_command_on_server( + ssh_target: str, + command: str, +) -> str: + """ + Run a command on a remote server via SSH using subprocess. + + :param ssh_target: The SSH target in the format 'user@hostname'. + :param command: The command to execute on the remote server. + :return: The stdout output from the command. + """ + try: + # Construct the SSH command + ssh_command = ["ssh", ssh_target, command] + + # Execute the command + result = subprocess.run( + ssh_command, + check=True, + text=True, + capture_output=True, + ) + + log.info("command.executed", target=ssh_target, command=command) + return result.stdout + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to execute command on {ssh_target}:\n{e.stderr}") from e diff --git a/etl/grapher_helpers.py b/etl/grapher_helpers.py index 058598ee9f2..b1426a752b1 100644 --- a/etl/grapher_helpers.py +++ b/etl/grapher_helpers.py @@ -2,7 +2,7 @@ from dataclasses import dataclass, field, is_dataclass from functools import lru_cache from pathlib import Path -from typing import Any, Dict, Iterable, List, Literal, Optional, Set, cast +from typing import Any, Dict, Iterable, List, Literal, Optional, Set, Union, cast import jinja2 import numpy as np @@ -10,21 +10,19 @@ import pymysql import sqlalchemy import structlog -from jinja2 import Environment from owid import catalog from owid.catalog import warnings -from owid.catalog.utils import underscore +from owid.catalog.utils import dynamic_yaml_load, dynamic_yaml_to_dict, underscore from sqlalchemy import text from sqlalchemy.engine import Engine from sqlalchemy.orm import Session from etl.db import get_engine, read_sql -from etl.files import checksum_str -from etl.grapher_io import add_entity_code_and_name +from etl.grapher_io import add_entity_code_and_name, trim_long_variable_name log = structlog.get_logger() -jinja_env = Environment( +jinja_env = jinja2.Environment( block_start_string="<%", block_end_string="%>", variable_start_string="<<", @@ -33,10 +31,21 @@ comment_end_string="#>", trim_blocks=True, lstrip_blocks=True, + undefined=jinja2.StrictUndefined, ) + +# Helper function to raise an error with << raise("uh oh...") >> +def raise_helper(msg): + raise Exception(msg) + + +jinja_env.globals["raise"] = raise_helper + # this might work too pd.api.types.is_integer_dtype(col) -INT_TYPES = tuple({f"{n}{b}" for n in ("int", "Int", "uint", "UInt") for b in ("8", "16", "32", "64")}) +INT_TYPES = tuple( + {f"{n}{b}{p}" for n in ("int", "Int", "uint", "UInt") for b in ("8", "16", "32", "64") for p in ("", "[pyarrow]")} +) def as_table(df: pd.DataFrame, table: catalog.Table) -> catalog.Table: @@ -203,19 +212,29 @@ def _cached_jinja_template(text: str) -> jinja2.environment.Template: return jinja_env.from_string(text) -def _expand_jinja_text(text: str, dim_dict: Dict[str, str]) -> str: +def _expand_jinja_text(text: str, dim_dict: Dict[str, str]) -> Union[str, bool]: if not 
_uses_jinja(text): return text try: - return _cached_jinja_template(text).render(dim_dict) + # NOTE: we're stripping the result to avoid trailing newlines + out = _cached_jinja_template(text).render(dim_dict).strip() + # Convert strings to booleans. Getting boolean directly from Jinja is not possible + if out in ("false", "False", "FALSE"): + return False + elif out in ("true", "True", "TRUE"): + return True + return out except jinja2.exceptions.TemplateSyntaxError as e: new_message = f"{e.message}\n\nDimensions:\n{dim_dict}\n\nTemplate:\n{text}\n" raise e.__class__(new_message, e.lineno, e.name, e.filename) from e + except jinja2.exceptions.UndefinedError as e: + new_message = f"{e.message}\n\nDimensions:\n{dim_dict}\n\nTemplate:\n{text}\n" + raise e.__class__(new_message) from e def _expand_jinja(obj: Any, dim_dict: Dict[str, str]) -> Any: - """Expand Jinja in all metadata fields.""" + """Expand Jinja in all metadata fields. This modifies the original object in place.""" if obj is None: return None elif isinstance(obj, str): @@ -232,6 +251,33 @@ def _expand_jinja(obj: Any, dim_dict: Dict[str, str]) -> Any: return obj +def render_yaml_file(path: Union[str, Path], dim_dict: Dict[str, str]) -> Dict[str, Any]: + """Load YAML file and render Jinja in all fields. Return a dictionary. + + Usage: + from etl import grapher_helpers as gh + from etl import paths + + tb = Dataset(paths.DATA_DIR / "garden/who/2024-07-30/ghe")['ghe'] + gh.render_variable_meta(tb.my_col.m, dim_dict={"sex": "male"}) + """ + meta = dynamic_yaml_to_dict(dynamic_yaml_load(path)) + return _expand_jinja(meta, dim_dict) + + +def render_variable_meta(meta: catalog.VariableMeta, dim_dict: Dict[str, str]) -> catalog.VariableMeta: + """Render Jinja in all fields of VariableMeta. Return a new VariableMeta object. + + Usage: + # Create a playground.ipynb next to YAML file and run this in notebook + from etl import grapher_helpers as gh + m = gh.render_yaml_file("ghe.meta.yml", dim_dict={"sex": "male"}) + m['tables']['ghe']['variables']['death_count'] + """ + # TODO: move this as a method to VariableMeta class + return _expand_jinja(meta.copy(), dim_dict) + + def _title_column_and_dimensions(title: str, dim_dict: Dict[str, Any]) -> str: """Create new title from column title and dimensions. 
For instance `Deaths`, ["age", "sex"], ["10-18", "male"] will be converted into @@ -252,8 +298,7 @@ def _underscore_column_and_dimensions(column: str, dim_dict: Dict[str, Any], tri if len(short_name) > 255: if trim_long_short_name: - unique_hash = f"_{checksum_str(short_name)}" - short_name = short_name[: (255 - len(unique_hash))] + unique_hash + short_name = trim_long_variable_name(short_name) log.warning( "short_name_trimmed", short_name=short_name, diff --git a/etl/grapher_import.py b/etl/grapher_import.py index fd3e84d41a8..576dfd21a72 100644 --- a/etl/grapher_import.py +++ b/etl/grapher_import.py @@ -432,9 +432,8 @@ def fetch_db_checksum(dataset: catalog.Dataset) -> Optional[str]: with Session(get_engine()) as session: q = select(gm.Dataset).where( - gm.Dataset.shortName == dataset.metadata.short_name, - gm.Dataset.version == dataset.metadata.version, - gm.Dataset.namespace == dataset.metadata.namespace, + gm.Dataset.catalogPath + == f"{dataset.metadata.namespace}/{dataset.metadata.version}/{dataset.metadata.short_name}" ) ds = session.scalars(q).one_or_none() return ds.sourceChecksum if ds is not None else None diff --git a/etl/grapher_io.py b/etl/grapher_io.py index 5ee8490ba60..497c5409a49 100644 --- a/etl/grapher_io.py +++ b/etl/grapher_io.py @@ -8,7 +8,6 @@ """ import concurrent.futures -import functools as ft import io import warnings from collections import defaultdict @@ -34,6 +33,7 @@ from etl import grapher_model as gm from etl.config import OWID_ENV, OWIDEnv from etl.db import get_connection, read_sql +from etl.files import checksum_str from etl.paths import CACHE_DIR, DATA_DIR log = structlog.get_logger() @@ -484,18 +484,32 @@ def variable_data_table_from_catalog( tbs = [] for (ds_path, table_name), variables in to_read.items(): try: - tb = Dataset(DATA_DIR / ds_path).read_table(table_name) + tb = Dataset(DATA_DIR / ds_path).read(table_name, safe_types=False) except FileNotFoundError as e: raise FileNotFoundError(f"Dataset {ds_path} not found in local catalog.") from e - # Simple case with no dimensions - if not variables[0].dimensions: - variable_names = [variable.shortName for variable in variables] - tb = tb[["country", "year"] + variable_names] + if "date" in tb.columns: + year_or_date = "date" + elif "year" in tb.columns: + year_or_date = "year" + else: + raise ValueError(f"Table {table_name} has no 'date' or 'year' column") - # Rename from shortName to id - tb = tb.rename(columns={variable.shortName: variable.id for variable in variables}) - tbs.append(tb) + dim_names = [k for k in tb.metadata.primary_key if k not in ("country", year_or_date)] + + # Simple case with no dimensions + if not dim_names: + col_mapping = {"country": "country", year_or_date: year_or_date} + for col in set(tb.columns) - {"country", year_or_date}: + # Variable names in MySQL are trimmed to 255 characters + name = trim_long_variable_name(col) + matches = [variable for variable in variables if name == variable.shortName] + if matches: + col_mapping[col] = matches[0].id # type: ignore + + tb = tb[col_mapping.keys()] + tb.columns = col_mapping.values() + tbs.append(tb.set_index(["country", year_or_date])) # Dimensional case else: @@ -506,27 +520,33 @@ def variable_data_table_from_catalog( # {'name': 'gender', 'value': 'all'}, # {'name': 'age_group', 'value': '15-29'} # ] - filters = variables[0].dimensions["filters"] - dim_names = [f["name"] for f in filters] - tb_pivoted = tb.pivot(index=["country", "year"], columns=dim_names) + dim_names = [k for k in tb.metadata.primary_key if k not in 
("country", year_or_date)] + tb_pivoted = tb.pivot(index=["country", year_or_date], columns=dim_names) labels = [] for variable in variables: - assert variable.dimensions, f"Variable {variable.id} has no dimensions" - labels.append( - tuple( - [variable.dimensions["originalShortName"]] - + [f["value"] for f in variable.dimensions["filters"]] - ) - ) + if not variable.dimensions: + label = [variable.shortName] + [None] * len(dim_names) + # assert variable.dimensions, f"Variable {variable.id} has no dimensions" + else: + # Construct label for multidim columns + label = [variable.dimensions["originalShortName"]] + for dim_name in dim_names: + for f in variable.dimensions["filters"]: + if f["name"] == dim_name: + label.append(f["value"]) + break + else: + label.append(None) # type: ignore + labels.append(label) tb = tb_pivoted.loc[:, labels] tb.columns = [variable.id for variable in variables] - tbs.append(tb.reset_index()) + tbs.append(tb) - # TODO: this could be slow for datasets with a lot of tables - return ft.reduce(lambda left, right: pd.merge(left, right, on=["country", "year"], how="outer"), tbs) # type: ignore + # NOTE: this can be pretty slow for datasets with a lot of tables + return pd.concat(tbs, axis=1).reset_index() # type: ignore ####################################################################################################### @@ -978,3 +998,12 @@ def _get_variables_data_with_filter( log.warning(f"Values of {field_name} not found in database: {missing_values}") return variables_data + + +def trim_long_variable_name(short_name: str) -> str: + """Trim long variable name to 255 characters and add a hash to make it unique.""" + if len(short_name) > 255: + unique_hash = f"_{checksum_str(short_name)}" + return short_name[: (255 - len(unique_hash))] + unique_hash + else: + return short_name diff --git a/etl/grapher_model.py b/etl/grapher_model.py index 3e495e414f6..3be83cee5ca 100644 --- a/etl/grapher_model.py +++ b/etl/grapher_model.py @@ -254,11 +254,16 @@ class User(Base): createdAt: Mapped[datetime] = mapped_column(DateTime, server_default=text("CURRENT_TIMESTAMP"), init=False) isActive: Mapped[int] = mapped_column(TINYINT(1), server_default=text("'1'")) fullName: Mapped[str] = mapped_column(VARCHAR(255)) + githubUsername: Mapped[str] = mapped_column(VARCHAR(255)) password: Mapped[Optional[str]] = mapped_column(VARCHAR(128)) lastLogin: Mapped[Optional[datetime]] = mapped_column(DateTime) updatedAt: Mapped[Optional[datetime]] = mapped_column(DateTime, init=False) lastSeen: Mapped[Optional[datetime]] = mapped_column(DateTime) + @classmethod + def load_user(cls, session: Session, github_username: str) -> Optional["User"]: + return session.scalars(select(cls).where(cls.githubUsername == github_username)).one_or_none() + class ChartRevisions(Base): __tablename__ = "chart_revisions" @@ -289,6 +294,13 @@ class ChartConfig(Base): slug: Mapped[Optional[str]] = mapped_column( String(255), Computed("(json_unquote(json_extract(`full`, '$.slug')))", persisted=True) ) + chartType: Mapped[Optional[str]] = mapped_column( + String(255), + Computed( + "(CASE WHEN full ->> '$.chartTypes' IS NULL THEN 'LineChart' ELSE full ->> '$.chartTypes[0]' END)", + persisted=True, + ), + ) createdAt: Mapped[datetime] = mapped_column(DateTime, server_default=text("CURRENT_TIMESTAMP"), nullable=False) updatedAt: Mapped[Optional[datetime]] = mapped_column(DateTime, onupdate=func.current_timestamp()) @@ -413,13 +425,15 @@ def load_chart_variables(self, session: Session) -> Dict[int, "Variable"]: rows = 
session.execute(stm).scalars().all() variables = {r.id: r for r in rows} + # NOTE: columnSlug must always exist in dimensions and in chart_dimensions, so there's + # no need to include columnSlug # add columnSlug if present - column_slug = self.config.get("map", {}).get("columnSlug") - if column_slug: - try: - variables[int(column_slug)] = Variable.load_variable(session, column_slug) - except NoResultFound: - raise ValueError(f"columnSlug variable {column_slug} for chart {self.id} not found") + # column_slug = self.config.get("map", {}).get("columnSlug") + # if column_slug: + # try: + # variables[int(column_slug)] = Variable.load_variable(session, column_slug) + # except NoResultFound: + # raise ValueError(f"columnSlug variable {column_slug} for chart {self.id} not found") return variables @@ -488,7 +502,13 @@ def migrate_config(self, source_session: Session, target_session: Session) -> Di # copy chart as a new object config = copy.deepcopy(self.config) - config = _remap_variable_ids(config, remap_ids) + try: + config = _remap_variable_ids(config, remap_ids) + except KeyError as e: + # This should not be happening - it means that there's a chart with a variable that doesn't exist in + # chart_dimensions and possibly not even in the variables table. It's possible that you see it in the + # admin, but only because it is cached. + raise ValueError(f"Issue with chart {self.id} - variable id not found in chart_dimensions table: {e}") return config @@ -1846,7 +1866,9 @@ def _remap_variable_ids(config: Union[List, Dict[str, Any], Any], remap_ids: Dic out[k] = remap_ids[int(v)] # columnSlug is actually a variable id, but stored as a string (it wasn't a great decision) elif k in ("columnSlug", "sortColumnSlug"): - out[k] = str(remap_ids[int(v)]) + # sometimes columnSlug stays in config, but is deleted from dimensions.
Ignore it + if int(v) in remap_ids: + out[k] = str(remap_ids[int(v)]) # if new fields with variable ids are added, try to handle them and raise a warning elif isinstance(v, int) and v in remap_ids: log.warning("remap_variable_ids.new_field", field=k, value=v) diff --git a/etl/harmonize.py b/etl/harmonize.py index 202596a9672..acda3bc3e55 100644 --- a/etl/harmonize.py +++ b/etl/harmonize.py @@ -120,11 +120,15 @@ def harmonize_ipython( # Run automatic harmonization harmonizer.run_automatic(logging="ipython") - # Need user input - harmonizer.run_interactive_ipython( - institution=institution, - num_suggestions=num_suggestions, - ) + # Export mapping immediately after automatic harmonization + harmonizer.export_mapping() + + # If there are ambiguous countries, proceed with interactive harmonization + if harmonizer.ambiguous: + harmonizer.run_interactive_ipython( + institution=institution, + num_suggestions=num_suggestions, + ) def read_table(input_file: str) -> pd.DataFrame: @@ -144,6 +148,7 @@ class CountryRegionMapper: # known aliases of our canonical geo-regions aliases: Dict[str, str] valid_names: Set[str] + owid_continents: Set[str] def __init__(self) -> None: try: @@ -173,6 +178,11 @@ def __init__(self) -> None: self.aliases = aliases self.valid_names = valid_names + # continents defined by OWID that require explicit confirmation + self.owid_continents = set( + rc_df[(rc_df.defined_by == "owid") & (rc_df.region_type.isin({"continent", "aggregate"}))].name + ) - {"World"} + def __contains__(self, key: str) -> bool: return key.lower() in self.aliases @@ -324,15 +334,20 @@ def run_automatic( for region in self.geo: if region in self.mapping: # we did this one in a previous run - continue + pass + + elif region in self.mapper.owid_continents: + # continents require explicit confirmation to avoid accidental mappings to "Asia" instead of "Asia (UN)" + ambiguous.append(region) - if region in self.mapper: + elif region in self.mapper: # it's an exact match for a country/region or its known aliases name = self.mapper[region] self.mapping[region] = name - continue - ambiguous.append(region) + else: + # unknown country + ambiguous.append(region) # logging if logging == "ipython": @@ -579,7 +594,6 @@ def export_mapping(self): def export_excluded_countries(self): if not self.excluded: return - if self.output_file is None: raise ValueError("`output_file` not provided") assert ".countries." 
in str(self.output_file), "Output file is not in **/*.countries.json format" diff --git a/etl/helpers.py b/etl/helpers.py index 48731e03b29..9b47f5ba353 100644 --- a/etl/helpers.py +++ b/etl/helpers.py @@ -100,7 +100,7 @@ def grapher_checks(ds: catalog.Dataset, warn_title_public: bool = True) -> None: year = tab["year"] else: year = tab.index.get_level_values("year") - assert year.dtype in gh.INT_TYPES, f"year must be of an integer type but was: {tab['year'].dtype}" + assert year.dtype in gh.INT_TYPES, f"year must be of an integer type but was: {year.dtype}" elif {"date", "country"} <= set(tab.all_columns): pass else: @@ -594,7 +594,7 @@ def _get_attributes_from_step_name(step_name: str) -> Dict[str, str]: if channel_type.startswith(("walden", "snapshot")): channel = channel_type namespace, version, short_name = path.split("/") - elif channel_type.startswith(("data",)): + elif channel_type.startswith(("data", "export")): channel, namespace, version, short_name = path.split("/") else: raise WrongStepName @@ -1206,7 +1206,7 @@ def map_indicator_path_to_id(catalog_path: str) -> str | int: def get_schema_from_url(schema_url: str) -> dict: """Get the schema of a chart configuration. Schema URL is saved in config["$schema"] and looks like: - https://files.ourworldindata.org/schemas/grapher-schema.005.json + https://files.ourworldindata.org/schemas/grapher-schema.006.json More details on available versions can be found at https://github.com/owid/owid-grapher/tree/master/packages/%40ourworldindata/grapher/src/schema. diff --git a/etl/indicator_upgrade/schema.py b/etl/indicator_upgrade/schema.py index cecefa3efbd..a41729a26a5 100644 --- a/etl/indicator_upgrade/schema.py +++ b/etl/indicator_upgrade/schema.py @@ -69,7 +69,7 @@ def _set_defaults(validator, properties, instance, schema): # type: ignore raise Exception(f"Could not validate schema for chart {config['id']}: {e}") # Add minTime if not set (no default provided in schema) # Kinda hacky - if config_new["type"] not in {"StackedDiscreteBar", "Marimekko", "DiscreteBar"}: + if config_new["chartTypes"][0] not in {"StackedDiscreteBar", "Marimekko", "DiscreteBar"}: if "minTime" not in config_new: config_new["minTime"] = "earliest" return config_new diff --git a/etl/multidim.py b/etl/multidim.py index 0931cb0f946..f22c68cc87a 100644 --- a/etl/multidim.py +++ b/etl/multidim.py @@ -1,14 +1,20 @@ import json +from itertools import product import pandas as pd import yaml from sqlalchemy.engine import Engine +from structlog import get_logger from apps.chart_sync.admin_api import AdminAPI from etl.config import OWID_ENV from etl.db import read_sql +from etl.grapher_io import trim_long_variable_name from etl.helpers import map_indicator_path_to_id +# Initialize logger. +log = get_logger() + def upsert_multidim_data_page(slug: str, config: dict, engine: Engine) -> None: validate_multidim_config(config, engine) @@ -162,3 +168,103 @@ def fetch_variables_from_table(table: str, engine: Engine) -> pd.DataFrame: df_dims = pd.DataFrame(dims, index=df.index) return df.join(df_dims) + + +def generate_views_for_dimensions( + dimensions, tables, dimensions_order_in_slug=None, additional_config=None, warn_on_missing_combinations=True +): + """Generate individual views for all possible combinations of dimensions in a list of flattened tables. + + Parameters + ---------- + dimensions : List[Dict[str, Any]] + Dimensions, as given in the configuration of the multidim step, e.g. 
+ [ + {'slug': 'frequency', 'name': 'Frequency', 'choices': [{'slug': 'annual','name': 'Annual'}, {'slug': 'monthly', 'name': 'Monthly'}]}, + {'slug': 'source', 'name': 'Energy source', 'choices': [{'slug': 'electricity', 'name': 'Electricity'}, {'slug': 'gas', 'name': 'Gas'}]}, + ... + ] + tables : List[Table] + Tables whose indicator views will be generated. + dimensions_order_in_slug : Tuple[str], optional + Dimension names, as they appear in "dimensions", and in the order in which they are spelled out in indicator names. For example, if indicator names are, e.g. annual_electricity_euros, then dimensions_order_in_slug would be ("frequency", "source", "unit"). + additional_config : Dict[str, Any], optional + Additional config fields to add to each view, e.g. + {"chartTypes": ["LineChart"], "hasMapTab": True, "tab": "map"} + warn_on_missing_combinations : bool, optional + True to warn if any combination of dimensions is not found among the indicators in the given tables. + + Returns + ------- + results : List[Dict[str, Any]] + Views configuration, e.g. + [ + {'dimensions': {'frequency': 'annual', 'source': 'electricity', 'unit': 'euro'}, 'indicators': {'y': 'grapher/energy/2024-11-20/energy_prices/energy_prices_annual#annual_electricity_household_total_price_including_taxes_euro'}, + {'dimensions': {'frequency': 'annual', 'source': 'electricity', 'unit': 'pps'}, 'indicators': {'y': 'grapher/energy/2024-11-20/energy_prices/energy_prices_annual#annual_electricity_household_total_price_including_taxes_pps'}, + ... + ] + + """ + # Extract all choices for each dimension as (slug, choice_slug) pairs. + choices = {dim["slug"]: [choice["slug"] for choice in dim["choices"]] for dim in dimensions} + dimension_slugs_in_config = set(choices.keys()) + + # Sanity check for dimensions_order_in_slug. + if dimensions_order_in_slug: + dimension_slugs_in_order = set(dimensions_order_in_slug) + + # Check if any slug in the order is missing from the config. + missing_slugs = dimension_slugs_in_order - dimension_slugs_in_config + if missing_slugs: + raise ValueError( + f"The following dimensions are in 'dimensions_order_in_slug' but not in the config: {missing_slugs}" + ) + + # Check if any slug in the config is missing from the order. + extra_slugs = dimension_slugs_in_config - dimension_slugs_in_order + if extra_slugs: + log.warning( + f"The following dimensions are in the config but not in 'dimensions_order_in_slug': {extra_slugs}" + ) + + # Reorder choices to match the specified order. + choices = {dim: choices[dim] for dim in dimensions_order_in_slug if dim in choices} + + # Generate all combinations of the choices. + all_combinations = list(product(*choices.values())) + + # Create the views. + results = [] + for combination in all_combinations: + # Map dimension slugs to the chosen values. + dimension_mapping = {dim_slug: choice for dim_slug, choice in zip(choices.keys(), combination)} + slug_combination = "_".join(combination) + + # Find relevant tables for the current combination. + relevant_table = [] + for table in tables: + if slug_combination in table: + relevant_table.append(table) + + # Handle missing or multiple table matches. + if len(relevant_table) == 0: + if warn_on_missing_combinations: + log.warning(f"Combination {slug_combination} not found in tables") + continue + elif len(relevant_table) > 1: + log.warning(f"Combination {slug_combination} found in multiple tables: {relevant_table}") + + # Construct the indicator path.
+ indicator_path = f"{relevant_table[0].metadata.dataset.uri}/{relevant_table[0].metadata.short_name}#{trim_long_variable_name(slug_combination)}" + indicators = { + "y": indicator_path, + } + # Append the combination to results. + results.append({"dimensions": dimension_mapping, "indicators": indicators}) + + if additional_config: + # Include additional fields in all results. + for result in results: + result.update({"config": additional_config}) + + return results diff --git a/etl/paths.py b/etl/paths.py index f5a52647586..f55d81d3c71 100644 --- a/etl/paths.py +++ b/etl/paths.py @@ -23,7 +23,6 @@ # Snapshots SNAPSHOTS_DIR = BASE_DIR / "snapshots" -SNAPSHOTS_DIR_ARCHIVE = BASE_DIR / "snapshots_archive" # ETL library ETL_DIR = BASE_DIR / "etl" @@ -32,7 +31,6 @@ STEPS_MEADOW_DIR = STEPS_DATA_DIR / "meadow" STEPS_GARDEN_DIR = STEPS_DATA_DIR / "garden" STEPS_GRAPHER_DIR = STEPS_DATA_DIR / "grapher" -STEP_DIR_ARCHIVE = STEP_DIR / "archive" # Apps APPS_DIR = BASE_DIR / "apps" diff --git a/etl/scripts/anomalies/detect_anomalies.py b/etl/scripts/anomalies/detect_anomalies.py index 184b2811bdd..72422855b15 100644 --- a/etl/scripts/anomalies/detect_anomalies.py +++ b/etl/scripts/anomalies/detect_anomalies.py @@ -87,7 +87,7 @@ def load_data_for_dataset_id(dataset_id: int) -> Tuple[pd.DataFrame, List[gm.Var # log.info(f"Loading data from local ETL file: {etl_file}") # ds_etl = catalog.Dataset(etl_file) # if ds_etl.table_names == [ds.shortName]: - # df = pd.DataFrame(ds_etl.read_table(ds.shortName)) # type: ignore + # df = pd.DataFrame(ds_etl.read(ds.shortName)) # type: ignore # # Change column names to variable ids. # df = df.rename(columns={column: ds_variable_ids[column] for column in df.columns if column in ds_variable_ids}, errors="raise").rename(columns={"country": "entity_name"}, errors="raise") diff --git a/etl/scripts/archive/run_all_snapshots/run_all_snapshots.py b/etl/scripts/archive/run_all_snapshots/run_all_snapshots.py index 859786f0249..bc1c25ff2f8 100644 --- a/etl/scripts/archive/run_all_snapshots/run_all_snapshots.py +++ b/etl/scripts/archive/run_all_snapshots/run_all_snapshots.py @@ -58,7 +58,7 @@ def main(): log.info(f"Skipping {snapshot_script} because it does not have --upload flag.") continue # Skip scripts that require the use of a local file. 
- if "--path-to-file" in snapshot_text: + if ("--path-to-file" in snapshot_text) or ("-f " in snapshot_text): log.info(f"Skipping {snapshot_script} because it requires a local file.") continue diff --git a/etl/snapshot.py b/etl/snapshot.py index b27cf7755ed..6dea1c71181 100644 --- a/etl/snapshot.py +++ b/etl/snapshot.py @@ -4,17 +4,17 @@ import tempfile from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, Iterator, Optional, Union +from typing import Any, Iterator, Optional, Union, cast import owid.catalog.processing as pr import pandas as pd import structlog import yaml -from dataclasses_json import dataclass_json from owid.catalog import Table, s3_utils from owid.catalog.meta import ( DatasetMeta, License, + MetaBase, Origin, Source, TableMeta, @@ -22,6 +22,7 @@ ) from owid.datautils import dataframes from owid.datautils.io import decompress_file +from owid.repack import to_safe_types from owid.walden import files from etl import config, paths @@ -59,11 +60,7 @@ def path(self) -> Path: @property def metadata_path(self) -> Path: """Path to metadata file.""" - archive_path = Path(f"{paths.SNAPSHOTS_DIR_ARCHIVE / self.uri}.dvc") - if archive_path.exists(): - return archive_path - else: - return Path(f"{paths.SNAPSHOTS_DIR / self.uri}.dvc") + return Path(f"{paths.SNAPSHOTS_DIR / self.uri}.dvc") def _download_dvc_file(self, md5: str) -> None: """Download file from remote to self.path.""" @@ -287,9 +284,8 @@ def read_in_archive(self, filename: str, *args, **kwargs) -> Table: @pruned_json -@dataclass_json @dataclass -class SnapshotMeta: +class SnapshotMeta(MetaBase): # how we identify the dataset, determined automatically from snapshot path namespace: str # a short source name (usually institution name) version: str # date, `latest` or year (discouraged) @@ -342,7 +338,7 @@ def to_yaml(self) -> str: return yaml_dump({"meta": d}) # type: ignore - def save(self) -> None: + def save(self) -> None: # type: ignore self.path.parent.mkdir(exist_ok=True, parents=True) with open(self.path, "w") as f: f.write(self.to_yaml()) @@ -408,13 +404,6 @@ def md5(self) -> str: assert len(self.outs) == 1 return self.outs[0]["md5"] - def to_dict(self) -> Dict[str, Any]: - ... - - @staticmethod - def from_dict(d: Dict[str, Any]) -> "SnapshotMeta": - ... - def fill_from_backport_snapshot(self, snap_config_path: Path) -> None: """Load metadat from backported snapshot. 
@@ -492,6 +481,7 @@ def read_table_from_snapshot( table_metadata: TableMeta, snapshot_origin: Union[Origin, None], file_extension: str, + safe_types: bool = True, *args, **kwargs, ) -> Table: @@ -508,24 +498,29 @@ def read_table_from_snapshot( } # Read table if file_extension == "csv": - return pr.read_csv(*args, **kwargs) + tb = pr.read_csv(*args, **kwargs) elif file_extension == "feather": - return pr.read_feather(*args, **kwargs) + tb = pr.read_feather(*args, **kwargs) elif file_extension in ["xlsx", "xls", "xlsm", "xlsb", "odf", "ods", "odt"]: - return pr.read_excel(*args, **kwargs) + tb = pr.read_excel(*args, **kwargs) elif file_extension == "json": - return pr.read_json(*args, **kwargs) + tb = pr.read_json(*args, **kwargs) elif file_extension == "dta": - return pr.read_stata(*args, **kwargs) + tb = pr.read_stata(*args, **kwargs) elif file_extension == "rds": - return pr.read_rds(*args, **kwargs) + tb = pr.read_rds(*args, **kwargs) elif file_extension == "rda": - return pr.read_rda(*args, **kwargs) + tb = pr.read_rda(*args, **kwargs) elif file_extension == "parquet": - return pr.read_parquet(*args, **kwargs) + tb = pr.read_parquet(*args, **kwargs) else: raise ValueError(f"Unknown extension {file_extension}") + if safe_types: + tb = cast(Table, to_safe_types(tb)) + + return tb + def add_snapshot( uri: str, diff --git a/etl/steps/__init__.py b/etl/steps/__init__.py index 59417ec7ea4..9be0e7c4132 100644 --- a/etl/steps/__init__.py +++ b/etl/steps/__init__.py @@ -29,6 +29,7 @@ from owid import catalog from owid.catalog import s3_utils from owid.catalog.catalogs import OWID_CATALOG_URI +from owid.catalog.datasets import DEFAULT_FORMATS from owid.walden import CATALOG as WALDEN_CATALOG from owid.walden import Catalog as WaldenCatalog from owid.walden import Dataset as WaldenDataset @@ -662,8 +663,18 @@ def _download_dataset_from_catalog(self) -> bool: if self.checksum_output() != ds_meta["source_checksum"]: return False + # only download DEFAULT_FORMATS + include = [".meta.json"] + [f".{format}" for format in DEFAULT_FORMATS] + r2 = s3_utils.connect_r2_cached() - s3_utils.download_s3_folder(f"s3://owid-catalog/{self.path}", self._dest_dir, client=r2, ignore="index.json") + s3_utils.download_s3_folder( + f"s3://owid-catalog/{self.path}/", + self._dest_dir, + client=r2, + include=include, + exclude=["index.json"], + delete=True, + ) """download files over HTTPS, the problem is that we don't have a list of tables to download in index.json diff --git a/etl/steps/archive/explorers/agriculture/2023-05-26/crop_yields.py b/etl/steps/archive/explorers/agriculture/2023-05-26/crop_yields.py deleted file mode 100644 index b4e4a2d3f0b..00000000000 --- a/etl/steps/archive/explorers/agriculture/2023-05-26/crop_yields.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Load a garden dataset and create an explorers dataset. - -The output csv file will feed our Crop Yields explorer: -https://ourworldindata.org/explorers/crop-yields -""" - -from typing import cast - -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load garden dataset. - ds_garden = cast(Dataset, paths.load_dependency("attainable_yields")) - - # Read table from garden dataset. - tb_garden = ds_garden["attainable_yields"] - - # Rename table to have the same name as the current step, for consistency. 
- tb_garden.metadata.short_name = paths.short_name - - # Create explorer dataset, with garden table and metadata in csv format - ds_explorer = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata, formats=["csv"]) - ds_explorer.save() diff --git a/etl/steps/archive/explorers/agriculture/2023-05-30/crop_yields.py b/etl/steps/archive/explorers/agriculture/2023-05-30/crop_yields.py deleted file mode 100644 index b4e4a2d3f0b..00000000000 --- a/etl/steps/archive/explorers/agriculture/2023-05-30/crop_yields.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Load a garden dataset and create an explorers dataset. - -The output csv file will feed our Crop Yields explorer: -https://ourworldindata.org/explorers/crop-yields -""" - -from typing import cast - -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load garden dataset. - ds_garden = cast(Dataset, paths.load_dependency("attainable_yields")) - - # Read table from garden dataset. - tb_garden = ds_garden["attainable_yields"] - - # Rename table to have the same name as the current step, for consistency. - tb_garden.metadata.short_name = paths.short_name - - # Create explorer dataset, with garden table and metadata in csv format - ds_explorer = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata, formats=["csv"]) - ds_explorer.save() diff --git a/etl/steps/archive/explorers/emdat/2022-12-07/natural_disasters.py b/etl/steps/archive/explorers/emdat/2022-12-07/natural_disasters.py deleted file mode 100644 index 3c2fcb17b0c..00000000000 --- a/etl/steps/archive/explorers/emdat/2022-12-07/natural_disasters.py +++ /dev/null @@ -1,110 +0,0 @@ -"""Natural disasters explorer data step. - -Loads the latest EM-DAT natural_disasters data from garden and stores a table (as a csv file) for yearly data, and -another for decadal data. - -NOTES: -* Some of the columns in the output files are not used by the explorer (but they appear in the "Sort by" dropdown menu), - consider removing them. For now, we'll ensure all of the old columns are present, to avoid any possible issues. -* Most charts in the explorer are generated from the data in the files, but 3 of them are directly linked to grapher - charts, namely: - "All disasters (by type) - Deaths - Decadal average - false" - "All disasters (by type) - Deaths - Decadal average - true" - "All disasters (by type) - Economic damages (% GDP) - Decadal average - false" - At some point it would be good to let the explorer take all the data from files. - -""" - -from copy import deepcopy - -from owid import catalog - -from etl.helpers import PathFinder -from etl.paths import DATA_DIR - -N = PathFinder(__file__) - -# Mapping of old to new disaster type names. -DISASTER_TYPE_RENAMING = { - "all_disasters": "all_disasters", - "drought": "drought", - "earthquake": "earthquake", - "extreme_temperature": "temperature", - "flood": "flood", - "fog": "fog", - "glacial_lake_outburst": "glacial_lake", - "landslide": "landslide", - "dry_mass_movement": "mass_movement", - "extreme_weather": "storm", - "volcanic_activity": "volcanic", - "wildfire": "wildfire", -} - - -def create_wide_tables(table: catalog.Table) -> catalog.Table: - """Convert input table from long to wide format, and adjust column names to adjust to the old names in the files - used by the explorer. 
- """ - # Adapt disaster type names to match those in the old explorer files. - table = table.reset_index() - table["type"] = table["type"].replace(DISASTER_TYPE_RENAMING) - - # Create wide dataframes. - table_wide = table.pivot(index=["country", "year"], columns="type") - - # Flatten column indexes and rename columns to match the old names in explorer. - table_wide.columns = [ - f"{column}_{subcolumn}".replace("per_100k_people", "rate_per_100k") - .replace("total_dead", "deaths") - .replace("total_damages_per_gdp", "total_damages_pct_gdp") - for column, subcolumn in table_wide.columns - ] - - # Remove unnecessary columns. - table_wide = table_wide[ - [ - column - for column in table_wide.columns - if not column.startswith( - ("gdp_", "n_events_", "population_", "insured_damages_per_gdp", "reconstruction_costs_per_gdp_") - ) - if column - not in [ - "affected_rate_per_100k_glacial_lake", - "homeless_rate_per_100k_glacial_lake", - "total_damages_pct_gdp_fog", - ] - ] - ] - - # Adapt table to the format for explorer files. - table_wide = table_wide.reset_index() - - # Set an appropriate index and sort conveniently. - table_wide = table_wide.set_index(["country", "year"], verify_integrity=True).sort_index() - - return table_wide - - -def run(dest_dir: str) -> None: - # Load the latest dataset from garden. - dataset_garden_latest_dir = sorted((DATA_DIR / "garden" / "emdat").glob("*/natural_disasters"))[-1] - dataset_garden = catalog.Dataset(dataset_garden_latest_dir) - - # Load tables with yearly and decadal data. - table_yearly = dataset_garden["natural_disasters_yearly"] - table_decade = dataset_garden["natural_disasters_decadal"] - - # Create wide tables adapted to the old format in explorers. - table_yearly_wide = create_wide_tables(table=table_yearly) - table_decade_wide = create_wide_tables(table=table_decade) - - # Initialize a new grapher dataset and add dataset metadata. - dataset = catalog.Dataset.create_empty(dest_dir) - dataset.metadata = deepcopy(dataset_garden.metadata) - dataset.metadata.version = N.version - dataset.save() - - # Add tables to dataset. Force publication in csv. - dataset.add(table_yearly_wide, formats=["csv"]) - dataset.add(table_decade_wide, formats=["csv"]) diff --git a/etl/steps/archive/explorers/faostat/2023-02-22/food_explorer.py b/etl/steps/archive/explorers/faostat/2023-02-22/food_explorer.py deleted file mode 100644 index e4097980325..00000000000 --- a/etl/steps/archive/explorers/faostat/2023-02-22/food_explorer.py +++ /dev/null @@ -1,117 +0,0 @@ -"""Food explorer data step. - -Loads the faostat_food_explorer dataset from garden and stores a table (as a csv file) for each food product. - -""" - -import sys -from typing import List - -from owid.catalog import Dataset, Table, utils -from tqdm.auto import tqdm - -from etl.helpers import PathFinder, create_dataset - -paths = PathFinder(__file__) - -# Rename columns to be used by the food explorer. -# Note: Include here all columns, even if the name is not changed. 
-EXPECTED_COLUMNS = { - "population": "population", - "area_harvested__hectares": "area_harvested__ha", - "area_harvested__hectares_per_capita": "area_harvested__ha__per_capita", - "domestic_supply__tonnes": "domestic_supply__tonnes", - "domestic_supply__tonnes_per_capita": "domestic_supply__tonnes__per_capita", - "exports__tonnes": "exports__tonnes", - "exports__tonnes_per_capita": "exports__tonnes__per_capita", - "feed__tonnes": "feed__tonnes", - "feed__tonnes_per_capita": "feed__tonnes__per_capita", - "food__tonnes": "food__tonnes", - "food__tonnes_per_capita": "food__tonnes__per_capita", - "food_available_for_consumption__grams_of_fat_per_day_per_capita": "food_available_for_consumption__fat_g_per_day__per_capita", - "food_available_for_consumption__kilocalories_per_day_per_capita": "food_available_for_consumption__kcal_per_day__per_capita", - "food_available_for_consumption__kilograms_per_year_per_capita": "food_available_for_consumption__kg_per_year__per_capita", - "food_available_for_consumption__grams_of_protein_per_day_per_capita": "food_available_for_consumption__protein_g_per_day__per_capita", - "imports__tonnes": "imports__tonnes", - "imports__tonnes_per_capita": "imports__tonnes__per_capita", - "other_uses__tonnes": "other_uses__tonnes", - "other_uses__tonnes_per_capita": "other_uses__tonnes__per_capita", - "producing_or_slaughtered_animals__animals": "producing_or_slaughtered_animals__animals", - "producing_or_slaughtered_animals__animals_per_capita": "producing_or_slaughtered_animals__animals__per_capita", - "production__tonnes": "production__tonnes", - "production__tonnes_per_capita": "production__tonnes__per_capita", - "waste_in_supply_chain__tonnes": "waste_in_supply_chain__tonnes", - "waste_in_supply_chain__tonnes_per_capita": "waste_in_supply_chain__tonnes__per_capita", - "yield__kilograms_per_animal": "yield__kg_per_animal", - "yield__tonnes_per_hectare": "yield__tonnes_per_ha", -} - - -def create_table_for_each_product(tb_garden: Table) -> List[Table]: - """Create a list of tables, one for each product found in a garden table. - - Parameters - ---------- - tb_garden : Table - Table of products from garden dataset. - - Returns - ------- - tables : List[Table] - List of tables, one for each product. - - """ - # List all products in table - products = sorted(tb_garden.index.get_level_values("product").unique().tolist()) - - tables = [] - for product in tqdm(products, file=sys.stdout): - # Save a table for each food product. - table_product = tb_garden.loc[product].copy() - - # Update table metadata. - table_product.title = product - - # Rename columns, select the required ones, and sort columns and rows conveniently. - table_product = table_product[list(EXPECTED_COLUMNS)].rename(columns=EXPECTED_COLUMNS) - table_product = table_product[ - ["population"] + [column for column in sorted(table_product.columns) if column not in ["population"]] - ] - table_product = table_product.sort_index() - - table_product.metadata.short_name = ( - utils.underscore(name=product, validate=True).replace("__", "_").replace("_e_g_", "_eg_") - ) - - # Add table to list of all tables to include in the explorers dataset. - tables.append(table_product) - - return tables - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load the dataset for FAOSTAT food explorer from garden. - ds_garden: Dataset = paths.load_dependency("faostat_food_explorer") - - # Get the table of all food products. - tb_garden = ds_garden["faostat_food_explorer"] - - # - # Process data. 
- # - tables = create_table_for_each_product(tb_garden=tb_garden) - - # - # Save outputs. - # - # Initialize new explorers dataset. - ds_explorers = create_dataset( - dest_dir=dest_dir, tables=tables, default_metadata=ds_garden.metadata, formats=["csv"] - ) - ds_explorers.metadata.short_name = "food_explorer" - - # Create new explorers dataset. - ds_explorers.save() diff --git a/etl/steps/archive/explorers/owid/2021/food_explorer.elements.std.csv b/etl/steps/archive/explorers/owid/2021/food_explorer.elements.std.csv deleted file mode 100644 index 5af29a1a588..00000000000 --- a/etl/steps/archive/explorers/owid/2021/food_explorer.elements.std.csv +++ /dev/null @@ -1,33 +0,0 @@ -code,name,unit,unit_description,number_occurrences,Dataset,name_standardised,unit_name_standardised_with_conversion,unit_factor -5312,Area harvested,ha,hectares,539828,QCL,Area harvested,ha,1 -5301,Domestic supply quantity,1000 tonnes,thousand tonnes,1043347,FBSC,Domestic supply,tonnes,1.00E+03 -5911,Export Quantity,1000 tonnes,thousand tonnes,842139,FBSC,Exports,tonnes,1.00E+03 -684,Fat supply quantity (g/capita/day),g/capita/day,grams per capita per day,866317,FBSC,Food available for consumption,fat_g_per_day_per_capita,1 -5521,Feed,1000 tonnes,thousand tonnes,219816,FBSC,Feed,tonnes,1.00E+03 -5142,Food,1000 tonnes,thousand tonnes,966295,FBSC,Food,tonnes,1.00E+03 -664,Food supply (kcal/capita/day),kcal/capita/day,kilocalorie per capita per day,1005702,FBSC,Food available for consumption,kcal_per_day_per_capita,1 -645,Food supply quantity (kg/capita/yr),kg,kilograms,966185,FBSC,Food available for consumption,kg_per_capita_per_year,1 -5611,Import Quantity,1000 tonnes,thousand tonnes,1008063,FBSC,Imports,tonnes,1.00E+03 -5313,Laying,1000 Head,thousand head,23081,QCL,,, -5123,Losses,1000 tonnes,thousand tonnes,339465,FBSC,Waste in Supply Chain,tonnes,1.00E+03 -5154,Other uses (non-food),1000 tonnes,thousand tonnes,418035,FBSC,Other uses,tonnes,1.00E+03 -5131,Processing,1000 tonnes,thousand tonnes,199001,FBSC,,, -5321,Producing Animals/Slaughtered,1000 Head,thousand head,35354,QCL,Producing or slaughtered animals,animals,1000 -5320,Producing Animals/Slaughtered,Head,head,149439,QCL,Producing or slaughtered animals,animals,1 -5513,Production,1000 No,thousand number,13174,QCL,,, -5510,Production,tonnes,tonnes,996973,QCL,Production,tonnes,1 -5511,Production,1000 tonnes,thousand tonnes,683798,FBSC,,, -674,Protein supply quantity (g/capita/day),g/capita/day,grams per capita per day,868087,FBSC,Food available for consumption,protein_g_per_day_per_capita,1 -5170,Residuals,1000 tonnes,thousand tonnes,89573,FBSC,,, -5527,Seed,1000 tonnes,thousand tonnes,159077,FBSC,,, -5072,Stock Variation,1000 tonnes,thousand tonnes,552014,FBSC,,, -5114,Stocks,No,number,6085,QCL,,, -5112,Stocks,1000 Head,thousand head,36311,QCL,,, -5111,Stocks,Head,head,86112,QCL,,, -5171,Tourist consumption,1000 tonnes,thousand tonnes,19516,FBSC,,, -5422,Yield,hg,hectogram,8857,QCL,,, -5410,Yield,100mg/An,100 milligrams per animal,23088,QCL,Yield,kg_per_animal,1.00E-04 -5420,Yield,hg/An,hectograms per animal,63705,QCL,Yield,kg_per_animal,1.00E-01 -5419,Yield,hg/ha,hectograms per hectare,534847,QCL,Yield,tonnes_per_ha,1.00E-04 -5424,Yield/Carcass Weight,0.1g/An,0.1 grams per animal,34674,QCL,Yield,kg_per_animal,1.00E-04 -5417,Yield/Carcass Weight,hg/An,hectograms per animal,69432,QCL,Yield,kg_per_animal,1.00E-01 \ No newline at end of file diff --git a/etl/steps/archive/explorers/owid/2021/food_explorer.items.std.csv 
b/etl/steps/archive/explorers/owid/2021/food_explorer.items.std.csv deleted file mode 100644 index 2b89f2b3157..00000000000 --- a/etl/steps/archive/explorers/owid/2021/food_explorer.items.std.csv +++ /dev/null @@ -1,428 +0,0 @@ -code,name,type,number_occurences,dataset,name_standardised,Description -800,Agave fibres nes,,484,QCL,,"Including inter alia: Haiti hemp (Agave foetida); henequen (A. fourcroydes); ixtle, tampico (A. lecheguilla); maguey (A. cantala); pita (A. americana); Salvador hemp (A. letonae). See 789. The leaves of some agave varieties are used for the production of alcoholic beverages, such as aquamiel, mezcal, pulque and tequila." -2659,"Alcohol, Non-Food",,8710,FBSC,,Default composition: 632 Alcohol non food -2924,Alcoholic Beverages,Group,9513,FBSC,, -221,"Almonds, with shell",,2450,QCL,Almonds,"Prunus amygdalus; P. communis; Amygdalus communis. Produced mainly in Mediterranean countries, the United States and Asia." -2946,Animal fats,Group,9528,FBSC,, -2941,Animal Products,Group,9513,FBSC,, -711,"Anise, badian, fennel, coriander",,2250,QCL,Herbs (e.g. fennel),"Include: anise (Pimpinella anisum); badian or star anise (Illicium verum); caraway (Carum carvi); coriander (Coriandrum sativum); cumin (Cuminum cyminum); fennel (Foeniculum vulgare); juniper berries (Juniperus communis). Seeds and berries from the various plants listed. They are normally used as spices, but also have industrial (e.g. in distilleries) and medicinal applications." -515,Apples,,4779,QCL,Apples,Malus pumila; M. sylvestris; M. communis; Pyrus malus. -2617,Apples and products,,9518,FBSC,Apples,"Default composition: 515 Apples, 518 Juice, apple, single strength, 519 Juice, apple, concentrated" -526,Apricots,,3550,QCL,Apricots,Prunus armeniaca. -2769,"Aquatic Animals, Others",,7465,FBSC,,"Default composition: 1587 Aqutc Anim F, 1588 Aq A Cured, 1589 Aquatic Animals Meals, 1590 Aq A Prep Ns" -2775,Aquatic Plants,,8863,FBSC,,"Default composition: 1594 Aquatic plants, fresh, 1595 Aquatic plants, dried, 1596 Aquatic plants, other preparations" -2961,"Aquatic Products, Other",Group,9078,FBSC,, -226,Areca nuts,,745,QCL,Areca nuts,"Areca, betel nut (Areca catechu). Produced mainly in the Far East. Areca nuts are used mainly as masticatory. These nuts contain alkaloids (arecoline and arecaidine)." -366,Artichokes,,1816,QCL,Artichokes,Cynara scolymus. -367,Asparagus,,2427,QCL,Asparagus,Asparagus officinalis. -1107,Asses,,6657,QCL,Asses,Equus asinus. See 866. -572,Avocados,,3967,QCL,Avocados,Persea americana. -203,Bambara beans,,361,QCL,,"Bambara groundnut, earth pea (Voandzeia subterranea). These beand are grown underground in a similar way to groundnuts." -486,Bananas,,7472,QCL,Bananas,Musa sapientum; M. cavendishii; M. nana. Bananas are normally eaten raw. Trade figures may include dried bananas. Data should be reported excluding the weight of the central stalk. -2615,Bananas,,9255,FBSC,Bananas,Default composition: 486 Bananas -44,Barley,,5273,QCL,Barley,"Hordeum spp.: two-row barley (H. disticum) six-row barley (H. hexasticum) four-row barley (H. vulgare). Tolerates poorer soils and lower temperatures better than does wheat. Varieties include with husk and without (naked). Used as a livestock feed, for malt and for preparing foods. The roasted grains are a coffee substitute." 
-2513,Barley and products,,9465,FBSC,Barley,"Default composition: 44 Barley, 45 Barley, pot, 46 Barley, pearled, 47 Bran, barley, 48 Flour, barley and grits, 49 Malt, 50 Malt extract" -782,"Bastfibres, other",,1266,QCL,,"Including inter alia: China jute (Abutilon avicennae); Congo jute, malva, paka (Urena lobata; U. sinuata); Indian flax (Abroma augusta); kenaf, meshta (Hibiscus cannabinus); rosella hemp (H. sabdariffa); sunn hemp (Crotalaria juncea). This definition covers all textile fibres extracted from the stems of dicotyledonous plants, o/t flax, ramie, true hemp and true jute. For trade coverage see 780." -2546,Beans,,9234,FBSC,"Beans, dry","Default composition: 176 Beans, dry" -176,"Beans, dry",,6639,QCL,"Beans, dry","Phaseolus spp.: kidney, haricot bean (Ph. vulgaris); lima, butter bean (Ph. lunatus); adzuki bean (Ph. angularis); mungo bean, golden, green gram (Ph. aureus); black gram, urd (Ph. mungo); scarlet runner bean (Ph. coccineus); rice bean (Ph. calcaratus); moth bean (Ph. aconitifolius); tepary bean (Ph. acutifolius). Only species of Phaseolus should be included, though several countries also include certain types of beans. Commonly classified as Vigna (angularis, mungo, radiata, aconitifolia). In the past, these species were also classified as Phaseolus." -414,"Beans, green",,5815,QCL,"Beans, green",Phaseolus and Vigna spp.. For shelling. -1806,Beef and Buffalo Meat,Group,10735,QCL,"Meat, beef and buffalo","The term ""LIVESTOCK"" is used in a broad sense to cover all grown animals regardless of age, location or purpose of breeding. Non-domesticated animals are excluded under this definition unless they are kept or raised in captivity. Domestic animals included are large and small quadrupeds, poultry, insects (bees) and larvae of insects (silkworms). Figures on livestock numbers should refer to live animals enumerated on a given day or on several consecutive days. The FAO practice is that figures for an indicated year relate to animals reported by countries for any day between October of the previous year and September of the year indicated. Statistics on live animals by age, sex and utilization generally are not included in the list that follows, even though such breakdowns are extremely desirable in terms of national statistics. For each animal species FAO proposes that information be maintained on changes in national herds during the year according to the following equation: initial herd + animals born + imports of live animals - exports of live animals - natural losses - slaughter = closing herd.FAO defines meat as the flesh of animals used for food. In production data, meat is normally reported inclusive of bone and exclusive of meat that is unfit for human consumption. As reported by individual countries, meat production data may refer either to commercial production (meat entering marketing channels), inspected production (from animals slaughtered under sanitary inspection), or total production (the total of the above- mentioned categories plus slaughter for personal consumption). All FAO annual production data refer to total production.Country statistics on meat production adhere to one or more of the following concepts: 1. Live weight: the weight of the animal immediately before slaughter. 2. Killed weight: the live weight less the uncollected blood lost during slaughter. 3. Dressed carcass weight: weight minus all parts - edible and inedible - that are removed in dressing the carcass. 
The concept varies widely from country to country and according to the various species of livestock. Edible parts generally include edible offals (head or head meat, tongue, brains, heart, liver, spleen, stomach or tripes and, in a few countries, other parts such as feet, throat and lungs. Slaughter fats (the unrendered fats that fall in the course of dressing the carcasses) are recorded as either edible or inedible according to country practice. Inedible parts generally include hides and skins (except in the case of pigs), as well as hoofs and stomach contents.Among individual countries, one of the following three concepts issued to measure production:A. Production from all animals, of both indigenous and foreign origin, that are slaughtered within national boundaries. B. Production from the slaughter of indigenous animals plus exports of live indigenous animals during the reference period. Derived from meat production as follows: production from slaughtered animals plus the meat equivalent of all animals exported alive, minus the meat equivalent of all animals imported alive. As imports/exports of live animals are recorded by FAO in numbers, not weight, animal type and size are of significance. C. The biological production concept covers indigenous animals that are either slaughtered or exported live, plus net additions to the stock during the reference period. Derived from indigenous productions follows: indigenous production plus (or minus) the meat equivalent of the change in the stock numbers during the reference period. Production is expressed in terms of live weight. Changes in the total live weight of all animals are not taken into account.FAO uses the first concept of meat production in the construction of its food balance sheets and for related indicators. The second concept, indigenous meat production, in measuring the output of the national livestock sector, is useful mainly in the construction of index numbers of agricultural production. The third concept, biological production, would be the most complete as it also reflects changes in the livestock herd, but it is not used because of difficulties in obtaining information from national reporting offices. The prices applied to indigenous meat production are derived from prices of live animals. This covers not only the value of meat, but also the value of offals, fats, hides and skins.Beef and Veal, Dried, Salted, Smok Meat of bovine animals, whether salted, in brine, dried or smoked. Includes edible flours and meals.Beef and Veal Preparations nes Meat and offal (o/t liver) that are boiled, steamed, grilled, fried, roasted or otherwise cooked. Includes prepared meals that contain more than 20% of meat and offal by weight.BUFFALO MEAT Fresh, chilled or frozen, with bone in or boneless.OFFALS OF BUFFALO, EDIBLE Fresh, chilled or frozen." -1181,Beehives,,6290,QCL,Beehives,Apis mellifica; A. dorsata; A. florea; A. indica. A beehive is an artificial habitation for bees. -2656,Beer,,9513,FBSC,,Default composition: 51 Beer of barley -51,Beer of barley,,8619,QCL,,"Beverage that may be alcoholic or non-alcoholic that is made from fermented malted cereals (mainly barley), water and hops. Non-malted cereals may also be used. The FAO definition differs from the main international classifications in that it includes non-alcoholic beer." -1183,Beeswax,,3089,QCL,Beeswax,"The substance bees use to build the hexagonal cells of the combs of beehives. Includes other insect waxes as well, i.e. lac and Chinese wax." 
-558,Berries nes,,3012,QCL,,"Including inter alia: blackberry (Morus nigra); loganberry; white, red mulberry (M. alba; M. rubra); myrtle berry (Myrtus communis) huckleberry, dangleberry (Gaylussacia spp.). Other berries not separately identified. In some countries, some or all of the berries listed previously are reported under this general category." -2658,"Beverages, Alcoholic",,9518,FBSC,,"Default composition: 634 Beverages, distilled alcoholic" -2657,"Beverages, Fermented",,9040,FBSC,,"Default composition: 26 Beverages, fermented wheat, 39 Beverages, fermented rice, 66 Beer of maize, 82 Beer of millet, 86 Beer of sorghum, 517 Cider etc" -552,Blueberries,,1464,QCL,Blueberries,"European blueberry, wild bilberry, whortleberry (Vaccinium myrtillus); American blueberry (V. corymbosum). Trade data may include cranberries, myrtle berries and other fruits of the genus Vaccinium." -2731,Bovine Meat,,9518,FBSC,"Meat, beef","Default composition: 867 Meat, cattle, 870 Meat, cattle, boneless (beef & veal), 872 Meat, beef, dried, salted, smoked, 873 Meat, extracts, 874 Meat, beef and veal sausages, 875 Meat, beef, preparations, 876 Meat, beef, canned, 877 Meat, homogenized preparations, 947 Meat, buffalo" -216,"Brazil nuts, with shell",,306,QCL,"Brazil nuts, with shell","Brazil, Para or cream nut (Bertholletia excelsa)." -181,"Broad beans, horse beans, dry",,3285,QCL,Broad beans,Vicia faba: horse-bean (var. equina); broad bean (var. major); field bean (var. minor). -89,Buckwheat,,1397,QCL,Buckwheat,"Fagopyrum esculentum (Polygonaceae). A minor cereal cultivated primarily in northern regions. Buckwheat is considered a cereal, although it does not belong to the gramineous family." -946,Buffaloes,,2367,QCL,Buffaloes,"Indian, Asiatic, pigmy, water buffalo (Bubalus bubalus; B. arnee; B. depressicornis); African buffalo (genus Syncerus); American bison (Bison bison); European bison (Bison bonasus); beeffalo (cross between a bison and a domesticated beef animal). See 866. Excludes wild bisons and buffaloes." -1811,Butter and Ghee,Group,6163,QCL,Butter and Ghee, -983,"Butter and ghee, sheep milk",,638,QCL,, -952,"Butter, buffalo milk",,351,QCL,, -886,"Butter, cow milk",,5757,QCL,,Emulsion of milk fat and water that is obtained by churning cream. Trade data cover butter from the milk of any animal. -2740,"Butter, Ghee",,9518,FBSC,,"Default composition: 886 Butter, cow milk, 887 Ghee, butteroil of cow milk, 952 Butter, buffalo milk, 953 Ghee, of buffalo milk, 983 Butter and ghee, sheep milk, 1022 Butter of goat mlk" -1022,"Butter, goat milk",,29,QCL,, -358,Cabbages and other brassicas,,7873,QCL,Cabbages,"Chinese, mustard cabbage, pak-choi (Brassica chinensis); white, red, savoy cabbage, Brussels sprouts, collards, kale and kohlrabi (Brassica oleracea all var. except botrytis)." -1157,"Camelids, other",,118,QCL,,Various species of Lama: e.g. glama pacos (alpaca); peruana (llama); huanacos (guanaco>); vicugna (vicu¤a). See 866. -1126,Camels,,2391,QCL,Camels,Bactrian camel (Camelus bactrianus); Arabian camel (C. dromedarius). See 866. -101,Canary seed,,689,QCL,Canary seed,Phalaris canariensis. Minor cereal normally used as bird feed. -461,Carobs,,845,QCL,,"Ceratonia siliqua Carob-tree, locust bean. Includes also seeds. Mainly used as an animal feed and for industrial purposes. Rich in pectin." -426,Carrots and turnips,,6686,QCL,Carrots and turnips,Daucus carota. Trade data may include turnips (Brassica rapa var. rapifera). -217,"Cashew nuts, with shell",,1945,QCL,Cashew nuts,"Anacardium occidentale. 
Produced mainly in East Africa, India and Brazil." -591,Cashewapple,,236,QCL,,"Anacardium occidentale. The thickened, fleshy stem below the cashew nut. When soft it is used for jam." -125,Cassava,,5916,QCL,Cassava,"Manioc, mandioca, yuca (Manihot esculenta, syn. M. utilissima); yuca dulce (M. palmata, syn. M. dulcis). A semi-permanent crop grown in tropical and subtropical regions. Sometimes bitter and sweet cassavas are referred to as separate species, the former being M. esculenta and the latter M. palmata, but this is incorrect since the toxicity varies according to location. Cassava is the staple food in many tropical countries. It is not traded internationally in its fresh state because tubers deteriorate very rapidly." -2532,Cassava and products,,9169,FBSC,Cassava,"Default composition: 125 Cassava, 126 Flour, cassava, 127 Tapioca, cassava, 128 Cassava dried, 129 Starch, cassava" -265,Castor oil seed,,2472,QCL,Castor oil seed,"Ricinus communis. Valued mainly for their oil, which is used in pharmaceutical products. Ground seedcakes are used as fertilizers (castor oil pomace)." -866,Cattle,,10706,QCL,Cattle,"Common ox (Bos taurus); zebu, humped ox (Bos indicus); Asiatic ox (subgenus Bibos); Tibetan yak (Poephagus grunniens). Animals of the genus listed, regardless of age, sex, or purpose raised. Data are expressed in number of heads." -1746,Cattle and Buffaloes,Group,10676,QCL,Cattle and Buffaloes, -393,Cauliflowers and broccoli,,5012,QCL,Cauliflowers and broccoli,"Brassica oleracea var. botrytis, subvariety cauliflora and cymosa. Includes headed broccoli." -2766,Cephalopods,,9346,FBSC,,"Default composition: 1570 Cephlp Fresh, 1571 Cphlp Frozen, 1572 Cphlp Cured, 1573 Cphlp Canned, 1574 Cphlp Pr nes, 1575 Cphlp Meals" -2905,Cereals - Excluding Beer,Group,9513,FBSC,, -108,Cereals nes,,3110,QCL,,"Including inter alia: canagua or coaihua (Chenopodium pallidicaule); quihuicha or Inca wheat (Amaranthus caudatus); adlay or Job's tears (Coix lacryma-jobi); wild rice (Zizania aquatica). Other cereal crops that are not identified separately because of their minor relevance at the international level. Because of their limited local importance, some countries report cereals under this commodity heading that are classified individually by FAO." -2520,"Cereals, Other",,9443,FBSC,,"Default composition: 68 Popcorn, 89 Buckwheat, 90 Flour, buckwheat, 91 Bran, buckwheat, 92 Quinoa, 94 Fonio, 95 Flour, fonio, 96 Bran, fonio, 97 Triticale, 98 Flour, triticale, 99 Bran, triticale, 101 Canary seed, 103 Grain, mixed, 104 Flour, mixed grain, 105 Bran, mixed grains, 108 Cereals, nes, 111 Flour, cereals, 112 Bran, cereals nes, 113 Cereal preparations, nes" -1717,"Cereals, Total",Group,9869,QCL,Cereals,"Cereals are generally of the gramineous family and, in the FAO concept, refer to crops harvested for dry grain only. Crops harvested green for forage, silage or grazingare classified as fodder crops. Also excluded are industrial crops, e.g. broom sorghum (Crude organic materials nes) and sweet sorghum when grown for syrup (Sugar crops nes). For international trade classifications, fresh cereals (other than sweet corn), whether or not suitable for use as fresh vegetables, are classified as cereals. Cereals are identified according to their genus. However, when two or more genera are sown and harvested as a mixture they should be classified and reported as ""mixed grains"". Production data are reported in terms of clean, dry weight of grains (12-14 percent moisture) in the form usually marketed. 
Rice, however, is reported in terms of paddy. Apart from moisture content and inedible substances such as cellulose, cereal grains contain, along with traces of minerals and vitamins, carbohydrates - mainly starches - (comprising 65-75 percent of their total weight), as well as proteins (6-12 percent) and fat (1-5 percent). The FAO definitions cover 17 primary cereals, of which one - white maize - is a component of maize. Each definition is listed along with its code, botanical name or names, and a short description. Cereal products derive either from the processing of grain through one or more mechanical or chemical operations, or from the processing of flour, meal or starch. Each cereal product is listed after the cereal from which it is derived." -1745,Cheese (All Kinds),Group,5380,QCL,Cheese, -955,"Cheese, buffalo milk",,225,QCL,, -1021,"Cheese, goat milk",,1835,QCL,, -984,"Cheese, sheep milk",,1990,QCL,, -904,"Cheese, skimmed cow milk",,2255,QCL,,May include some whole milk. -901,"Cheese, whole cow milk",,4626,QCL,,Curd of milk that has been coagulated and separated from whey. May include some skimmed milk. -531,Cherries,,3196,QCL,Cherries,"Mazzard, sweet cherry (Prunus avium; Cerasus avium); hard-fleshed cherry (var. duracina); heart cherry (var. juliana)." -530,"Cherries, sour",,1542,QCL,,Prunus cerasus; Cerasus acida. -220,Chestnut,,1550,QCL,Chestnut,Castanea spp.: C. vesca; C. vulgaris; C. sativa. Produced mainly in Europe and Asia. -191,Chick peas,,3022,QCL,Chickpeas,"Chickpea, Bengal gram, garbanzos (Cicer arietinum)." -1057,Chickens,,10893,QCL,Chickens,Fowl (Gallus domesticus); Guinea fowl (Numida meleagris. Domesticated birds only. Data are expressed in thousands. -459,Chicory roots,,901,QCL,,Horium intybus; C. sativum. Unroasted chicory roots. -689,"Chillies and peppers, dry",,3835,QCL,"Chillies and peppers","Red and cayenne pepper, paprika, chillies (Capsicum frutescens; C. annuum); allspice, Jamaica pepper (Pimenta officinalis). Uncrushed or unground fresh pimentos are considered to be vegetables." -401,"Chillies and peppers, green",,6435,QCL,,"Capsicum annuum; C. fructescens; Pimenta officinalis. Production data exclude crops cultivated explicitly as spices. In contrast, trade data include these crops, provided they are fresh, uncrushed and unground." -693,Cinnamon (cannella),,661,QCL,,"Ceylon cinnamon (Cinnamomum zeylanicum); Chinese, common cinnamon, cassia (C. cassia). The inner bark of young branches of certain trees of the Laurus family. Includes cinnamon- tree flowers, cinnamon fruit and cinnamon waste (chips), whether whole, crushed or ground." -1804,"Citrus Fruit, Total",Group,7917,QCL,Citrus Fruit, -2614,"Citrus, Other",,8725,FBSC,,"Default composition: 512 Fruit, citrus nes, 513 Juice, citrus, single strength, 514 Juice, citrus, concentrated" -698,Cloves,,641,QCL,,"Eugenia caryophyllata; Caryophyllus aromaticus. The whole fruit of the clove tree, including the flowers picked before maturity and dried in the sun, and the stems of the clove flowers." -2642,Cloves,,7872,FBSC,,Default composition: 698 Cloves -2633,Cocoa Beans and products,,9518,FBSC,Cocoa beans,"Default composition: 661 Cocoa, beans, 662 Cocoa, paste, 665 Cocoa, powder & cake, 666 Chocolate products nes" -661,"Cocoa, beans",,3678,QCL,,"Theobroma cacao. The seeds contained in the fruit of the cacao- tree, including whole or broken, raw or roasted." -2578,Coconut Oil,,8946,FBSC,Coconut oil,"Default composition: 252 Oil, coconut (copra)" -249,Coconuts,,5341,QCL,,"Cocos nucifera Husked coconut. 
In shell, covered by the endocarp, while exocarp (the smooth outer skin) and mesocarp (the fibrous covering) are removed. Immature nuts contain a milky juice that is consumed as a refreshing drink. Mature nuts are consumed as such, or processed for copra or desiccated coconut. The flesh, from which copra/oil is extracted, constitutes 40-70% of the weight of the husked coconut. The oil content is about 36% of the flesh." -2560,Coconuts - Incl Copra,,9315,FBSC,Coconuts,"Default composition: 249 Coconuts, 250 Coconuts, desiccated, 251 Copra" -2630,Coffee and products,,9518,FBSC,,"Default composition: 656 Coffee, green, 657 Coffee, roasted, 659 Coffee, extracts" -656,"Coffee, green",,4939,QCL,"Coffee, green","Coffea spp. (arabica, robusta, liberica). Raw coffee in all forms." -813,Coir,,478,QCL,,Cocos nucifera. Coir fibre is obtained from the fibrous covering of the mesocarp. For trade coverage see 789. -767,Cotton lint,,5363,QCL,Cotton,"Gossypium spp. Fibres from ginning seed cotton that have not been carded or combed. Trade data also include fibres that have been cleaned, bleached, dyed or rendered absorbent." -329,Cottonseed,,5387,QCL,Cottonseed,Used for extracting oil. -2559,Cottonseed,,7644,FBSC,Cottonseed,Default composition: 329 Cottonseed -2575,Cottonseed Oil,,8535,FBSC,Cottonseed oil,"Default composition: 331 Oil, cottonseed" -195,"Cow peas, dry",,2188,QCL,Cow peas,"Cowpea, blackeye pea/bean (Vigna sinensis; Dolichos sinensis)." -554,Cranberries,,652,QCL,Cranberries,"American cranberry (Vaccinium macrocarpon); European cranberry (V. oxycoccus). Trade data may include blueberries, myrtle berries and other fruits of the genus Vaccinium." -2743,Cream,,8720,FBSC,,Default composition: 885 Cream fresh -885,Cream fresh,,3221,QCL,,That portion of milk which is rich in milk fat and is separated by skimming or centrifuging. -2765,Crustaceans,,9468,FBSC,,"Default composition: 1553 Crstaceans F, 1554 Crstc Frozen, 1555 Crstc Cured, 1556 Crstc Canned, 1557 Crstc Pr nes, 1558 Crstc Meals" -397,Cucumbers and gherkins,,6926,QCL,Cucumbers and gherkins,Cucumis sativus. -550,Currants,,1767,QCL,Currants,Black (Ribes nigrum); red and white (R. rubrum). Trade data may sometimes include gooseberries. -577,Dates,,2210,QCL,Dates,Phoenix dactylifera. Includes fresh and dried fruit. -2619,Dates,,8554,FBSC,Dates,Default composition: 577 Dates -2762,Demersal Fish,,9470,FBSC,,"Default composition: 1514 Dmrsl Fresh, 1515 Dmrsl Fz Whl, 1516 Dmrsl Fillet, 1517 Dmrsl Fz Flt, 1518 Dmrsl Cured, 1519 Dmrsl Canned, 1520 Dmrsl Pr nes, 1521 Dmrsl Meals" -1068,Ducks,,5261,QCL,Ducks,Anas spp.. See 1057. -399,Eggplants (aubergines),,4803,QCL,Eggplants,Solanum melongena. Also called aubergines. -2744,Eggs,,9518,FBSC,,"Default composition: 916 Egg albumine, 1062 Eggs, hen, in shell, 1063 Eggs, liquid, 1064 Eggs, dried, 1091 Eggs, other bird, in shell" -2949,Eggs,Group,9513,FBSC,Eggs, -1783,Eggs Primary,Group,10869,QCL,Eggs,"Eggs and Egg products. Egg production by type of poultry should refer to the total production of eggs in the shell by all types of hens in both the traditional sector (individually owned small flocks) and the modern sector (large-scale, intensive commercial poultry farms).Total productions include eggs for hatching but exclude waste of farms. Countries should report in terms of both numbers and weight.FAO lists seven egg and egg products items, including four primary and three processed products." -1062,"Eggs, hen, in shell",,10875,QCL,Eggs from hens,Weight in shell. 
-1067,"Eggs, hen, in shell (number)",,10880,QCL,Number of hen eggs, -1091,"Eggs, other bird, in shell",,2504,QCL,Eggs from other birds (excl. hens),Weight in shell. -1092,"Eggs, other bird, in shell (number)",,2458,QCL,Number of eggs from other birds, -1816,Evaporat&Condensed Milk,Group,3741,QCL,, -949,"Fat, buffaloes",,898,QCL,"Fat, buffaloes",See 869. -1129,"Fat, camels",,1071,QCL,"Fat, camels",Unrendered slaughter fats. -869,"Fat, cattle",,5581,QCL,"Fat, cattle","Unrendered slaughter fats from bovine animals, including edible and inedible fats that are removed in the course of dressing the carcass." -1019,"Fat, goats",,5041,QCL,"Fat, goats",Unrendered slaughter fats of goats. See 869. -1037,"Fat, pigs",,5132,QCL,"Fat, pigs",Unrendered slaughter fats of pigs. See 869. -979,"Fat, sheep",,5146,QCL,"Fat, sheep",Unrendered slaughter fats of sheep. See 869. -2737,"Fats, Animals, Raw",,9518,FBSC,Animal fats,"Default composition: 869 Fat, cattle, 871 Fat, cattle butcher, 949 Fat, buffaloes, 979 Fat, sheep, 994 Grease incl. lanolin wool, 1019 Fat, goats, 1037 Fat, pigs, 1040 Fat, pig butcher, 1043 Lard, 1065 Fat, poultry, 1066 Fat, poultry, rendered, 1129 Fat, camels, 1160 Fat, other camelids, 1168 Oils, fats of animal nes, 1221 Lard stearine oil, 1222 Degras, 1225 Tallow, 1243 Fat, nes, prepared" -821,Fibre crops nes,,1603,QCL,,"Including inter alia: alfa, esparto (Lygeum spartum; Stipa tenacissima); bowstring hemp (Sansevieria spp.); caroa (Neoglaziovia variegata); fuque fibre (Furcraea macrophylla); Mauritius hemp (F. gigantea); New Zealand flax (Phormium tenax); palma ixtle (Samuela carnerosana). Other fibres that are not identified separately because of their minor relevance at the international level. Because of their limited local importance, some countries report vegetable fibres under this commodity heading that are classified individually by FAO. The fibre is obtained from the leaves, stalks or fruit of the plant. In instances where the fibrous part is normally used for other purposes, data cover only those fibres intended for spinning. For trade coverage see 789." -1753,Fibre Crops Primary,Group,7215,QCL,Fibre crops,"Vegetal Fibres obtained from annual and perennial plants are used to produce thread and yarn that are manufactured into fabrics. These manufactured products are not classified as agricultural, and therefore are not included in the FAO definitions. Among vegetal fibres, FAO includes 12 primary crops and five derived agricultural products. The code and name of each of these crops is listed below, along with its botanical name, or names, and a short description where necessary. Certain fibre crops yield seeds used for sowing and for processing into oil and cake, and these are listed with oil-bearing crops and derived products." -569,Figs,,2735,QCL,,Ficus carica. -2781,"Fish, Body Oil",,8754,FBSC,,"Default composition: 1509 Frwt Bdy Oil, 1522 Dmrs Bdy Oil, 1535 Pelg Bdy Oil, 1548 Marn Bdy Oil, 1582 Aq M Oils" -2782,"Fish, Liver Oil",,8065,FBSC,,"Default composition: 1510 Frwt Lvr Oil, 1523 Demersal Liver Oils, 1536 Pelg Lvr Oil, 1549 Marine nes Liver Oils" -2960,"Fish, Seafood",Group,9513,FBSC,Fish and seafood, -773,Flax fibre and tow,,1729,QCL,Flax fibre,"Broken, scutched, hackled etc. but not spun. Traditionally, FAO has used this commodity to identify production in its raw state; in reality, the primary agricultural product is the commodity 771, which can either be used for the production of fibre or for other purposes." 
-94,Fonio,,590,QCL,,Digitaria spp.: fonio or findi (D. exilis); black fonio or hungry rice (D. iburua). A minor cereal of importance only in West Africa where it is eaten in place of rice during famines. The seeds are cooked by steaming the whole grain. -2761,Freshwater Fish,,9528,FBSC,,"Default composition: 1501 Frwtr Diad F, 1502 Frwtr Fz Whl, 1503 Frwtr Fillet, 1504 Frwtr Fz Flt, 1505 Frwtr Cured, 1506 Frwtr Canned, 1507 Frwtr Pr nes, 1508 Frwtr Meals" -1738,Fruit Primary,Group,10909,QCL,Fruit,"Fruit Crops consist of fruits and berries that, with few exceptions, are characterized by their sweet taste. Nearly all are permanent crops, mainly from trees, bushes and shrubs, as well as vines and palms. Fruits and berries grow on branches, stalks or the trunks of plants, usually singly, but sometimes grouped in bunches or clusters (e.g. bananas and grapes). Commercial crops are cultivated in plantations, but significant quantities of fruits are also collected from scattered plants that may or may not be cultivated. Although melons and watermelons are generally considered to be fruits, FAO groups them with vegetables because they are temporary crops. Fruit crops are highly perishable. Their shelf life may be extended through the application of chemical substances that inhibit the growth of micro-organisms and through careful control of the surrounding temperature, pressure and humidity once the fruit has been picked. Fruits and berries have a very high water content accounting for some 70- 90 percent of their weight. They contain, in various degrees, minerals, vitamins and organic acids, some of which reside in the peel or skin. Some fruits have a high fibre content and other inedible components, so that wastage is high, e.g. 60 percent for passion fruit and 35-45 percent for pineapples. The waste in temperate zone fruit is lower, generally of the order of 10-15 percent, while berries contain very little waste. The carbohydrate content of fruits varies widely. Protein content is very low, averaging less than 1 percent, or below that in vegetables. Fat content in fruit is negligible, with the notable exception of avocados. Fruit crops are consumed directly as food and are processed into dried fruit, fruit juice, canned fruit, frozen fruit, jam, alcoholic beverages, etc. Fruit crops are not normally grown for animal feed, although significant quantities of diseased and substandard fruits, as well as certain by-products of the fruit processing industry, are fed to animals. Production data for fruit crops should relate to fruits actually harvested. Data on bananas and plantains should relate to the weight of single bananas or banana hands, excluding the weight of the central stalk. FAO lists 36 primary fruit crops." -512,"Fruit, citrus nes",,4042,QCL,,Including inter alia: bergamot (Citrus bergamia); citron (C. medica var. cedrata); chinotto (C. myrtifolia); kumquat (Fortunella japonica). Some minor varieties of citrus are used primarily in the preparation of perfumes and soft drinks. -619,"Fruit, fresh nes",,9130,QCL,,"Including inter alia: azarole (Crataegus azarolus); babaco (Carica pentagona); elderberry (Sambucus nigra); jujube (Zizyphus jujuba); litchi (nephelium litchi); loquat (Eriobotrya japonica); medlar (Mespilus germanica); pawpaw (Asimina triloba); pomegranate (Punica granatum); prickly pear (Opuntia ficus-indica); rose hips (Rosa spp.); rowanberry (Sorbus aucuparia); service-apple (Sorbus domestica); tamarind (Tamarindus indica); tree-strawberry (Arbutus unedo). 
Other fresh fruit that are not identified separately because of their minor relevance at the international level. Because of their limited local importance, some countries report fresh fruit under this heading that are classified separately by FAO." -542,"Fruit, pome nes",,114,QCL,,"Other pome fruit not separately identified. In some countries apples, pears and quinces are reported under this general category." -541,"Fruit, stone nes",,1948,QCL,,"Other stone fruit not separately identified. In some countries, apricots, cherries, peaches, nectarines and plums are reported under this general category." -603,"Fruit, tropical fresh nes",,4448,QCL,,"Including inter alia: breadfruit (Artocarpus incisa); carambola (Averrhoa carambola); cherimoya, custard apple (Annona spp.); durian (Durio zibethinus); feijoa (Feijoa sellowiana); guava (Psidium guajava); hog plum, mombin (Spondias spp.); jackfruit (Artocarpus integrifolia); longan (nephelium longan); mammee (Mammea americana); mangosteen (Garcinia mangostana); naranjillo (Solanum quitoense); passion fruit (Passiflora edulis); rambutan (nephelium lappaceum); sapote, mamey colorado (Calocarpum mammosum); sapodilla (Achras sapota); star apple, cainito (Chrysophyllum spp.). Other tropical fresh fruit that are not identified separately because of their minor relevance at the international level. In some countries mangoes, avocados, pineapples, dates and papayas are reported under this general category." -2919,Fruits - Excluding Wine,Group,9513,FBSC,Fruit, -2625,"Fruits, Other",,9518,FBSC,,"Default composition: 521 Pears, 523 Quinces, 526 Apricots, 527 Apricots, dry, 530 Cherries, sour, 531 Cherries, 534 Peaches and nectarines, 536 Plums and sloes, 537 Plums dried (prunes), 538 Juice, plum, single strength, 539 Juice, plum, concentrated, 541 Fruit, stone nes, 542 Fruit, pome nes, 544 Strawberries, 547 Raspberries, 549 Gooseberries, 550 Currants, 552 Blueberries, 554 Cranberries, 558 Berries nes, 567 Watermelons, 568 Melons, other (inc.cantaloupes), 569 Figs, 570 Figs dried, 571 Mangoes, mangosteens, guavas, 572 Avocados, 583 Juice, mango, 587 Persimmons, 591 Cashewapple, 592 Kiwi fruit, 600 Papayas, 603 Fruit, tropical fresh nes, 604 Fruit, tropical dried nes, 619 Fruit, fresh nes, 620 Fruit, dried nes, 622 Juice, fruit nes, 623 Fruit, prepared nes, 624 Flour, fruit, 625 Fruits, nuts, peel, sugar preserved, 626 Fruit, cooked, homogenized preparations" -406,Garlic,,4984,QCL,Garlic,Allium sativum. -1072,Geese and guinea fowls,,2818,QCL,Geese and guinea fowls,Anser spp.. See 1057. -953,"Ghee, buffalo milk",,268,QCL,, -887,"Ghee, butteroil of cow milk",,1026,QCL,,Butter from which the water has been removed. Very common in hot countries. Includes also anhydrous butterfat or butter oil. -720,Ginger,,2248,QCL,,"Zingiber officinale. Rhizome of a perennial herb. It also is used for making beverages. Includes fresh, provisionally preserved or dried, whereas ginger preserved in sugar or syrup is excluded." -1016,Goats,,10331,QCL,Goats,"Capra spp.. See 866. Includes Hircus, Ibex, Nubiana, Pyrenaica, Tibetana, Kashmir and Angora." -549,Gooseberries,,940,QCL,,"Ribes grossularia. Trade data may sometimes include black, white or red currants." -103,"Grain, mixed",,1391,QCL,Mixed grains,"A mixture of cereal species that are sown and harvested together. The mixture wheat/rye is known as meslin, but in trade is usually classified with wheat." -2901,Grand Total,Group,9513,FBSC,Total, -507,Grapefruit (inc. pomelos),,4613,QCL,Grapefruit,Citrus maxima; C. grandis; C. 
paradisi. -2613,Grapefruit and products,,8854,FBSC,Grapefruit,"Default composition: 507 Grapefruit (inc. pomelos), 509 Juice, grapefruit, 510 Juice, grapefruit, concentrated" -560,Grapes,,4684,QCL,Grapes,Vitis vinifera. Includes both table and wine grapes. -2620,Grapes and products (excl wine),,9412,FBSC,,"Default composition: 560 Grapes, 561 Raisins, 562 Juice, grape, 563 Grapes, must" -2572,Groundnut Oil,,9091,FBSC,Groundnut oil,"Default composition: 244 Oil, groundnut" -2552,Groundnuts,,880,FBSC,, -2556,Groundnuts (Shelled Eq),,8407,FBSC,Groundnuts,"Default composition: 242 Groundnuts, with shell, 243 Groundnuts, shelled, 246 Groundnuts, prepared, 247 Peanut butter" -242,"Groundnuts, with shell",,6651,QCL,Groundnuts,"Arachis hypogaea. For trade data, groundnuts in shell are converted at 70% and reported on a shelled basis." -839,"Gums, natural",,106,QCL,,"Including inter alia: balata (Manilkara bidentata); ceara (Manihot glaziovii); chicle gum (Achras zapota); guayule (Parthenium argentatum); gutta-percha (Palachium gutta); jelutong (Dieva costulana). Extracted from the latex of trees of various species. Although similar to rubber in many ways, natural gums are usually less elastic." -225,"Hazelnuts, with shell",,1512,QCL,Hazelnuts,Corylus avellana. Produced mainly in Mediterranean countries and the United States. -777,Hemp tow waste,,1350,QCL,,"Cannabis sativa. This plant is cultivated for seed as well as for fibre. The fibre is obtained from the stem of the plant. Trade data include raw, retted, scutched, combed fibre, tow and waste." -336,Hempseed,,858,QCL,Hempseed,Cannabis sativa. An annual herbaceous that is cultivated for its fibre as well as its oil. In major producing countries oil is extracted from the seeds. -957,"Hides, buffalo, fresh",,1738,QCL,Buffalo hides,See 919. Both adult and young animals. -919,"Hides, cattle, fresh",,10681,QCL,Cattle hides,Green hide or skin as removed from the carcass of the animal (adult bovine). Used for production data only. -2745,Honey,,9410,FBSC,Honey,"Default composition: 1182 Honey, natural" -1182,"Honey, natural",,6957,QCL,Honey,Honey produced by bees (Apis mellifera) or by other insects. -677,Hops,,1627,QCL,,"Humulus lupulus. Hop cones, fresh or dried, whether or not ground, powdered or in the form of pellets. Includes lupuline, a yellow resinous powder that covers the hop cones. Mainly used in the brewing industry to give flavour to beer." -1096,Horses,,8974,QCL,Horses,Equus caballus. See 866. -2680,Infant food,,9358,FBSC,,Default composition: 109 Infant food -277,Jojoba seed,,59,QCL,,Simmondsia californica (syn. S. chinensis). From the shrub or small tree of the Buxaceae family. -780,Jute,,1131,QCL,Jute,"White jute (Corchorus capsularis); red jute, tossa (C. olitorius). Trade data cover raw or processed jute (but not spun), tow and waste, yarn waste and garnetted stock and may include jute-like fibres." -778,Kapok fibre,,106,QCL,,"Ceiba pentandra. This plant is cultivated for seed as well as for fibre. Trade data cover only fibres that have been crushed, carded or combed for spinning." -310,Kapok fruit,,118,QCL,,"Ceiba pentandra, Bombacaceae. The fruit of kapok contains fibre and seeds, which FAO treats as primary crops." -311,Kapokseed in shell,,106,QCL,,The soft shell is approximately 40-50% of the total weight of the nut. -263,Karite nuts (sheanuts),,439,QCL,Karite nuts,Butyrospermum parkii. Production data refer only to the nut contained in the fruit although the pulp around the nut is also edible. 
-592,Kiwi fruit,,975,QCL,Kiwi,Actinidia chinensis. -224,Kola nuts,,385,QCL,Kola nuts,"Kola, cola, Sudan cola nut (Cola nitida; C. vera; C. acuminata). Produced mainly in Africa. Kola nuts, containing 2.4 to 2.6% caffeine, are commonly chewed by the local population. Much used in Europe and America in the production of beverages." -1043,Lard,,4237,QCL,,Rendered pig fat. -407,"Leeks, other alliaceous vegetables",,2668,QCL,Leeks,"Leeks (Allium porrum); chives (A. schoenoprasum); other alliac. (Allium varieties except those of 402, 403 and 406)." -497,Lemons and limes,,6223,QCL,Lemons and limes,Lemon (Citrus limon); sour lime (C. aurantifolia); sweet lime (C. limetta). -2612,"Lemons, Limes and products",,8964,FBSC,Lemons and limes,"Default composition: 497 Lemons and limes, 498 Juice, lemon, single strength, 499 Juice, lemon, concentrated" -201,Lentils,,2874,QCL,Lentils,Lens esculenta; Ervum lens. -372,Lettuce and chicory,,5627,QCL,Lettuce,Lactuca sativa; witloof chicory (Cichorium intybus var. foliosum); endive (C. endivia var. crispa); escarole chicory (C. endivia var. latifolia). -333,Linseed,,2949,QCL,Linseed,Linum usitatissimum Flaxseed. An annual herbaceous that is cultivated for its fibre as well as its oil. -210,Lupins,,1357,QCL,,"Lupinus spp.. Used primarily for feed, though in some parts of Africa and in Latin America some varieties are cultivated for human food." -56,Maize,,8961,QCL,Maize,"Zea mays Corn, Indian corn, mealies. A grain with a high germ content. At the national level, hybrid and ordinary maize should be reported separately owing to widely different yields and uses. Used largely for animal feed and commercial starch production." -2514,Maize and products,,9518,FBSC,Maize,"Default composition: 56 Maize, 57 Germ, maize, 58 Flour, maize, 59 Bran, maize, 63 Gluten, maize, 64 Starch, maize, 846 Feed and meal, gluten" -2582,Maize Germ Oil,,8775,FBSC,Maize oil,"Default composition: 60 Oil, maize" -446,"Maize, green",,2697,QCL,Green maize,"Zea mays, particularly var. saccharata. Maize harvested green for food. Saccharata variety is commonly known as sweet corn." -571,"Mangoes, mangosteens, guavas",,5851,QCL,Mangoes,"Mangifera indica. Trade figures may include dried mangoes, guavas and mangosteens, including both fresh and dried." -809,Manila fibre (abaca),,381,QCL,,Musa textilis. The fibre is obtained from stalks of certain banana trees. For trade coverage see 789. -1242,"Margarine, short",,4731,QCL,Margarine,"Margarine is made principally from one or more hydrogenated vegetable or animal fats or oils in which is dispersed an aqueous potion containing milk products, salt, flavouring agents and other additives. Shortening is a product similar to margarine, but with a higher animal fat content. Shortening and compound fats are used primarily for baking and frying. The fat content of margarine and shortening varies from 70 to 90%." -2764,"Marine Fish, Other",,9412,FBSC,,"Default composition: 1540 Marine nes F, 1541 Marin Fz Whl, 1542 Marin Fillet, 1543 Marin Fz Flt, 1544 Marin Cured, 1545 Marin Canned, 1546 Marin Pr nes, 1547 Marin Meals" -671,Maté,,222,QCL,,Ilex paraguayensis. The dried leaves of certain shrubs of the holly family which grow in South America. Prepared in a way similar to tea. -2943,Meat,Group,9513,FBSC,"Meat, Total", -1166,Meat nes,,2858,QCL,,"Including frog legs, marine mammals, etc. Some countries includeunder this heading meats that are listed above, but which are notreported separately. Fresh, chilled or frozen." 
-2768,"Meat, Aquatic Mammals",,174,FBSC,,"Default composition: 1580 Aq M Meat, 1583 Aq M Prep Ns" -1108,"Meat, ass",,507,QCL,"Meat, ass","Fresh, chilled or frozen." -1089,"Meat, bird nes",,639,QCL,,"Fresh, chilled or frozen." -947,"Meat, buffalo",,1714,QCL,"Meat, buffalo","Fresh, chilled or frozen, with bone in or boneless." -1127,"Meat, camel",,1949,QCL,"Meat, camel","Fresh, chilled or frozen." -867,"Meat, cattle",,10735,QCL,,"Meat of bovine animals, fresh, chilled or frozen, with bone in. Commontrade names are beef and veal." -1058,"Meat, chicken",,10883,QCL,"Meat, chicken","Fresh, chilled or frozen. May include all types of poultry meat ifnational statistics do not report separate data." -1069,"Meat, duck",,4604,QCL,"Meat, duck","Fresh, chilled or frozen." -1163,"Meat, game",,3390,QCL,"Meat, game","Meat and offals of wild animals, whether fresh, chilled or frozen." -1017,"Meat, goat",,9717,QCL,"Meat, goat","Meat of goats and kids, whether fresh, chilled or frozen, with bone inor boneless." -1073,"Meat, goose and guinea fowl",,2300,QCL,"Meat, goose and guinea fowl","Fresh, chilled or frozen." -1097,"Meat, horse",,4230,QCL,"Meat, horse","Fresh, chilled or frozen." -1111,"Meat, mule",,120,QCL,"Meat, mule","Fresh, chilled or frozen." -2735,"Meat, Other",,9518,FBSC,,"Default composition: 1089 Meat, bird nes, 1097 Meat, horse, 1108 Meat, ass, 1111 Meat, mule, 1127 Meat, camel, 1141 Meat, rabbit, 1151 Meat, other rodents, 1158 Meat, other camelids, 1163 Meat, game, 1164 Meat, dried nes, 1166 Meat, nes, 1172 Meat, nes, preparations, 1176 Snails, not sea" -1158,"Meat, other camelids",,118,QCL,,"Fresh, chilled or frozen." -1151,"Meat, other rodents",,118,QCL,,"Fresh, chilled or frozen." -1035,"Meat, pig",,9926,QCL,"Meat, pig","Meat, with the bone in, of domestic or wild pigs (e.g. wild boars),whether fresh, chilled or frozen." -1808,"Meat, Poultry",Group,10883,QCL,"Meat, poultry", -1141,"Meat, rabbit",,3034,QCL,"Meat, rabbit","Fresh, chilled or frozen. May include hare meat." -977,"Meat, sheep",,9910,QCL,"Meat, lamb and mutton","Meat of sheep and lamb, whether fresh, chilled or frozen, with bone inor boneless." -1765,"Meat, Total",Group,11055,QCL,"Meat, Total","FAO defines meat as the flesh of animals used for food. In production data, meat is normally reported inclusive of bone and exclusive of meat that is unfit for human consumption. As reported by individual countries, meat production data may refer either to commercial production (meat entering marketing channels), inspected production (from animals slaughtered under sanitary inspection), or total production (the total of the above- mentioned categories plus slaughter for personal consumption). All FAO annual production data refer to total production.Country statistics on meat production adhere to one or more of the following concepts: 1. Live weight: the weight of the animal immediately before slaughter. 2. Killed weight: the live weight less the uncollected blood lost during slaughter. 3. Dressed carcass weight: weight minus all parts - edible and inedible - that are removed in dressing the carcass. The concept varies widely from country to country and according to the various species of livestock. Edible parts generally include edible offals (head or head meat, tongue, brains, heart, liver, spleen, stomach or tripes and, in a few countries, other parts such as feet, throat and lungs. 
Slaughter fats (the unrendered fats that fall in the course of dressing the carcasses) are recorded as either edible or inedible according to country practice. Inedible parts generally include hides and skins (except in the case of pigs), as well as hoofs and stomach contents. Meat production data for minor animals (poultry, rabbits, etc.) are reported in one of the following three ways: ready-to-cook weight (giblets are sometimes included and sometimes excluded); eviscerated weight (including the feet and head); or dressed weight, i.e. the live weight less the blood, feathers and skin. FAO data relate to dressed carcass weight for livestock and, wherever possible, ready-to-cook weight for poultry. Among individual countries, one of the following three concepts is used to measure production: A. Production from all animals, of both indigenous and foreign origin, that are slaughtered within national boundaries. B. Production from the slaughter of indigenous animals plus exports of live indigenous animals during the reference period. Derived from meat production as follows: production from slaughtered animals plus the meat equivalent of all animals exported alive, minus the meat equivalent of all animals imported alive. As imports/exports of live animals are recorded by FAO in numbers, not weight, animal type and size are of significance. C. The biological production concept covers indigenous animals that are either slaughtered or exported live, plus net additions to the stock during the reference period. Derived from indigenous production as follows: indigenous production plus (or minus) the meat equivalent of the change in the stock numbers during the reference period. Production is expressed in terms of live weight. Changes in the total live weight of all animals are not taken into account. FAO uses the first concept of meat production in the construction of its food balance sheets and for related indicators. The second concept, indigenous meat production, which measures the output of the national livestock sector, is useful mainly in the construction of index numbers of agricultural production. The third concept, biological production, would be the most complete as it also reflects changes in the livestock herd, but it is not used because of difficulties in obtaining information from national reporting offices. The prices applied to indigenous meat production are derived from prices of live animals. This covers not only the value of meat, but also the value of offals, fats, hides and skins." -1080,"Meat, turkey",,3753,QCL,"Meat, turkey","Fresh, chilled or frozen." -568,"Melons, other (inc.cantaloupes)",,5343,QCL,Melon,Cucumis melo. -299,Melonseed,,1088,QCL,Melonseed,Cucumis melo. Includes seeds of other Cucurbitaceae.
-2848,Milk - Excluding Butter,,9518,FBSC,,"Default composition: 882 Milk, whole fresh cow, 888 Milk, skimmed cow, 889 Milk, whole condensed, 890 Whey, condensed, 891 Yoghurt, 892 Yoghurt, concentrated or not, 893 Buttermilk, curdled, acidified milk, 894 Milk, whole evaporated, 895 Milk, skimmed evaporated, 896 Milk, skimmed condensed, 897 Milk, whole dried, 898 Milk, skimmed dried, 899 Milk, dry buttermilk, 900 Whey, dry, 901 Cheese, whole cow milk, 903 Whey, fresh, 904 Cheese, skimmed cow milk, 905 Whey, cheese, 907 Cheese, processed, 908 Milk, reconstituted, 909 Milk, products of natural constituents nes, 910 Ice cream and edible ice, 917 Casein, 951 Milk, whole fresh buffalo, 954 Milk, skimmed buffalo, 955 Cheese, buffalo milk, 982 Milk, whole fresh sheep, 984 Cheese, sheep milk, 985 Milk, skimmed sheep, 1020 Milk, whole fresh goat, 1021 Cheese of goat mlk, 1023 Milk, skimmed goat, 1130 Milk, whole fresh camel" -2948,Milk - Excluding Butter,Group,9513,FBSC,Milk, -899,"Milk, dry buttermilk",,595,QCL,, -896,"Milk, skimmed condensed",,597,QCL,,"Same as 889, but applied to skim milk." -888,"Milk, skimmed cow",,6382,QCL,,Milk from which most of the fat has been removed. -898,"Milk, skimmed dried",,2704,QCL,,"Same as 897, but from skim milk. Normally does not exceed 1.5% fat content." -895,"Milk, skimmed evaporated",,288,QCL,,"Same as 894, but applied to skim milk." -1780,"Milk, Total",Group,10323,QCL,Milk,"Milk, eggs, honey and beeswax are included as products of live animals. Fibres of animal origin (mainly wool and silk) are included with fibres of vegetal and animal origin. Milk and dairy products. Estimates of milk production as reported by countries refer to one or more of the following three concepts. Gross production is milk production plus milk sucked by young animals. Production available for consumption is net production less milk fed to animals, milk retained by farmers for food and feed, direct sales to consumers and farm waste. The FAO concept relates to net milk production. Data should be reported by kind of milking animal (cow, sheep, goat, etc.) in terms of whole milk and by weight. In most developed countries only 5-10 percent of whole milk is used directly for human consumption. The bulk of milk production is processed before being marketed as liquid milk (e.g. standardized, pasteurized, skimmed, etc.) or is manufactured into products such as cream, butter, cheese, evaporated and condensed milk, milk powder, casein, yogurt, ice cream, etc. About 70 percent of whole milk is processed into dairy products; the by-products of these processes (e.g. skim milk, buttermilk and whey) are used either for feed or are manufactured into other dairy products, e.g. dry skim milk and low-fat cheese. Processed milk and dairy products are often supplemented with vitamins, minerals and various additives. FAO lists 50 milk and dairy product items in the list that follows, of which five are primary products. Some food products containing milk are not listed separately by FAO, e.g. eggnog, sherbet, malted milk, chocolate milk drink and mellorine." -889,"Milk, whole condensed",,2639,QCL,,Milk and cream from which water has been partly removed after heat-treating and concentrating. Normally sucrose is added to give the product stability and bacteriological safety. -897,"Milk, whole dried",,2926,QCL,,"Milk and cream from which water has been completely removed by various methods. In form of powder, granules or other solid forms. May contain added sugar or other sweeteners."
-894,"Milk, whole evaporated",,3158,QCL,,Milk and cream from which the water has been partly removed and which has been heat- treated to render it bacteriologically safe and stable. -951,"Milk, whole fresh buffalo",,1339,QCL,,See 882 and derived products. -1130,"Milk, whole fresh camel",,1357,QCL,,See 882. -882,"Milk, whole fresh cow",,10234,QCL,,"Production data refer to raw milk containing all its constituents. Trade data normally cover milk from any animal, and refer to milk that is not concentrated, pasteurized, sterilized or other-wise preserved, homogenized or peptonized." -1020,"Milk, whole fresh goat",,5500,QCL,,See 882 and derived products. -982,"Milk, whole fresh sheep",,4016,QCL,,See 882 and derived products. -79,Millet,,4610,QCL,Millet,"Including inter alia: barnyard or Japanese millet (Echinocloa frumentacea); ragi, finger or African millet (Eleusine coracana); teff (Eragrostis abyssinica); common, golden or proso millet (Panicum miliaceum); koda or ditch millet (Paspalum scrobiculatum); pearl or cattail millet (Pennisetum glaucum); foxtail millet (Setaria italica). Small-grained cereals that include a large number of different botanical species. Originated by the domestication of wild African grasses in the Nile valley and the Sahel zone, millets were subsequently taken to China and India. These cereals tolerate arid conditions and possess a small, highly nutritious grain that stores well. Used locally, both as a food and as a livestock feed. In all areas where they are cultivated, millets are used in traditional beer brewing. Also used as a feed for birds." -2517,Millet and products,,8469,FBSC,Millet,"Default composition: 79 Millet, 80 Flour, millet, 81 Bran, millet" -2899,Miscellaneous,,9284,FBSC,, -2928,Miscellaneous,Group,9438,FBSC,, -165,Molasses,,7259,QCL,Molasses,"A by-product of the extraction or refining of beet or cane sugar or of the production of fructose from maize. Used for feed, food, industrial alcohol, alcoholic beverages and ethanol." -2767,"Molluscs, Other",,9436,FBSC,,"Default composition: 1562 Mlluscs Frsh, 1563 Molsc Frozen, 1564 Molsc Cured, 1565 Molsc Canned, 1566 Molsc Meals" -1110,Mules,,4512,QCL,,Includes hinnies. Mules are offspring of a male ass and a female horse (mare); a hinny is the offspring of a female ass and a male horse (stallion). Both are sterile. See 866. -449,Mushrooms and truffles,,3347,QCL,Mushrooms,Including inter alia: Boletus edulis; Agaricus campestris; Morchella spp. and Tuber magnatum. Cultivated or spontaneous. Includes truffles. -292,Mustard seed,,1383,QCL,Mustard seed,"White mustard (Brassica alba; B. hirta; Sinapis alba); black mustard (Brassica nigra; Sinapis nigra). In addition to the oil extracted from them, white mustard seeds, may be processed into flour for food use. Black mustard seeds also yield oil and are processed into flour that is used mainly in pharmaceutical products." -2732,Mutton & Goat Meat,,9518,FBSC,"Meat, sheep and goat","Default composition: 977 Meat, sheep, 1017 Meat, goat" -702,"Nutmeg, mace and cardamoms",,1320,QCL,,"Nutmeg, mace (Myristica fragrans); cluster cardamon (Elettaria cardamomum); other cardamons (Aframomum angustifolium; A. hambury; Amomun aromaticum; A. cardamomum); Malaguetta pepper, grains of paradise (Aframomum melegueta). Nutmeg is the inner brown kernel of the fruit of the nutmeg tree. Mace is the net-like membrane between the outer shell and the kernel. Cardamon seeds are enclosed in the capsule produced by perennial herbs of the Zingiberaceae family." 
-2551,Nuts and products,,9465,FBSC,Nuts,"Default composition: 216 Brazil nuts, with shell, 217 Cashew nuts, with shell, 220 Chestnut, 221 Almonds, with shell, 222 Walnuts, with shell, 223 Pistachios, 224 Kola nuts, 225 Hazelnuts, with shell, 226 Areca nuts, 229 Brazil nuts, shelled, 230 Cashew nuts, shelled, 231 Almonds shelled, 232 Walnuts, shelled, 233 Hazelnuts, shelled, 234 Nuts, nes, 235 Nuts, prepared (exc. groundnuts)" -234,Nuts nes,,3961,QCL,,"Including inter alia: pecan nut (Carya illinoensis); butter or swarri nut (Caryocar nuciferum); pili nut, Java almond, Chinese olives (Canarium spp.); paradise or sapucaia nut (Lecythis zabucajo); Queensland, macadamia nut (Macadamia ternifolia); pignolia nut (Pinus pinea). Other nuts that are not identified separately because of their minor relevance at the international level. Because of their limited local importance, some countries report nuts under this heading that are classified individually by FAO." -75,Oats,,3824,QCL,Oats,"Avena spp., mainly Avena sativa. A plant with open, spreading panicle-bearing large spikelets. Used primarily in breakfast foods. Makes excellent fodder for horses." -2516,Oats,,9147,FBSC,,"Default composition: 75 Oats, 76 Oats rolled, 77 Bran, oats" -2945,Offals,Group,9513,FBSC,, -2736,"Offals, Edible",,9518,FBSC,Offals,"Default composition: 868 Offals, edible, cattle, 878 Liver prep., 948 Offals, edible, buffaloes, 978 Offals, sheep,edible, 1018 Offals, edible, goats, 1036 Offals, pigs, edible, 1059 Offals, liver chicken, 1074 Offals, liver geese, 1075 Offals, liver duck, 1081 Offals, liver turkeys, 1098 Offals, horses, 1128 Offals, edible, camels, 1159 Offals, other camelids, 1167 Offals, nes" -948,"Offals, edible, buffaloes",,898,QCL,"Offals, buffaloes","Fresh, chilled or frozen." -1128,"Offals, edible, camels",,1127,QCL,"Offals, camels","Fresh, chilled or frozen." -868,"Offals, edible, cattle",,5581,QCL,"Offals, cattle","Fresh, chilled or frozen." -1018,"Offals, edible, goats",,5041,QCL,"Offals, goats","Fresh, chilled or frozen." -1098,"Offals, horses",,2312,QCL,"Offals, horses","Fresh, chilled or frozen." -1036,"Offals, pigs, edible",,5132,QCL,"Offals, pigs","Fresh, chilled or frozen." -978,"Offals, sheep,edible",,5146,QCL,"Offals, sheep","Fresh, chilled or frozen." -254,Oil palm fruit,,2568,QCL,Palm fruit oil,"Elaeis guineensis. The oil palm produces bunches containing a large number of fruits with the fleshy mesocarp enclosing a kernel that is covered by a very hard shell. FAO considers palm oil (coming from the pulp) and palm kernels to be primary products. The oil extraction rate from a bunch varies from 17 to 27% for palm oil, and from 4 to 10% for palm kernels." -252,"Oil, coconut (copra)",,5212,QCL,Coconut oil,Obtained by pressure from copra and by solvent from the residues of pressure extraction. Has both food and industrial uses. -331,"Oil, cottonseed",,5391,QCL,Cottonseed oil,Obtained first by pressure extraction from the kernels of cotton seeds. The residue from this process is then exposed to a solvent. Used mainly as a food. -244,"Oil, groundnut",,5492,QCL,Groundnut oil,Obtained by pressure or solvent extraction. Used mainly for food. -334,"Oil, linseed",,3233,QCL,Linseed oil,Obtained by pressure extraction. Used mainly in non-food items. -60,"Oil, maize",,2732,QCL,Maize oil,Extracted from germ by pressure or by solvents. -261,"Oil, olive, virgin",,1762,QCL,Olive oil,Obtained from olives by mechanical or other physical means. 
Olive oil is the only vegetable oil that can be consumed without refining. -257,"Oil, palm",,2602,QCL,Palm oil,"Obtained from the mesocarp of the fruit of the oil palm by pressure, and also by solvent from the residues of the pressure extraction." -258,"Oil, palm kernel",,3074,QCL,Palm kernel oil,Obtained from the kernel of the nut of the fruits of the oil palm by pressure in two or three stages at different temperatures. Including oil of babassu kernels. -271,"Oil, rapeseed",,3020,QCL,Rapeseed oil,Obtained by pressure extraction for food use. Oil recovered with solvent from the residues of the pressure extraction is used for industrial purposes. Canola oil is produced from new varieties of rapeseed. -281,"Oil, safflower",,963,QCL,Safflower oil,Obtained either by pressure or by solvent. Has both food and industrial uses. -290,"Oil, sesame",,3181,QCL,Sesame oil,Obtained by pressure extraction in two or three stages at different temperatures. Sometimes the oil is also extracted by solvent from the residue of the pressure extraction. Used mainly for food. -237,"Oil, soybean",,4977,QCL,Soybean oil,Obtained by solvent extraction from the beans. Used mainly for food. -268,"Oil, sunflower",,3796,QCL,Sunflower oil,Obtained by pressure extraction. Mainly for food use. -1731,Oilcrops,Group,10411,QCL,Oilcrops,"Oil-Bearing Crops or Oil Crops include both annual (usually called oilseeds) and perennial plants whose seeds, fruits or mesocarp and nuts are valued mainly for the edible or industrial oils that are extracted from them. Dessert and table nuts, although rich in oil, are listed under Nuts (see Chapter .). Annual oilseed plants tha are either harvested green or are used for grazing and for green manure are included with Fodder Crops (see Chapter 11.). Some of the crops included in this chapter are also fibre crops in that both the seeds and the fibres are harvested from the same plant. Such crops include: coconuts, yielding coir from the mesocarp; kapok fruit; seed cotton; linseed; and hempseed. In the case of several other crops, both the pulp of the fruit and the kernels are used for oil. The main crops of this type are oil-palm fruit and tallow tree seeds. Production data are reported in terms of dry products as marketed. Exceptions to this general rule include: groundnuts, which are reported as groundnuts in the shell; coconuts, which are reported on the basis of the weight of the nut including the woody shell, but excluding the fibrous outer husk; and palm oil, which is reported in terms of oil, by weight. Because of the very different nature of the various oil crops, the primary products cannot be aggregated in their natural weight to obtain total oil crops. For this reason, FAO converts the crops to either an oil equivalent or an oilcake equivalent before aggregating them. Only 5-6 percent of the world production of oil crops is used for seed (oilseeds) and animal feed, while about 8 percent is used for food. The remaining 86 percent is processed into oil. The fat content of oil crops varies widely. Fat content ranges from as low as 10-15 percent of the weight of coconuts to over 50 percent of the weight of sesame seeds and palm kernels. Carbohydrates, mainly polysaccharides, range from 15 to 30 percent in the oilseeds, but are generally lower in other oil-bearing crops. The protein content is very high in soybeans, at up to 40 percent, but is much lower in many other oilseeds, at 15-25 percent, and is lower still in some other oil-bearing crops." 
-2913,Oilcrops,Group,9513,FBSC,Oilcrops, -2586,"Oilcrops Oil, Other",,9518,FBSC,,"Default composition: 264 Butter of karite nuts, 266 Oil, castor beans, 276 Oil, tung nuts, 278 Oil, jojoba, 281 Oil, safflower, 297 Oil, poppy, 306 Vegetable tallow, 307 Oil, stillingia, 313 Oil, kapok, 334 Oil, linseed, 337 Oil, hempseed, 340 Oil, vegetable origin nes, 664 Cocoa, butter, 1241 Margarine, liquid, 1242 Margarine, short, 1273 Castor oil, hydrogenated (opal wax), 1274 Oil, boiled etc, 1275 Oil, hydrogenated" -1841,"Oilcrops, Cake Equivalent",Group,10294,QCL,"Oilcrops, Cake Equivalent", -1732,"Oilcrops, Oil Equivalent",Group,10411,QCL,"Oilcrops, Oil Equivalent","Oil-Bearing Crops or Oil Crops include both annual (usually called oilseeds) and perennial plants whose seeds, fruits or mesocarp and nuts are valued mainly for the edible or industrial oils that are extracted from them. Dessert and table nuts, although rich in oil, are listed under Nuts (see Chapter .). Annual oilseed plants tha are either harvested green or are used for grazing and for green manure are included with Fodder Crops (see Chapter 11.). Some of the crops included in this chapter are also fibre crops in that both the seeds and the fibres are harvested from the same plant. Such crops include: coconuts, yielding coir from the mesocarp; kapok fruit; seed cotton; linseed; and hempseed. In the case of several other crops, both the pulp of the fruit and the kernels are used for oil. The main crops of this type are oil-palm fruit and tallow tree seeds. Production data are reported in terms of dry products as marketed. Exceptions to this general rule include: groundnuts, which are reported as groundnuts in the shell; coconuts, which are reported on the basis of the weight of the nut including the woody shell, but excluding the fibrous outer husk; and palm oil, which is reported in terms of oil, by weight. Because of the very different nature of the various oil crops, the primary products cannot be aggregated in their natural weight to obtain total oil crops. For this reason, FAO converts the crops to either an oil equivalent or an oilcake equivalent before aggregating them. Only 5-6 percent of the world production of oil crops is used for seed (oilseeds) and animal feed, while about 8 percent is used for food. The remaining 86 percent is processed into oil. The fat content of oil crops varies widely. Fat content ranges from as low as 10-15 percent of the weight of coconuts to over 50 percent of the weight of sesame seeds and palm kernels. Carbohydrates, mainly polysaccharides, range from 15 to 30 percent in the oilseeds, but are generally lower in other oil-bearing crops. The protein content is very high in soybeans, at up to 40 percent, but is much lower in many other oilseeds, at 15-25 percent, and is lower still in some other oil-bearing crops." -2570,"Oilcrops, Other",,9390,FBSC,,"Default composition: 263 Karite nuts (sheanuts), 265 Castor oil seed, 275 Tung nuts, 277 Jojoba seed, 280 Safflower seed, 296 Poppy seed, 299 Melonseed, 305 Tallowtree seed, 310 Kapok fruit, 311 Kapokseed in shell, 312 Kapokseed shelled, 333 Linseed, 336 Hempseed, 339 Oilseeds nes, 343 Flour, oilseeds" -339,Oilseeds nes,,3348,QCL,,"Includes inter alia: beech nut (Fagus sylvatica);(Aleurites moluccana);(Carapa guineensis);(Croton tiglium);(Bassia latifolia);(Guizotia abyssinica);(Licania rigida);(Perilla frutescens);(Jatropha curcas);(Shorea robusta);(Pongamia glabra);(Astrocaryum spp.). 
Other oilseeds, oleaginous fruits and nuts that are not identified separately because of their minor relevance at the international level. Because of their limited local importance, some countries report commodities under this heading that are classified individually by FAO. Also included under this code are tea seeds, grape pips and tomato seeds from which oil is extracted." -430,Okra,,2524,QCL,Okra,Abelmoschus esculentus; Hibiscus esculentus. Also called gombo. -2580,Olive Oil,,9340,FBSC,Olive oil,"Default composition: 261 Oil, olive, virgin, 274 Oil, olive residues" -260,Olives,,2209,QCL,Olives,Olea europaea. Includes table olives and olives for oil. -2563,Olives (including preserved),,8612,FBSC,Olives,"Default composition: 260 Olives, 262 Olives preserved" -2602,Onions,,8981,FBSC,Onions,"Default composition: 403 Onions, dry" -403,"Onions, dry",,7426,QCL,Onions,"Allium cepa. Includes onions at a mature stage, but not dehydrated onions." -402,"Onions, shallots, green",,3500,QCL,,Shallots (Allium ascalonicum); onions (A. cepa); welsh onions (A. fistulosum). Young onions pulled before the bulb has enlarged; used especially in salads. Includes onion sets. -490,Oranges,,6866,QCL,Oranges,"Common, sweet orange (Citrus sinensis); bitter orange (C. aurantium). Bitter oranges are used primarily in the preparation of marmalade." -2611,"Oranges, Mandarines",,9465,FBSC,Oranges,"Default composition: 490 Oranges, 491 Juice, orange, single strength, 492 Juice, orange, concentrated, 495 Tangerines, mandarins, clementines, satsumas, 496 Juice, tangerine" -256,Palm kernels,,2529,QCL,Palm kernels,Seeds of the oil palm. Babassu kernels (Orbignya speciosa) are often reported as palm kernels. -2562,Palm kernels,,6955,FBSC,Palm kernels,"Default composition: 254 Oil, palm fruit, 256 Palm kernels" -2577,Palm Oil,,9287,FBSC,,"Default composition: 257 Oil, palm, 1276 Fatty acids, 1277 Fatty substance residues" -2576,Palmkernel Oil,,8390,FBSC,Palm kernel oil,"Default composition: 258 Oil, palm kernel" -600,Papayas,,3870,QCL,Papayas,Carica papaya. -534,Peaches and nectarines,,4204,QCL,Peaches and nectarines,Prunus persica; Amygdalus persica; Persica laevis. -521,Pears,,4476,QCL,Pears,Pyrus communis. -2547,Peas,,8876,FBSC,"Peas, dry","Default composition: 187 Peas, dry" -187,"Peas, dry",,4835,QCL,"Peas, dry",Garden pea (Pisum sativum); field pea (P. arvense). -417,"Peas, green",,4386,QCL,"Peas, green","Pisum sativum. Mostly for shelling, but including edible- podded peas or sugar peas." -2763,Pelagic Fish,,9470,FBSC,,"Default composition: 1527 Pelagic Frsh, 1528 Pelgc Fz Whl, 1529 Pelgc Fillet, 1530 Pelgc Fz Flt, 1531 Pelgc Cured, 1532 Pelgc Canned, 1533 Pelgc Pr nes, 1534 Pelgc Meals" -2640,Pepper,,9287,FBSC,Pepper,Default composition: 687 Pepper (piper spp.) -687,Pepper (piper spp.),,2506,QCL,Pepper,"Black, white pepper (Piper nigrum); long pepper (P. longum). Perennial climbing vines. Includes whole, crushed or ground berries. Black pepper is produced from partially ripe berries, while white pepper is from fully ripe berries which have had the outer hull removed." -748,Peppermint,,520,QCL,,"Mentha spp.: M. piperita. Leaves and flowers are used in the perfumery, food and other industries." -587,Persimmons,,806,QCL,,Diospyros kaki: D. virginiana. -197,Pigeon peas,,1386,QCL,Pigeon peas,"Pigeon pea, cajan pea, Congo bean (Cajanus cajan)." 
-2733,Pigmeat,,9402,FBSC,Pork,"Default composition: 1035 Meat, pig, 1038 Meat, pork, 1039 Bacon and ham, 1041 Meat, pig sausages, 1042 Meat, pig, preparations" -1034,Pigs,,9856,QCL,Pigs,Domestic pig (Sus domestica); wild boar (Sus scrofa). See 866. Excludes non-domesticated wild boars. -2641,Pimento,,8809,FBSC,"Chillies and peppers","Default composition: 689 Chillies and peppers, dry" -574,Pineapples,,5131,QCL,,Ananas comosus; A. sativ. Trade figures may include dried pineapples. -2618,Pineapples and products,,9212,FBSC,Pineapples,"Default composition: 574 Pineapples, 575 Pineapples canned, 576 Juice, pineapple, 580 Juice, pineapple, concentrated" -223,Pistachios,,1082,QCL,Pistachios,Pistacia vera. Produced mainly in the Near East and the United States. -2616,Plantains,,6780,FBSC,Plantains,Default composition: 489 Plantains -489,Plantains and others,,3125,QCL,Plantains,Musa paradisiaca. Generally known as a cooking banana. Data should be reported excluding the weight of the central stalk. -536,Plums and sloes,,4356,QCL,Plums,"Greengage, mirabelle, damson (Prunus domestica); sloe (P. spinosa)." -296,Poppy seed,,886,QCL,Poppy seeds,"Papaver somniferum. The source of opium, poppy seeds are also used in baking and confectionery." -116,Potatoes,,8444,QCL,Potatoes,"Solanum tuberosum Irish potato. A seasonal crop grown in temperate zones all over the world, but primarily in the northern hemisphere." -2531,Potatoes and products,,9518,FBSC,Potatoes,"Default composition: 116 Potatoes, 117 Flour, potatoes, 118 Potatoes, frozen, 119 Starch, potatoes, 121 Tapioca, potatoes" -2029,Poultry Birds,Group,10863,QCL,Poultry, -2734,Poultry Meat,,9518,FBSC,"Meat, poultry","Default composition: 1058 Meat, chicken, 1060 Fat, liver prepared (foie gras), 1061 Meat, chicken, canned, 1069 Meat, duck, 1073 Meat, goose and guinea fowl, 1080 Meat, turkey" -2911,Pulses,Group,9513,FBSC,Pulses, -211,Pulses nes,,6236,QCL,,"Including inter alia: lablab or hyacinth bean (Dolichos spp.); jack or sword bean (Canavalia spp.); winged bean (Psophocarpus tetragonolobus); guar bean (Cyamopsis tetragonoloba); velvet bean (Stizolobium spp.); yam bean (Pachyrrhizus erosus);. Vigna spp. other than those included in 176 and 195 Other pulses that are not identified separately because of their minor relevance at the international level. Because of their limited local importance, some countries report pulses under this heading that are classified individually by FAO." -2549,"Pulses, Other and products",,9390,FBSC,,"Default composition: 181 Broad beans, horse beans, dry, 191 Chick peas, 195 Cow peas, dry, 197 Pigeon peas, 201 Lentils, 203 Bambara beans, 205 Vetches, 210 Lupins, 211 Pulses, nes, 212 Flour, pulses, 213 Bran, pulses" -1726,"Pulses, Total",Group,9320,QCL,Pulses,"Pulses are annual leguminous crops yielding from one to 12 grains or seeds of variable size, shape and color within a pod. They are used for both food and feed. The term ""pulses"" is limited to crops harvested solely for dry grain, thereby excluding crops harvested green for food (green peas, green beans, etc.) which are classified as vegetable crops. Also excluded are those crops used mainly for oil extraction (e.g. soybean and groundnuts) and leguminous crops (e.g. seeds of clover and alfalfa) that are used exclusively for sowing purposes. In addition to their food value, pulses also play an important role in cropping systems because of their ability to produce nitrogen and thereby enrich the soil. 
Pulses contain carbohydrates, mainly starches (55-65 percent of the total weight); proteins, including essential amino acids (18-25 percent, and much higher than cereals); and fat (1-4 percent). The remainder consists of water and inedible substances. Production data should be reported in terms of dry clean weight, excluding the weight of the pods. Certain kinds of pulses can be skinned and partially crushed or split to remove the seed-coat, but the resulting products are still considered raw for classification purposes. FAO covers 11 primary pulses. Each is listed below, along with its code, its botanical name, or names, and a short description. Only two processed products are included in the FAO list, namely flour of pulses and bran of pulses." -394,"Pumpkins, squash and gourds",,6342,QCL,,Cucurbita spp. Includes marrows. -754,"Pyrethrum, dried",,687,QCL,,"Chrysanthemum cinerariifolium. Includes leaves, stems and flowers. For insecticides, fungicides and similar products." -523,Quinces,,2533,QCL,,Cydonia oblonga; C. vulgaris; C. japonica. -92,Quinoa,,194,QCL,Quinoa,"Chenopodium quinoa (Chenopodiaceae). A minor cereal, which tolerates high altitudes, quinoa is cultivated primarily in Andean countries. Used for food and to make chicha, a fermented beverage." -1140,Rabbits and hares,,3329,QCL,Rabbits,Oryctolagus cuniculus. See 866. May include domesticated hares (Lepus spp.). Data are expressed in thousands. -788,Ramie,,437,QCL,,"China grass, white ramie (Boehmeria nivea); rhea, green ramie (B. tenacissima). Ramie fibre is obtained from the bast of the plant. For trade coverage see 780." -2574,Rape and Mustard Oil,,8738,FBSC,,"Default composition: 271 Oil, rapeseed, 293 Oil, mustard" -2558,Rape and Mustardseed,,8862,FBSC,,"Default composition: 270 Rapeseed, 292 Mustard seed, 295 Flour, mustard" -270,Rapeseed,,3046,QCL,Rapeseed,"Brassica napus var. oleifera. Valued mainly for its oil. Older varieties are rich in erucic acid, which is considered unhealthy." -547,Raspberries,,2064,QCL,Raspberries,"Rubus idaeus. Trade data may include blackberries, mulberries and loganberries (a cross between the raspberry and blackberry)." -2805,Rice (Milled Equivalent),,8638,FBSC,Rice,"Default composition: 27 Rice, paddy, 28 Rice, husked, 29 Rice, milled/husked, 31 Rice, milled, 32 Rice, broken, 33 Gluten, rice, 34 Starch, rice, 35 Bran, rice, 38 Flour, rice" -2807,Rice and products,,880,FBSC,, -27,"Rice, paddy",,6952,QCL,Rice,"Oryza spp., mainly Oryza sativa. Rice grain after threshing and winnowing. Also known as rice in the husk and rough rice. Used mainly for human food." -30,"Rice, paddy (rice milled equivalent)",,6952,QCL,, -2581,Ricebran Oil,,1369,FBSC,,"Default composition: 36 Oil, rice bran" -1150,"Rodents, other",,118,QCL,,"See 866. Includes only those used mainly for meat, e.g. Guinea pig. Rodents used mainly for fur skins are included in Code 1169. Data are expressed in thousands." -149,Roots and tubers nes,,4684,QCL,,"Including inter alia: arracacha (Arracacoa xanthorrhiza); arrowroot (Maranta arundinacea); chufa (Cyperus esculentus); sago palm (Metroxylon spp.); oca and ullucu (Oxalis tuberosa and Ullucus tuberosus); yam bean, jicama (Pachyrrhizus erosus, P. angulatus); mashua (Tropaeolum tuberosum); Jerusalem artichoke, topinambur (Helianthus tuberosus). Other tubers, roots or rhizomes, fresh, that are not identified separately because of their minor relevance at the international level.
Because of their limited local importance, some countries report roots and tubers under this commodity heading that are classified individually by FAO." -1720,"Roots and Tubers, Total",Group,10872,QCL,Roots and tubers,"Roots and Tubers are plants yielding starchy roots, tubers, rhizomes, corms and stems. They are used mainly for human food (as such or in processed form), for animal feed and for manufacturing starch, alcohol and fermented beverages including beer. The denomination ""roots and tubers"" excludes crops which are cultivated mainly for feed (mangolds, swedes) or for processing into sugar (sugar beets), and those classified as ""roots, bulb and tuberous vegetables"" (onions, garlic and beets). It does include starch and the starchy pith and flour obtained from the trunk of the sago palm and the stem of the Abyssinian banana (Musa ensete). Certain root crops, notably bitter cassava, contain toxic substances, particularly in the skins. As a result, certain processes must be undertaken to make the product safe for human consumption. Apart from their high water content (70-80 percent), these crops contain mainly carbohydrates (largely starches that account for 16-24 percent of their total weight) with very little protein and fat (0-2 percent each). Methods of propagating root crops vary. A live potato tuber or seed must be planted but only part of the live yam tuber and a piece of the stalk (not the root) in the case of cassava. Production data of root crops should be reported in terms of clean weight, i.e. free of earth and mud. FAO distinguishes among seven primary root and tuber crops. The code and name of each one appears in the list that follows, along with its botanical name, or names, and a short description. The processed products of roots and tubers are listed together with their parent primary crops." -2534,"Roots, Other",,8999,FBSC,,"Default composition: 135 Yautia (cocoyam), 136 Taro (cocoyam), 149 Roots and tubers, nes, 150 Flour, roots and tubers nes, 151 Roots and tubers dried" -836,"Rubber, natural",,2025,QCL,,"Hevea brasiliensis Latex. The liquid secreted by the rubber tree. Includes stabilized or concentrated latex and prevulcanized rubber latex. In trade figures, liquid weight is converted to dry weight at 60%." -71,Rye,,2992,QCL,Rye,"Secale cereale. A grain that is tolerant of poor soils, high latitudes and altitudes. Mainly used in making bread, whisky and beer. When fed to livestock, it is generally mixed with other grains." -2515,Rye and products,,8257,FBSC,Rye,"Default composition: 71 Rye, 72 Flour, rye, 73 Bran, rye" -280,Safflower seed,,1181,QCL,Safflower seed,Carthamus tinctorius. Valued mainly for its oil. Minor uses include as a human food and as poultry feed. -328,Seed cotton,,5516,QCL,Seed cotton,"Gossypium spp.: Unginned cotton. Grown for both seed and for fibre. FAO considers cottonseed, cotton lint and linters to be primary products. Lint content ranges from 30 to 40%, seed 55 to 65%, and linters 2 to 5% though they are not always separated." -289,Sesame seed,,4153,QCL,Sesame seed,"Sesamum indicum. Valued for its oil, but also as a food, either raw or roasted, as well as in bakery products and other food preparations." -2561,Sesame seed,,8855,FBSC,Sesame seed,Default composition: 289 Sesame seed -2579,Sesameseed Oil,,8410,FBSC,Sesame oil,"Default composition: 290 Oil, sesame" -976,Sheep,,9879,QCL,Sheep,"Ovis spp.. See 866. Includes Uriel, Argali, Bighorn, Karakul and Astrakhan." 
-1807,Sheep and Goat Meat,Group,10470,QCL,"Meat, sheep and goat", -1749,Sheep and Goats,Group,10478,QCL,Sheep and goats, -1185,"Silk-worm cocoons, reelable",,1680,QCL,,"Silkworm cocoons suitable for reeling. Foreign trade data are expressed in silk equivalent, which is 25% of their weight." -1186,"Silk, raw",,1521,QCL,Silk,Obtained by reeling the filaments from cocoons. Not thrown. -789,Sisal,,1456,QCL,,"Agave sisalana. Sisal fibre is obtained from the leaves of the plant. It also is used as an ornamental plant. Trade data cover fibres that are raw, prepared for spinning, and tow and waste, including yarn waste and garnetted stock." -1809,"Skim Milk&Buttermilk,Dry",Group,2878,QCL,, -1025,"Skins, goat, fresh",,9661,QCL,"Skins, goat",See 919. Both adult and young animals. -995,"Skins, sheep, fresh",,9826,QCL,"Skins, sheep",See 919. Both adult and young animals. -1176,"Snails, not sea",,147,QCL,,"Fresh, chilled, frozen, dried, salted or in brine." -83,Sorghum,,6268,QCL,Sorghum,"Sorghum spp.: guinea corn (S. guineense); common, milo, feterita, kaffir corn (S. vulgare); durra, jowar, kaoliang (S. dura). A cereal that has both food and feed uses. Sorghum is a major food grain in most of Africa, where it is also used in traditional beer brewing. It is desirable to report hybrid and other varieties separately." -2518,Sorghum and products,,8398,FBSC,Sorghum,"Default composition: 83 Sorghum, 84 Flour, sorghum, 85 Bran, sorghum" -2571,Soyabean Oil,,9352,FBSC,Soybean oil,"Default composition: 237 Oil, soybean" -2555,Soyabeans,,9075,FBSC,Soybeans,"Default composition: 236 Soybeans, 239 Soya sauce, 240 Soya paste, 241 Soya curd" -236,Soybeans,,4993,QCL,Soybeans,"Glycine soja. The most important oil crop. Also widely consumed as a bean and in the form of various derived products because of its high protein content, e.g. soya milk, meat, etc." -2923,Spices,Group,9513,FBSC,, -723,Spices nes,,4080,QCL,,"Including inter alia: bay leaves (Laurus nobilis); dill seed (Anethum graveolens); fenugreek seed (Trigonella foenum-graecum); saffron (Crocus sativus); thyme (Thymus vulgaris); turmeric (Curcuma longa). Other spices that are not identified separately because of their minor relevance at the international level. Because of their limited local importance, some countries report spices under this heading that are classified individually by FAO. This heading also includes curry powder and other mixtures of different spices." -2645,"Spices, Other",,9443,FBSC,,"Default composition: 692 Vanilla, 693 Cinnamon (canella), 702 Nutmeg, mace and cardamoms, 711 Anise, badian, fennel, coriander, 720 Ginger, 723 Spices, nes" -373,Spinach,,3413,QCL,Spinach,Spinacia oleracea. Trade figures may include New Zealand spinach (Tetragonia espansa) and orache (garden) spinach (Atriplex hortensis). -2907,Starchy Roots,Group,9513,FBSC,, -2922,Stimulants,Group,9513,FBSC,, -544,Strawberries,,3748,QCL,Strawberries,Fragaria spp.. -423,String beans,,1199,QCL,String beans,Phaseolus vulgaris; Vigna spp. Not for shelling. -2542,Sugar (Raw Equivalent),,9518,FBSC,,"Default composition: 158 Sugar, cane, raw, centrifugal, 159 Sugar, beet, raw, centrifugal, 162 Sugar Raw Centrifugal, 164 Sugar refined, 168 Sugar confectionery, 171 Sugar flavoured" -2909,Sugar & Sweeteners,Group,9513,FBSC,, -157,Sugar beet,,3194,QCL,Sugar beet,"Beta vulgaris var. altissima. In some producing countries, marginal quantities are consumed, either directly as food or in the preparation of jams." 
-2537,Sugar beet,,5676,FBSC,Sugar beet,Default composition: 157 Sugar beet -156,Sugar cane,,6249,QCL,Sugar cane,"Saccharum officinarum. In some producing countries, marginal quantities of sugar cane are consumed, either directly as food or in the form of juice." -2536,Sugar cane,,7312,FBSC,Sugar cane,Default composition: 156 Sugar cane -2908,Sugar Crops,Group,8723,FBSC,Sugar crops, -161,Sugar crops nes,,206,QCL,,"Including inter alia: sugar maple (Acer saccharum); sweet sorghum (Sorghum saccharatum); sugar palm (Arenga saccharifera). Includes minor sugar crops of local importance. In the case of saps, production is to be expressed in liquid equivalent." -1723,Sugar Crops Primary,Group,8222,QCL,Sugar crops,"Sugar Crops and Sweeteners and derived products: In addition to providing the source for the manufacture of sugar, SUGAR CROPS are used to produce alcohol and ethanol. In certain countries, sugar cane is eaten raw in minor quantities. It also is used in the preparation of juices and for animal feed. There are two major sugar crops: sugar beets and sugar cane. However, sugar and syrups are also produced from the sap of certain species of maple trees, from sweet sorghum when cultivated explicitly for making syrup and from sugar palm. Sugar beets that are cultivated solely as a fodder crop and red or garden beets that are classified as vegetable crops are excluded from the FAO list of sugar crops. Sugar cane is a perennial grass (replanted at intervals using pieces of the cane stalks) that is cultivated mainly in the tropics. Sugar beet is an annual crop that is propagated by the seeds of the flowers. It is cultivated in cooler climates than sugar cane, mainly above the 35th parallel of the Northern Hemisphere. Both sugar beets and sugar cane have high water content, accounting for about 75 percent of the total weight of the plants. The sugar content of sugar cane ranges from 10 to 15 percent of the total weight, while that of sugar beets is between 13 and 18 percent. The protein and fat content of both beets and cane is almost nil. Production data on sugar beets and sugar cane relate to the harvested crop, free of soil, plant tops and leaves. FAO lists three primary sugar crops. Under the name SWEETENERS, FAO includes products used for sweetening that are derived from sugar crops, cereals, fruits or milk, or that are produced by insects. This category includes a wide variety of monosaccharide (glucose and fructose) and disaccharides (sucrose and saccharose). They exist either in a crystallized state as sugar, or in thick liquid form as syrups. The traditional sources of sugar are sugar cane and sugar beets. But in recent years, ever larger quantities of cereals (mainly maize) have been used to produce sweeteners derived from starch. OTHER DERIVED PRODUCTS. In addition to sugar, molasses is also obtained with various degrees of sugar content. The by-product obtained from the extraction of sugar is called bagasse in the case of sugar cane, and beet pulp in the case of sugar beets." -2541,Sugar non-centrifugal,,1877,FBSC,,Default composition: 163 Sugar non-centrifugal -162,Sugar Raw Centrifugal,,7289,QCL,Sugar (raw),The sum of codes 158 and 159. Processed further to obtain refined sugar. -267,Sunflower seed,,3610,QCL,Sunflower seed,Helianthus annuus. Valued mainly for its oil. Minor uses include as a human food and as feed for birds. 
-2557,Sunflower seed,,8026,FBSC,Sunflower seed,Default composition: 267 Sunflower seed -2573,Sunflowerseed Oil,,9033,FBSC,Sunflower oil,"Default composition: 268 Oil, sunflower" -122,Sweet potatoes,,6839,QCL,Sweet potatoes,"Ipomoea batatas. A seasonal crop grown in tropical and subtropical regions. Used mainly for human food. Trade data cover fresh and dried tubers, whether or not sliced or in the form or pellets." -2533,Sweet potatoes,,8664,FBSC,Sweet potatoes,Default composition: 122 Sweet potatoes -2543,"Sweeteners, Other",,9518,FBSC,,"Default composition: 154 Fructose chemically pure, 155 Maltose chemically pure, 160 Maple sugar and syrups, 161 Sugar crops, nes, 165 Molasses, 166 Fructose and syrup, other, 167 Sugar, nes, 172 Glucose and dextrose, 173 Lactose, 175 Isoglucose, 633 Beverages, non alcoholic" -1225,Tallow,,3448,QCL,,"Rendered fats of animals other than pigs, excluding tallow oil or stearine." -305,Tallowtree seed,,59,QCL,,Borneo tallow tree (Shorea aptera; S. stenocarpa); Chinese tallow tree (Sapium sebiferum; Stillingia sebifera). Grown wild and cultivated. FAO considers vegetable tallow and stillingia oil to be primary products (see below). -495,"Tangerines, mandarins, clementines, satsumas",,4067,QCL,Tangerines,"Mandarin, tangerine (Citrus reticulata); clementine, satsuma (C. unshiu)." -136,Taro (cocoyam),,2996,QCL,,"Dasheen, eddoe, taro, old cocoyam(Colocasia esculenta). Aroids cultivated for their edible starchy corms or underground stems. Taro is grown throughout the tropics for food. Trade data cover both fresh and dried taro." -667,Tea,,2698,QCL,Tea,"Camellia sinensis; Thea sinensis; Thea assaamica. Includes green tea (unfermented), black tea (fermented), and partially fermented tea. Excludes green tea eaten as a vegetable." -2635,Tea (including mate),,9465,FBSC,,"Default composition: 667 Tea, 671 Maté, 672 Tea, mate extracts" -826,"Tobacco, unmanufactured",,7422,QCL,Tobacco,"Nicotiana tabacum. Unmanufactured dry tobacco, including refuse that is not stemmed or stripped, or is partly or wholly stemmed or stripped." -388,Tomatoes,,9139,QCL,Tomatoes,Lycopersicon esculentum. -2601,Tomatoes and products,,9465,FBSC,Tomatoes,"Default composition: 388 Tomatoes, 389 Juice, tomato, concentrated, 390 Juice, tomato, 391 Tomatoes, paste, 392 Tomatoes, peeled" -2912,Treenuts,Group,9460,FBSC,, -1729,"Treenuts, Total",Group,5977,QCL,Treenuts,"Tree NUTS are dry fruits or kernels enclosed in woody shells or hard husks, which in turn are generally covered by a thick, fleshy/fibrous outer husk that is removed during harvest. Similar products, such as groundnuts, sunflower seeds and melon seeds, although often used for similar purposes, are included with oil-bearing crops.FAO includes in this group only dessert or table nuts. Nuts that are used mainly for flavouring beverages and masticatory and stimulant nuts should be excluded. An exception is made for areca nuts and kola nuts, which FAO considers to be inedible nuts, but which are included with the nut and derived products group to be consistent with international trade classifications. Nuts used mainly for the extraction of oil or butter, (e.g. sheanuts) as well as nuts contained in other fruits (e.g. peaches) are excluded. It should be noted that some countries report certain nut crops (chestnuts, pignolia nuts) with forestry products. Production data relate to the weight of nuts in the shell or husk, but without the outer husk. 
The weight of the kernel contained in the nut ranges from as low as 30 percent for cashew nuts to as high as 80 percent in the case of chestnuts. The edible portion of nut kernels is, with the major exception of chestnuts, very rich in fat content at between 50 percent and 65 percent. Protein content makes up 15-20 percent and carbohydrate content is between 10 percent and 15 percent. Starch and saccharose are the main components of dry chestnuts, accounting for about 75 percent. FAO covers ten primary nut crops. Each is listed below along with its code, its botanical name, or names, and a short description. NUT PRODUCTS include shelled nuts, whole or split, and further processed products, including roasted nuts, meal/flour, paste, oil, etc. Nut oils are not separately identified in the FAO classification; instead they are included under the heading ""oil of vegetable origin nes"". The most commonly marketed oils are almond oil and cashew nut oil and its derivative cardol." -97,Triticale,,1639,QCL,,"A minor cereal that is a cross between wheat and rye, combining the quality and yield of wheat with the hardiness of rye." -275,Tung nuts,,397,QCL,,Aleurites cordata; A. fordii. Valued mainly for their oil. -1079,Turkeys,,4310,QCL,Turkeys,Meleagris gallopavo. See 1057. -692,Vanilla,,1080,QCL,,"Vanilla planifolia; V. pompona. The fruit (or bean) of a climbing plant of the orchid family. Includes whole, crushed or ground." -2914,Vegetable Oils,Group,9513,FBSC,, -2918,Vegetables,Group,9513,FBSC,Vegetables, -1735,Vegetables Primary,Group,10850,QCL,Vegetables,"Vegetables, as classified in this group, are mainly annual plants cultivated as field and garden crops in the open and under glass, and used almost exclusively for food. Vegetables grown principally for animal feed or seed should be excluded. Certain plants, normally classified as cereals and pulses, belong to this group when harvested green, such as green maize, green peas, etc. This grouping differs from international trade classifications for vegetables in that it includes melons and watermelons, which are normally considered to be fruit crops. But, whereas fruit crops are virtually all permanent crops, melons and watermelons are similar to vegetables in that they are temporary crops. Chillies and green peppers are included in this grouping when they are harvested for consumption as vegetables and not processed into spices. FAO production data for green peas and green beans refer to the total weight including pods, although some countries report on a shelled weight basis. The weight of the pods ranges from 40 to 50 percent for peas to up to 70 percent for broad beans. Area data on small vegetable gardens are often omitted in agricultural surveys, although production estimates may be reported. Trade data for fresh vegetables also include chilled vegetables, meaning the temperature of the products has been reduced to around 0°C without the products being frozen. Vegetables contain principally water, accounting for between 70 percent and 95 percent of their weight. They are low in nutrients, but contain minerals and vitamins. FAO covers 27 primary vegetable products. Each is listed along with its code, botanical name, or names, and a short description. PRODUCTS DERIVED FROM VEGETABLES refer to processed products. Apart from a few main products, international trade classifications do not permit a sufficiently detailed classification of processed products according to the primary commodity used in the preparation.
A similar situation prevails for frozen vegetables." -463,"Vegetables, fresh nes",,10731,QCL,,"Including inter alia: bamboo shoots (Bambusa spp.); beets, chards (Beta vulgaris); capers (Capparis spinosa); cardoons (Cynara cardunculus); celery (Apium graveolens); chervil (Anthriscus cerefolium); cress (Lepidium sativum); fennel (Foeniculum vulgare); horseradish (Cochlearia armoracia); marjoram, sweet (Majorana hortensis); oyster plant (Tragopogon porrifolius); parsley (Petroselinum crispum); parsnips (Pastinaca sativa); radish (Raphanus sativus); rhubarb (Rheum spp.); rutabagas, swedes (Brassica napus); savory (Satureja hortensis); scorzonera (Scorzonera hispanica); sorrel (Rumex acetosa); soybean sprouts tarragon (Artemisia dracunculus); watercress (Nasturtium officinale). Other vegetables that are not identified separately because of their minor relevance at the international level. Because of their limited local importance, some countries report vegetables under this heading that are classified individually by FAO." -420,"Vegetables, leguminous nes",,3181,QCL,,Vicia faba. For shelling. -2605,"Vegetables, Other",,9518,FBSC,,"Default composition: 358 Cabbages and other brassicas, 366 Artichokes, 367 Asparagus, 372 Lettuce and chicory, 373 Spinach, 378 Cassava leaves, 393 Cauliflowers and broccoli, 394 Pumpkins, squash and gourds, 397 Cucumbers and gherkins, 399 Eggplants (aubergines), 401 Chillies and peppers, green, 402 Onions, shallots, green, 406 Garlic, 407 Leeks, other alliaceous vegetables, 414 Beans, green, 417 Peas, green, 420 Vegetables, leguminous nes, 423 String beans, 426 Carrots and turnips, 430 Okra, 446 Maize, green, 447 Sweet corn frozen, 448 Sweet corn prep or preserved, 449 Mushrooms and truffles, 450 Mushrooms, dried, 451 Mushrooms, canned, 459 Chicory roots, 461 Carobs, 463 Vegetables, fresh nes, 464 Vegetables, dried nes, 465 Vegetables, canned nes, 466 Juice, vegetables nes, 469 Vegetables, dehydrated, 471 Vegetables in vinegar, 472 Vegetables, preserved nes, 473 Vegetables, frozen, 474 Vegetables, temporarily preserved, 475 Vegetables, preserved, frozen, 476 Vegetables, homogenized preparations, 567 Watermelons, 568 Melons, other (inc.cantaloupes), 658 Coffee, substitutes containing coffee" -2903,Vegetal Products,Group,9513,FBSC,, -205,Vetches,,1944,QCL,,Spring/common vetch (Vicia sativa). Used mainly for animal feed. -222,"Walnuts, with shell",,2770,QCL,Walnuts,"Jugland spp.: J. regia. Produced in temperate zones of the Northern Hemisphere, particularly in the United States." -567,Watermelons,,6336,QCL,Watermelons,Citrullus vulgaris. -15,Wheat,,6442,QCL,Wheat,"Triticum spp.: common (T. aestivum) durum (T. durum) spelt (T. spelta). Common and durum wheat are the main types. Among common wheat, the main varieties are spring and winter, hard and soft, and red and white. At the national level, different varieties should be reported separately, reflecting their different uses. Used mainly for human food." -2511,Wheat and products,,9518,FBSC,Wheat,"Default composition: 15 Wheat, 16 Flour, wheat, 17 Bran, wheat, 18 Macaroni, 19 Germ, wheat, 20 Bread, 21 Bulgur, 22 Pastry, 23 Starch, wheat, 24 Gluten, wheat, 41 Cereals, breakfast, 110 Wafers, 114 Mixes and doughs, 115 Food preparations, flour, malt extract" -890,"Whey, condensed",,758,QCL,,Whey paste. -900,"Whey, dry",,1872,QCL,Whey,Used in both food and animal feed. -564,Wine,,3305,QCL,,"Wines of fresh grapes of all qualities, including sparkling, fortified and dessert wines." 
-2655,Wine,,9404,FBSC,Wine,"Default composition: 564 Wine, 565 Vermouths & similar" -987,"Wool, greasy",,4883,QCL,Wool,"A natural fibre taken from sheep or lambs. Includes fleece-washed, shorn and pulled wool (from slaughtered animals), but does not include carded or combed wool." -137,Yams,,3408,QCL,Yams,"Dioscorea spp.. The principal edible yams are widely grown throughout the tropics. A starchy staple foodstuff, normally eaten as a vegetable, boiled, baked or fried. In West Africa they are consumed mainly as ""fufu"", a stiff glutinous dough. Trade data cover both fresh and dried yams." -2535,Yams,,4814,FBSC,Yams,Default composition: 137 Yams -135,Yautia (cocoyam),,814,QCL,,"Xanthosoma spp.; malanga, new cocoyam, ocumo, tannia (X. sagittifolium). Several plants are included in this group, some with edible tubers and others with edible stems (also called aroids). Yautia is grown mainly in the Caribbean and is used for food. Trade data cover both fresh and dried yautia." -891,Yoghurt,,804,QCL,,A fermented milk food. \ No newline at end of file diff --git a/etl/steps/archive/explorers/owid/2021/food_explorer.outliers.json b/etl/steps/archive/explorers/owid/2021/food_explorer.outliers.json deleted file mode 100644 index 30d41523c45..00000000000 --- a/etl/steps/archive/explorers/owid/2021/food_explorer.outliers.json +++ /dev/null @@ -1,8 +0,0 @@ -[ - { - "column": "yield__tonnes_per_ha", - "index": [ - ["Spinach", "China", 1984] - ] - } -] \ No newline at end of file diff --git a/etl/steps/archive/explorers/owid/2021/food_explorer.py b/etl/steps/archive/explorers/owid/2021/food_explorer.py deleted file mode 100644 index e149a5ef83e..00000000000 --- a/etl/steps/archive/explorers/owid/2021/food_explorer.py +++ /dev/null @@ -1,1396 +0,0 @@ -# -*- coding: utf-8 -*- -# --- -# jupyter: -# jupytext: -# cell_metadata_filter: -all -# custom_cell_magics: kql -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.11.2 -# --- -# pyright: reportUnusedExpression=false - -# %% [markdown] -# # Food Explorer -# -# _Open as notebook with jupytext_ -# -# Produced using garden-level FAOstat datasets. -# -# So far the following datasets have been processed: -# -# - [x] QCL -# - [x] FBSC (FBS, FBSH) -# -# -# We process both datasets in parallel, until the _Final Processing_ section, where we actually merge the datasets. - -# %% [markdown] -# ## 1. Imports & paths -# Import the required libraries and define paths to load files (including data files and standardisation mappings for item and element names). - -# %% -import json -from typing import Any, Dict, List, Optional, cast - -import numpy as np -import pandas as pd -from owid import catalog -from owid.catalog.meta import DatasetMeta - -from etl.paths import BASE_DIR, DATA_DIR - -# %% -HERE = BASE_DIR / "etl/steps/data/explorers/owid/2021" - -# %% -PATH_DATASET_QCL = DATA_DIR / "garden/faostat/2021-03-18/faostat_qcl" -PATH_DATASET_FBSC = DATA_DIR / "garden/faostat/2021-04-09/faostat_fbsc" -PATH_DATASET_POPULATION = DATA_DIR / "garden/owid/latest/key_indicators" -PATH_DATASET_POPULATION_GAPMINDER = DATA_DIR / "open_numbers/open_numbers/latest/gapminder__systema_globalis" # add - -PATH_MAP_ITEM = HERE / "food_explorer.items.std.csv" -PATH_MAP_ELEM = HERE / "food_explorer.elements.std.csv" -PATH_REGIONS = HERE / "food_explorer.regions.json" -PATH_OUTLIERS = HERE / "food_explorer.outliers.json" - -# %% [markdown] -# ## 2. 
Load garden dataset -# In this step we load the required datasets from Garden: QCL and FBSC. - -# %% -qcl_garden = catalog.Dataset(PATH_DATASET_QCL) -fbsc_garden = catalog.Dataset(PATH_DATASET_FBSC) - -# %% [markdown] -# We obtain table `bulk` from the dataset, which contains the data itself. - -# %% -# Bulk data and items metadata -qcl_bulk = qcl_garden["bulk"] -fbsc_bulk = fbsc_garden["bulk"] - -# %% [markdown] -# In the following step we discard column `variable_name`, which, although useful for its clarity, we don't actually need in this process. Also, we reset the index, as this will be needed in subsequent operations. - -# %% -# QCL -qcl_bulk = qcl_bulk.reset_index() -qcl_bulk = qcl_bulk.drop(columns=["variable_name"]) -# FBSC -fbsc_bulk = fbsc_bulk.reset_index() -fbsc_bulk = fbsc_bulk.drop(columns=["variable_name"]) - -# %% [markdown] -# Brief overview of the data. - -# %% -# QCL -print(qcl_bulk.shape) -qcl_bulk.head() - -# %% -# FBSC -print(fbsc_bulk.shape) -fbsc_bulk.head() - -# %% [markdown] -# ### Group some items -# We know from the Garden process that generates the FBSC dataset that some items "changed" their ID from one dataset to another: -# -# - `2556 Groundnuts (Shelled Eq)` --> `2552 Groundnuts` -# - `2805 Rice (Milled Equivalent)` --> `2807 Rice and products` - - -# %% -def group_item_codes( - df: pd.DataFrame, ids_old: List[int], ids_new: List[int], assign_to_old: List[bool] -) -> pd.DataFrame: - # Check - msk = df["item_code"].isin(ids_old + ids_new) - x = df[msk].groupby("item_code").agg({"year": ["min", "max"]}) - for id_old, id_new in zip(ids_old, ids_new): - assert x.loc[id_new, ("year", "min")] > x.loc[id_old, ("year", "max")] - # Replace - if isinstance(assign_to_old, list): - id_map = dict((n, o) if f else (o, n) for o, n, f in zip(ids_old, ids_new, assign_to_old)) - elif assign_to_old: - id_map = dict(zip(ids_new, ids_old)) - else: - id_map = dict(zip(ids_old, ids_new)) - print(id_map) - df["item_code"] = df["item_code"].replace(id_map).astype(int) - return df - - -# %% -fbsc_bulk = group_item_codes(fbsc_bulk, ids_old=[2556, 2805], ids_new=[2552, 2807], assign_to_old=[True, True]) - -# %% [markdown] -# ## 3. Select flags -# There are cases where we have more than just one entry for a `country`, `item_code`, `element_code` and `year`. This is due to the fact that there are multiple ways of reporting the data. All these different methodologies are identified by the field `flag`, which tells us how a data point was obtained (see table below). This is given by FAOstat. -# -# |flag |description | -# |-------|-----------------------------------------------------------------------------------| -# |`*` | Unofficial figure | -# |`NaN` | Official data | -# |`A` | Aggregate; may include official; semi-official; estimated or calculated data| -# |`F` | FAO estimate | -# |`Fc` | Calculated data | -# |`Im` | FAO data based on imputation methodology | -# |`M` | Data not available | -# |`S` | Standardised | -# |`SD` | Statistical Discrepancy | -# |`R` | Estimated data using trading partners database | -# -# -# The following cell examines how many datapoints would be removed if we did _flag-prioritisation_. As per the output, we see that we would eliminate 30,688 rows (~1% of the data).
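For illustration, here is a minimal, self-contained sketch of the duplication pattern described above; the frame, codes and flag values are invented for the example, not taken from the real QCL/FBSC data:

```python
import pandas as pd

# Hypothetical rows: one (country, item_code, element_code, year) key reported
# twice, distinguished only by `flag` ("F" = FAO estimate, "A" = aggregate).
df = pd.DataFrame(
    {
        "country": ["Afghanistan", "Afghanistan"],
        "item_code": [15, 15],
        "element_code": [5510, 5510],
        "year": [1993, 1993],
        "value": [100.0, 120.0],
        "flag": ["F", "A"],
    }
)

key = ["country", "item_code", "element_code", "year"]
print(df.duplicated(subset=key).sum(), "row(s) would be removed by flag prioritisation")
```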
- - -# %% -def check_flags_1(df: pd.DataFrame) -> None: - i_og = df.index.tolist() - i_ne = df.drop_duplicates(subset=["country", "item_code", "element_code", "year"]).index.tolist() - print( - f"Number of datapoints: {len(i_og)}\nNumber of datapoints (after dropping duplicates): {len(i_ne)}\nTotal datapoints removed: {len(i_og)-len(i_ne)}" - ) - check_flags_2(df, i_og, i_ne) - - -def check_flags_2(df: pd.DataFrame, i_og: List[int], i_ne: List[int]) -> None: - """Prints `[number of datapoints eliminated], True`""" - df = df.set_index(["country", "item_code", "element_code", "year"]) - dups = df.index.duplicated() - print(f"{dups.sum()}, {len(i_ne) == len(i_og)-dups.sum()}") - # dups = qcl_bulk.index.duplicated(keep=False) - df = df.reset_index() - - -check_flags_1(qcl_bulk) -print() -check_flags_1(fbsc_bulk) - -# %% [markdown] -# ### Flag prioritisation - -# %% [markdown] -# In this step we define a flag prioritisation rank, which allows us to discard duplicate entries based on which flag we "prefer". We do this by assigning a weight to each datapoint based on its `flag` value (the higher, the more prioritised it is). On top of flag prioritisation, we always prefer non-`NaN` values regardless of their associated `flag` value (we assign weight -1 to these datapoints). The weighting was shared and discussed with authors. -# -# The weight is added to the dataframe as a new column `flag_priority`. -# -# #### Example 1 -# -# country, year, product, value, flag -# Afghanistan, 1993, Apple, 100, F -# Afghanistan, 1993, Apple, 120, A -# -# We would choose the first row, with flag F. -# -# #### Example 2 -# -# country, year, product, value, flag -# Afghanistan, 1993, Apple, NaN, F -# Afghanistan, 1993, Apple, 120, A -# -# We would choose the second row, as the first row is `NaN`. -# -# -# In the following cell we filter rows based on `FLAG_PRIORITIES`. - -# %% -# Create flag priority (add to df). More info at https://www.fao.org/faostat/en/#definitions -FLAG_PRIORITIES = { - "M": 0, # Data not available - "SD": 10, # Statistical Discrepancy - "*": 20, # Unofficial figure - "R": 30, # Estimated data using trading partners database - "Fc": 40, # Calculated data - "S": 60, # Standardised data - "A": 70, # Aggregate; may include official; semi-official; estimated or calculated data - "Im": 80, # FAO data based on imputation methodology - "F": 90, # FAO estimate - np.nan: 100, # Official data -} - - -def filter_by_flag_priority(df: pd.DataFrame) -> pd.DataFrame: - # Add flag priority column - df.loc[:, "flag_priority"] = df.flag.replace(FLAG_PRIORITIES).tolist() - df.loc[df.value.isna(), "flag_priority"] = -1 - # Remove duplicates based on flag value - df = df.sort_values("flag_priority") - df = df.drop_duplicates(subset=["country", "item_code", "element_code", "year"], keep="last") - return df.drop(columns=["flag_priority", "flag"]) - - -# %% -# QCL -qcl_bulk = filter_by_flag_priority(qcl_bulk) -print(qcl_bulk.shape) - -# %% -# FBSC -fbsc_bulk = filter_by_flag_priority(fbsc_bulk) -print(fbsc_bulk.shape) - -# %% [markdown] -# ## 4. Element Overview -# This serves as an initial check on the meaning of `element_code` values. In particular, we note that each `element_code` value corresponds to a unique pair of _element name_ and _element unit_. Note, for instance, that _element_name_ "production" can come in different flavours (i.e. units): "production -- tonnes" and "production -- 1000 No". -# -# Based on the number of occurrences of each element_code, we may want to keep only those that rank high.
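The (element_code) → (name, unit) uniqueness claim can be checked directly. A hedged sketch on a toy metadata frame (the real check would run against the element metadata tables such as `meta_qcl_element`; codes and names here are illustrative):

```python
import pandas as pd

# Toy element metadata: every element_code should map to exactly one (name, unit) pair.
meta = pd.DataFrame(
    {
        "element_code": [5510, 5510, 5419],
        "name": ["Production", "Production", "Yield"],
        "unit": ["tonnes", "tonnes", "hg/ha"],
    }
)

# For each code, count distinct names and units; both counts should be 1.
n_pairs = meta.groupby("element_code")[["name", "unit"]].nunique()
assert (n_pairs == 1).all().all(), "some element_code maps to several (name, unit) pairs"
```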
-# -# **Note: This step uses file `PATH_MAP_ELEM`, which is a file that was generated using the code in a later cell.** - - -# %% -# Where does each element appear? -def get_stats_elements(df: pd.DataFrame) -> pd.DataFrame: - res = df.reset_index().groupby("element_code")["item_code"].nunique() - df_elem = pd.read_csv(PATH_MAP_ELEM, index_col="code") - elem_map = df_elem["name"] + " -- " + df_elem["unit"] + " -- " + df_elem.index.astype(str) - res = res.rename(index=elem_map.to_dict()).sort_values(ascending=False) - return cast(pd.DataFrame, res) - - -# %% -# QCL -get_stats_elements(qcl_bulk) - -# %% -# FBSC -get_stats_elements(fbsc_bulk) - -# %% [markdown] -# ## 5. Reshape dataset -# This step is simple and brief. It pivots the dataset in order to have three identifying columns (i.e. "keys") and several "value" columns based on the `element_code` and `Value` columns. -# -# This format is more Grapher/Explorer friendly, as it clearly divides the dataset columns into: Entities, year, [Values]. - - -# %% -def reshape_df(df: pd.DataFrame) -> pd.DataFrame: - df = df.reset_index() - df = df.pivot(index=["country", "item_code", "year"], columns="element_code", values="value") - return df - - -# %% -# QCL -qcl_bulk = reshape_df(qcl_bulk) -# FBSC -fbsc_bulk = reshape_df(fbsc_bulk) - -# %% -print("QCL:", qcl_bulk.shape) -print("FBSC:", fbsc_bulk.shape) - -# %% [markdown] -# ## 6. Standardise Element and Item names (OPTIONAL) -# In the following cells we obtain tables with the code, current name and number of occurrences of all the Items and Elements present in our dataset. -# -# Based on these tables, Hannah (or another researcher) will revisit them and: -# - Select those Items and Elements that we are interested in. -# - Standardise naming proposals of Items and Elements. -# -# Notes: -# - We obtain the number of occurrences as this can assist the researcher in prioritising Items or Elements. - -# %% [markdown] -# ### Elements -# Here we obtain a table with the current namings for Elements (plus other variables). Note that we also propagate the unit names, as these may also be standardised (or even changed). - -# %% -# Load table from dataset containing Element information -qcl_elem = qcl_garden["meta_qcl_element"] -fbsc_elem = fbsc_garden["meta_fbs_element"] - - -# %% -def get_elements_to_standardize(df: pd.DataFrame, df_elem: pd.DataFrame) -> pd.DataFrame: - # Obtain number of occurrences for each element_code (each column is an element) - elements = pd.DataFrame(df.notna().sum()).reset_index() - elements = elements.sort_values(0, ascending=False) # type: ignore - # Add names and unit info to the table - elements = elements.merge( - df_elem[["element", "unit", "unit_description"]], - left_on="element_code", - right_index=True, - ) - # Rename column names - elements = elements.rename( - columns={ - "element_code": "code", - 0: "number_occurrences", - "element": "name", - "unit": "unit", - "unit_description": "unit_description", - } - )[["code", "name", "unit", "unit_description", "number_occurrences"]] - return elements - - -# %% -elements_qcl = get_elements_to_standardize(qcl_bulk, qcl_elem).assign(dataset="QCL") -elements_fbsc = get_elements_to_standardize(fbsc_bulk, fbsc_elem).assign(dataset="FBSC") - -assert elements_qcl.merge(elements_fbsc, on="code").empty - -# %% [markdown] -# Once the table is obtained, we take a look at it and export it. Note that we use a filename starting with `ign.`, as these are not git-tracked.
- -# %% -elements = pd.concat([elements_qcl, elements_fbsc]) -elements.head() - -# %% -# elements.to_csv("ign.food.elements.csv", index=False) - -# %% [markdown] -# ### Items -# Here we obtain a table with the current namings for Items (plus other variables). - -# %% -# Load table from dataset containing Item information -qcl_item = qcl_garden["meta_qcl_item"] -fbsc_item = fbsc_garden["meta_item"] - -# %% [markdown] -# As the following cell shows, this table comes with a multi-index, as codes may actually be referring to "item_groups" or "Items". - -# %% -qcl_item.head() - -# %% [markdown] -# Therefore, in the next cell we attempt to flatten code to name mappings. -# -# To this end: -# - We first create two separate dictionaries, mapping `item_group_code --> item_group` and `item_code --> Item`, respectively. -# - We note, however, that some codes appear both as "Items" and "item_groups". This might be because there is more than one level of items. That is, an Item can "belong" to an item_group, which in turn belongs to yet a higher-up item_group. Therefore, we remove these codes from the item dictionary so they only appear in the item_group dictionary. -# - Next, we create a table with all items, their occurrences, whether they are item_groups, and their FAO original namings. - - -# %% -def get_items_to_standardize(df: pd.DataFrame, df_item: pd.DataFrame) -> pd.DataFrame: - # Group - map_item_g = dict( - zip( - df_item.index.get_level_values("item_group_code").astype(str), - df_item["item_group"], - ) - ) - # Item - map_item = dict(zip(df_item.index.get_level_values("item_code").astype(str), df_item["item"])) - - # Correct - map_item = {k: v for k, v in map_item.items() if k not in map_item_g} - - # Load item occurrences - items = ( - pd.DataFrame(df.reset_index()["item_code"].value_counts()) - .reset_index() - .astype(str) - .rename( - columns={ - "index": "code", - "item_code": "number_occurences", - } - ) - ) - # Add flag for groups - items["type"] = items["code"].isin(map_item_g).apply(lambda x: "Group" if x else None) - # Add name - map_item_all = {**map_item, **map_item_g} - items["name"] = items.code.replace(map_item_all) - # Order columns - items = items[["code", "name", "type", "number_occurences"]] - return items - - -# %% -items_qcl = get_items_to_standardize(qcl_bulk, qcl_item).assign(dataset="QCL") -items_fbsc = get_items_to_standardize(fbsc_bulk, fbsc_item).assign(dataset="FBSC") -items = pd.concat([items_qcl, items_fbsc]) - -# %% [markdown] -# Once the table is obtained, we take a look at it and export it. Note that we use a filename starting with `ign.`, as these are not git-tracked. - -# %% -items.head() - -# %% -# items.to_csv("ign.food.items.csv", index=False) - -# %% [markdown] -# ## 7. Renaming Items and Elements -# After the previous step, where we shared files `ign.food.items.csv` and `ign.food.elements.csv` with a researcher, they will review them and add the standardised names for all items and elements that we intend to use. Note that if no standardised name is provided, the item or element will be discarded. -# -# Their proposals come in two files: `food_explorer.items.std.csv` and `food_explorer.elements.std.csv`. Note that we prefer working with the mapping `"item/element_code" ---> "new standardised item/element name"`. - -# %% [markdown] -# ### Element - -# %% [markdown] -# First of all, we load the standardisation table and remove NaN values (these belong to to-be-discarded elements).
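Before loading the real file, it may help to see the expected layout. The columns consumed later in this notebook are `name_standardised`, `unit_name_standardised_with_conversion` and `unit_factor` (alongside the `name` and `unit` columns used earlier); the toy row below is purely illustrative and does not reproduce the actual contents of `food_explorer.elements.std.csv`:

```python
import pandas as pd

# Illustrative stand-in for food_explorer.elements.std.csv (values invented).
mapping = pd.DataFrame(
    {
        "code": [5510],
        "name": ["Production"],
        "unit": ["tonnes"],
        "name_standardised": ["Production"],
        "unit_name_standardised_with_conversion": ["tonnes"],
        "unit_factor": [1.0],  # multiply old values by this to get the new unit
    }
).set_index("code")
print(mapping)
```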
- -# %% -# Get standardised values -df = pd.read_csv(PATH_MAP_ELEM, index_col="code") -df = df.dropna(subset=["name_standardised"]) - -# %% [markdown] -# If we display the content of the standardisation element file we observe that: -# - Only some elements are preserved. -# - There are the columns `unit_name_standardised_with_conversion` and `unit_factor`, which provide the new unit and the factor to convert the old one into the new one. -# - Multiple codes are assigned to the same `name_standardised` and `unit_name_standardised_with_conversion`, which means that we will have to merge them. In particular, element "Yield" with unit "kg/animal" appears with four different codes! - -# %% -# Show -df - -# %% [markdown] -# We keep the columns in the data file that belong to the "elements of interest" (those with a renaming). - -# %% -# Filter elements of interest -qcl_bulk = qcl_bulk[[col for col in df.index if col in qcl_bulk.columns]] -fbsc_bulk = fbsc_bulk[[col for col in df.index if col in fbsc_bulk.columns]] - -# %% [markdown] -# We modify the values of some elements, based on the new units and `unit_factor` values. - -# %% -# Factor -qcl_bulk = qcl_bulk.multiply(df.loc[qcl_bulk.columns, "unit_factor"]) -fbsc_bulk = fbsc_bulk.multiply(df.loc[fbsc_bulk.columns, "unit_factor"]) - -# %% [markdown] -# Next, we merge codes into single codes: -# - **Yield**: `5417, 5420, 5424, 5410 ---> 5417` (QCL) -# - **Animals slaughtered**: `5320, 5321 ---> 5320` (QCL) -# -# As previously highlighted, all of them are mapped to the same (name, unit) tuple. - -# %% -# QCL -item_code_merge = { - 5417: [5420, 5424, 5410], - 5320: [5321], -} -items_drop = [ii for i in item_code_merge.values() for ii in i] -for code_new, codes_old in item_code_merge.items(): - for code_old in codes_old: - qcl_bulk[code_new] = qcl_bulk[code_new].fillna(qcl_bulk[code_old]) -qcl_bulk = qcl_bulk.drop(columns=items_drop) - -# %% [markdown] -# Finally, we rename the column names (so far element_codes) to more prosaic element identifiers (`[element-name]__[unit]`). - -# %% -# Build element name -a = df["name_standardised"].apply(lambda x: x.lower().replace(" ", "_")).astype(str) -b = df["unit_name_standardised_with_conversion"].apply(lambda x: x.lower().replace(" ", "_")).astype(str) -df["element_name"] = (a + "__" + b).tolist() -# Obtain dict element_code -> element name -map_elem = df["element_name"].to_dict() - -# %% -# Change column names -qcl_bulk = qcl_bulk.rename(columns=map_elem) -fbsc_bulk = fbsc_bulk.rename(columns=map_elem) - -# %% -# Show dataframe with standardised element names -qcl_bulk.head() - -# %% [markdown] -# ### Item -# We now load the standardisation item table and remove `NaN` values (these belong to to-be-discarded items). - -# %% -# Get standardised values -df = pd.read_csv(PATH_MAP_ITEM, index_col="code") -map_item_std = df.dropna(subset=["name_standardised"])["name_standardised"].to_dict() - -# %% [markdown] -# Briefly display the first 10 mappings. - -# %% -{k: v for (k, v) in list(map_item_std.items())[:10]} - -# %% [markdown] -# Next, we do a simple check of item name uniqueness. Note that we can have multiple codes assigned to the same `name_standardised`, as part of the standardisation process, BUT these should be in different datasets so we don't have any element conflicts.
- -# %% -# Show "fused" products from QCL and FBSC -x = pd.DataFrame.from_dict(map_item_std, orient="index", columns=["name"]).reset_index() -x = x.groupby("name").index.unique().apply(list) -x = x[x.apply(len) > 1] -print("There are", len(x), "fused products:\n", x) - -# %% -# Check `code` --> `name_standardised` is unique in each dataset -assert ( - df.dropna(subset=["name_standardised"]).reset_index().groupby(["dataset", "name_standardised"]).code.nunique().max() - == 1 -) - -# %% [markdown] -# Next, we filter out items that we are not interested in and add a new column (`product`) with the standardised item names. - - -# %% -def standardise_product_names(df: pd.DataFrame) -> pd.DataFrame: - df = df.reset_index() - df = df[df["item_code"].isin(map_item_std)] - df.loc[:, "product"] = df["item_code"].replace(map_item_std).tolist() - df = df.drop(columns=["item_code"]) - # Set back index - df = df.set_index(["product", "country", "year"]) - return df - - -# %% -qcl_bulk = standardise_product_names(qcl_bulk) -fbsc_bulk = standardise_product_names(fbsc_bulk) - -# %% [markdown] -# ## 8. Dataset merge -# Here we add the final processing steps: -# - Merge datasets `QCL` + `FBSC` -# - Discard products (former items) that do not contain any value for the "elements of interest". - -# %% -# Merge datasets -fe_bulk = pd.merge(qcl_bulk, fbsc_bulk, how="outer", left_index=True, right_index=True) - -# %% -print("QCL // shape:", qcl_bulk.shape, "/ not-NaN:", qcl_bulk.notna().sum().sum()) -print("FBSC // shape:", fbsc_bulk.shape, "/ not-NaN:", fbsc_bulk.notna().sum().sum()) -print("FE // shape:", fe_bulk.shape, "/ not-NaN:", fe_bulk.notna().sum().sum()) - -# %% -# Drop nulls (some products dont have any value for the elements of interest) -fe_bulk = fe_bulk.dropna(how="all") -print("FE (after NaN-drop):", fe_bulk.shape) - -# %% -print(fe_bulk.shape) -fe_bulk.head() - -# %% [markdown] -# ## 9. Post processing -# In this section we obtain the metrics for all regions and add per-capita counterparts. So far, we include income groups by the World Bank, continents as defined by OWID and World. The values for these entities are obtained using only data present in the dataset (i.e. some countries may be missing). 
-# -# -# - Normalize metrics -# - Add population column -# - Weight columns -# - Rename columns -# - Obtain metrics for regions -# - Add population column, including regions - -# %% -# fe_bulk_orig = fe_bulk.copy() - -# %% -fe_bulk = fe_bulk.reset_index() - -# %% [markdown] -# ### 9.0 Build population table - -# %% -# Load population dataset -indicators = catalog.Dataset(PATH_DATASET_POPULATION) -population = indicators["population"][["population"]].reset_index() - -# %% -# Load from gapminder (former countries) -# more info: https://github.com/open-numbers/ddf--gapminder--systema_globalis/blob/master/ddf--entities--geo--country.csv -gapminder = catalog.Dataset(PATH_DATASET_POPULATION_GAPMINDER) -population_gap = ( - gapminder["total_population_with_projections"] - .reset_index() - .rename(columns={"time": "year", "total_population_with_projections": "population"}) -) - -gapminder_country_codes = { - "ussr": "USSR", - "cheslo": "Czechoslovakia", - "yug": "Yugoslavia", - "eri_a_eth": "Ethiopia (former)", - "scg": "Serbia and Montenegro", -} -former_states = list(gapminder_country_codes.values()) - -population_gap = population_gap[population_gap.geo.isin(gapminder_country_codes)] -population_gap = population_gap.assign(country=population_gap.geo.map(gapminder_country_codes)).drop(columns=["geo"]) - -# Filter years (former states only for past interval, not overlapping with current countries) -date_window = ( - fe_bulk[fe_bulk.country.isin(former_states)].groupby("country").year.agg(["min", "max"]).to_dict(orient="index") -) -population_ = [] -for state, dates in date_window.items(): - df_ = population_gap[ - (population_gap.country == state) - & (population_gap.year >= dates["min"]) - & (population_gap.year <= dates["max"]) - ] - population_.append(df_) - -population_gap = pd.concat(population_, ignore_index=True) - -# Index -population_gap = population_gap.set_index(["country", "year"], verify_integrity=True) - -# %% -# Ensure no overlapping -former_to_current = { - "USSR": [ - "Lithuania", - "Georgia", - "Estonia", - "Latvia", - "Ukraine", - "Moldova", - "Kyrgyzstan", - "Uzbekistan", - "Tajikistan", - "Armenia", - "Azerbaijan", - "Turkmenistan", - "Belarus", - "Russia", - "Kazakhstan", - ], - "Yugoslavia": [ - "Croatia", - "Slovenia", - "North Macedonia", - "Bosnia and Herzegovina", - "Serbia", - "Montenegro", - ], - "Czechoslovakia": ["Czechia", "Slovakia"], - "Ethiopia (former)": ["Ethiopia", "Eritrea"], - "Serbia and Montenegro": ["Serbia", "Montenegro"], - "Sudan (former)": ["Sudan", "South Sudan"], -} -former_states = list(former_to_current.keys()) - -for former, current in former_to_current.items(): - msk = fe_bulk.country.isin(current) - current_start = fe_bulk.loc[msk, "year"].min() - former_end = fe_bulk.loc[fe_bulk.country == former, "year"].max() - assert former_end < current_start - -# %% -# Estimate Sudan (former) -msk = population.country.isin(["South Sudan", "Sudan"]) & (population.year < 2012) -pop_sudan = population[msk].groupby("year", as_index=False).population.sum().assign(country="Sudan (former)") -population = pd.concat([pop_sudan, population], ignore_index=True) -date_window = date_window | {"Sudan (former)": {"min": 1961, "max": 2011}} - -# %% -# Filter current states that did not exist -msk = None -for former, current in former_to_current.items(): - if msk is None: - msk = population.country.isin(former_to_current[former]) & (population.year <= date_window[former]["max"]) - else: - msk |= population.country.isin(former_to_current[former]) & (population.year <= 
date_window[former]["max"]) -population = population[(population.year >= fe_bulk.year.min()) & (population.year <= fe_bulk.year.max())].astype( - {"year": int} -) -population = population.loc[~msk] # type: ignore -population = population.set_index(["country", "year"], verify_integrity=True) - -# %% -# Merge -population = pd.concat([population, population_gap]) - -# %% [markdown] -# ### 9.1 Normalize metrics -# In this section, we undo the _per_capita_ part of some metrics. We do this so we can aggregate countries into regions and later normalize by the total population. - -# %% [markdown] -# #### Add population column - -# %% -countries_pop = set(population.index.levels[0]) # type: ignore -countries = set(fe_bulk.country) -print(f"Missing {len(countries_missing := countries.difference(countries_pop))} countries: {countries_missing}") -if len(countries_missing) > 17: - raise ValueError("More countries missing than expected!") - -# %% -shape_first = fe_bulk.shape[0] -fe_bulk = fe_bulk.merge(population, left_on=["country", "year"], right_on=["country", "year"]) -print(f"Decrease of {round(100*(1-fe_bulk.shape[0]/shape_first))}% rows") - -# %% [markdown] -# #### Weight columns - -# %% -# Define which columns will be weighted -keyword = "_per_capita" -columns_per_capita = {col: col.replace(keyword, "") for col in fe_bulk.columns if keyword in col} -# Normalize and rename columns -fe_bulk[list(columns_per_capita)] = fe_bulk[list(columns_per_capita)].multiply(fe_bulk["population"], axis=0) -fe_bulk = fe_bulk.rename(columns=columns_per_capita).drop(columns=["population"]) - -# %% [markdown] -# ### 9.2 Add regions -# Here we obtain the metrics for each region (continents, income groups and World). We avoid computing the aggregates for metrics relative to land use and animal use, as for these we would need the amount of land and the number of animals used per country. We can estimate `yield__tonnes_per_ha` from other available metrics, but will leave `yield__kg_per_animal` as NaN for all regions, as shown in the sketch below.
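The reason ratio metrics cannot simply be summed across countries is worth spelling out. A small sketch with invented numbers: summing per-hectare yields is meaningless, while recomputing the ratio from the summed totals (which is what is done for `yield__tonnes_per_ha` further below) gives the correct region value:

```python
import pandas as pd

# Two invented countries belonging to the same region.
df = pd.DataFrame({"production__tonnes": [90.0, 10.0], "area_harvested__ha": [30.0, 10.0]})
df["yield__tonnes_per_ha"] = df["production__tonnes"] / df["area_harvested__ha"]  # 3.0 and 1.0

# Summing the ratios (4.0) has no physical meaning; the region's yield must be
# recomputed from the aggregated totals instead.
region_yield = df["production__tonnes"].sum() / df["area_harvested__ha"].sum()
print(region_yield)  # 100 / 40 = 2.5 tonnes/ha
```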
- -# %% [markdown] -# #### Create mappings Country ---> Region - -# %% -# Load region map -with open(PATH_REGIONS, "r") as f: - regions = json.load(f) -regions_all = ["World"] + list(regions) - -income = [ - "High-income countries", - "Low-income countries", - "Lower-middle-income countries", - "Upper-middle-income countries", -] -continents = [ - "Antarctica", - "Africa", - "Asia", - "Europe", - "South America", - "North America", - "Oceania", -] -country2continent = {vv: k for k, v in regions.items() for vv in v if k in continents} -country2income = {vv: k for k, v in regions.items() for vv in v if k in income} - -# %% -# Ensure former states presence -country2continent["Sudan (former)"] = "Africa" -country2income = { - **country2income, - "Czechoslovakia": "High-income countries", - "Ethiopia (former)": "Low-income countries", - "Serbia and Montenegro": "Upper-middle-income countries", - "Yugoslavia": "Upper-middle-income countries", - "USSR": "Upper-middle-income countries", - "Sudan (former)": "Low-income countries", -} -for state in former_states: - assert state in country2continent - -# %% [markdown] -# #### Remove default regions (if any) - -# %% -fe_bulk = fe_bulk.loc[~fe_bulk.country.isin(regions_all)].reset_index(drop=True) - -# %% [markdown] -# #### Function and variables to get metrics for regions -# Definition of functions recurrently needed and some variables - - -# %% -def get_df_regions( - df: pd.DataFrame, - mapping: Dict[Any, Any], - column_location: str, - columns_index: List[str], - columns_aggregate: Optional[List[str]] = None, -) -> pd.DataFrame: - # TODO: flag whenever all (production__tonnes, area_harvested__ha) are available - # Continents - df_regions = df.assign(**{column_location: df[column_location].replace(mapping)}) - if columns_aggregate is not None: - df_regions = df_regions.groupby(columns_index, as_index=False)[columns_aggregate].sum(min_count=1) - else: - df_regions = df_regions.groupby(columns_index, as_index=False).sum(min_count=1) - # Only keep new regions - msk = df_regions[column_location].isin(set(mapping.values())) - df_regions = df_regions.loc[msk] - print(f"{round(100*df_regions.shape[0]/df.shape[0], 2)}% increase in rows") - return df_regions - - -# %% -columns_index = ["product", "country", "year"] -columns_exclude = columns_index + ["yield__tonnes_per_ha", "yield__kg_per_animal"] -columns_aggregate = [col for col in fe_bulk.columns if col not in columns_exclude] - -# %% [markdown] -# #### Estimate region data - -# %% -# World -fe_bulk_world = ( - fe_bulk.groupby(["product", "year"], as_index=False)[columns_aggregate].sum(min_count=1).assign(country="World") -) -print(f"{round(100*fe_bulk_world.shape[0]/fe_bulk.shape[0], 2)}% increase in rows") -# Continents -fe_bulk_continent = get_df_regions(fe_bulk, country2continent, "country", columns_index, columns_aggregate) -# Income groups -fe_bulk_income = get_df_regions(fe_bulk, country2income, "country", columns_index, columns_aggregate) - -# %% [markdown] -# #### Merge - -# %% -# Concatenate -fe_bulk = pd.concat([fe_bulk, fe_bulk_world, fe_bulk_continent, fe_bulk_income]) - -# %% [markdown] -# #### Add missing metrics for regions - -# %% -msk = ( - (fe_bulk.country.isin(regions_all)) & (fe_bulk["area_harvested__ha"] != 0) & (~fe_bulk["area_harvested__ha"].isna()) -) -fe_bulk.loc[msk, "yield__tonnes_per_ha"] = ( - fe_bulk.loc[msk, "production__tonnes"] / fe_bulk.loc[msk, "area_harvested__ha"] -) - -# %% [markdown] -# ### 9.3 Population -# Next, we will add a column with the population of each 
country (or region). Note that some regions are not present in the population dataset, hence we first need to add these. - -# %% -# Load population dataset -population = population.reset_index() - -# %% -# Remove regions -population = population[~population.country.isin(set(country2continent.values()))] -# Remove income groups -population = population[~population.country.isin(set(country2income.values()))] - -# %% [markdown] -# #### Obtain continent and income group populations - -# %% -population_continent = get_df_regions(population, country2continent, "country", ["country", "year"]) -population_income = get_df_regions(population, country2income, "country", ["country", "year"]) - -# %% -# Concatenate -population = pd.concat([population, population_continent, population_income]) -population = population.set_index(["country", "year"]) - -# %% [markdown] -# #### Add `population` column - -# %% -fe_bulk = fe_bulk.merge(population, left_on=["country", "year"], right_index=True) - -# %% -fe_bulk = fe_bulk.set_index(["product", "country", "year"], verify_integrity=True).sort_index() - -# %% [markdown] -# ### 9.4 Value checks - -# %% [markdown] -# #### Remove values for _food_available_for_consumption__kcal_per_day -# We remove values for metric `food_available_for_consumption__kcal_per_day` whenever they seem wrong. Our criterion is to check whether, for a given `(item, country)`, this metric only has a few values. We define _few_ as below a pre-defined threshold (`threshold = 5` in the cell below). -# -# Note, here removing means assigning `NaN` to this metric for the rows considered. - -# %% -# Overview of the distribution of different metric values -res = fe_bulk.groupby( - [fe_bulk.index.get_level_values(0), fe_bulk.index.get_level_values(1)] -).food_available_for_consumption__kcal_per_day.nunique() -res[res != 0].value_counts(normalize=True).cumsum().head(10) - -# %% -# Get valid (item,country) -threshold = 5 -idx_keep = res[res < threshold].index -# Assign NaNs -index_2 = pd.Index([i[:2] for i in fe_bulk.index]) -msk = index_2.isin(idx_keep) -fe_bulk.loc[msk, "food_available_for_consumption__kcal_per_day"] = pd.NA - -# %% [markdown] -# #### Remove outliers -# Remove outliers (i.e. substitute the values with `NaN`). - -# %% -# Define for each column (metric) which indices should be 'removed' -with open(PATH_OUTLIERS, "r") as f: - outliers = json.load(f) - -# %% -for datapoints in outliers: - fe_bulk.loc[datapoints["index"], datapoints["column"]] = pd.NA - -# %% [markdown] -# ### 9.5 Correct region entities -# For some `product`, `metric` and `year` no value can be estimated for certain regions. This is because a big chunk of the region's population (i.e. countries) is missing. In this section we filter these entries out. - -# %% [markdown] -# For this processing step, we melt the dataframe and divide it into two parts: -# - Country data -# - Region data (continents, income groups) - -# %% -fe_bulk_orig = fe_bulk.copy() -fe_bulk_melted = fe_bulk.reset_index().melt(id_vars=["product", "country", "year", "population"], var_name="metric") - -# %% -# Drop nan values -fe_bulk_melted = fe_bulk_melted.dropna(subset="value") -# Exclude regions -regions_ = continents + income + ["World"] -msk = fe_bulk_melted.country.isin(regions_) -fe_bulk_melted_countries = fe_bulk_melted[~msk] -fe_bulk_melted_regions = fe_bulk_melted[msk] - -# %% [markdown] -# Next, we build a dataframe `x` which contains the _population difference_ for each region given a product, metric and year.
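As a worked toy example of the quantity computed in `build_df` below (all numbers invented): if a region's full population is 1,000 but the countries that actually report a metric only cover 700 people, the relative gap is 0.3, which exceeds the `t1 = 0.24` threshold used later, so the region's aggregate would be dropped for that metric and year:

```python
population_gt = 1_000  # full population of the region ("ground truth")
population = 700       # population of the countries present for this (product, metric, year)

population_diff_perc = (population_gt - population) / population_gt
print(population_diff_perc)          # 0.3
print(population_diff_perc >= 0.24)  # True -> this entry would be filtered out
```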
- - -# %% -def build_df(x: pd.DataFrame, ncountries: bool = True) -> pd.DataFrame: - # add number of countries and population in present countries - population_ = x.groupby(["product", "metric", "region", "year"]).population.sum().tolist() - x = x.groupby(["product", "metric", "region", "year"], as_index=False).country.nunique() - x = x.assign( - population=population_, - ) - - # add real population - population_ = population.reset_index().astype({"year": float}) - x = x.merge(population_, left_on=["region", "year"], right_on=["country", "year"]).rename( - columns={"population_y": "population_gt", "population_x": "population"} - ) - if ncountries: - # add real number of countries - region_size = [] - for r, members in regions.items(): - region_size.append({"region": r, "ncountries_gt": len(members)}) - r = pd.DataFrame(region_size) - x = x.merge(r, left_on="region", right_on="region") - # build df - x = pd.DataFrame(x) - # diff population - x = x.assign( - population_diff=x.population_gt - x.population, - population_diff_perc=(x.population_gt - x.population) / x.population_gt, - ) - return x - - -# %% -# continents -x_cont = build_df(fe_bulk_melted_countries.assign(region=fe_bulk_melted_countries.country.map(country2continent))) -# income groups -x_inco = build_df(fe_bulk_melted_countries.assign(region=fe_bulk_melted_countries.country.map(country2income))) -# world -x_world = build_df(fe_bulk_melted_countries.assign(region="World"), ncountries=False) -# merge -x = pd.concat([x_cont, x_inco, x_world], ignore_index=True) - -# %% [markdown] -# We now merge `x` with `fe_bulk_melted_regions` and filter out all entries that have a `population difference` greater than `t1`. - -# %% -# Merge -cols_merge = ["product", "region", "year", "metric"] -fe_bulk_melted_regions = fe_bulk_melted_regions.merge( - x[cols_merge + ["population", "population_diff_perc"]], - left_on=["product", "country", "year", "metric"], - right_on=["product", "region", "year", "metric"], - how="left", -) -fe_bulk_melted_regions = fe_bulk_melted_regions.rename(columns={"population_x": "population"}) - -# %% -# Checks after merge -msk = fe_bulk_melted_regions.isna().any(axis=1) -values_to_remove = fe_bulk_melted_regions.loc[msk, "value"].unique() -if not all(values_to_remove == [0.011428571428571429, 0.0]) or msk.sum() > 60: - raise ValueError(f"Re-check merge: {msk.sum()}, {values_to_remove}") -# Filter NaNs (controlled) -fe_bulk_melted_regions = fe_bulk_melted_regions[~msk] - -# %% -# Filter all samples with > T1 -## Threshold -t1 = 0.24 # Selected such that no datapoint for product='Total' is lost -t1_backup = fe_bulk_melted_regions[(fe_bulk_melted_regions["product"] == "Total")].population_diff_perc.max() -assert t1 > t1_backup -## Only apply to these metrics -metrics = [ - "food_available_for_consumption__fat_g_per_day", - "food_available_for_consumption__kcal_per_day", - "food_available_for_consumption__kg_per_year", - "food_available_for_consumption__protein_g_per_day", - "other_uses__tonnes", - "waste_in_supply_chain__tonnes", - "feed__tonnes", -] - -fe_bulk_melted_regions = fe_bulk_melted_regions[ - ~((fe_bulk_melted_regions.population_diff_perc >= t1) & (fe_bulk_melted_regions.metric.isin(metrics))) - | (fe_bulk_melted_regions["product"] == "Total") -] - -# %% -# Fix population for > 0 -fe_bulk_melted_regions = fe_bulk_melted_regions.assign(population_per_capita=fe_bulk_melted_regions.population) -msk = (fe_bulk_melted_regions.population_per_capita > 0) & (fe_bulk_melted_regions.metric.isin(metrics)) 
-fe_bulk_melted_regions.loc[msk, "population_per_capita"] = fe_bulk_melted_regions.loc[msk, "population_y"] - -# %% [markdown] -# Next, we estimate per-capita values. - -# %% -# Estimate per_capita -fe_bulk_melted_regions = pd.DataFrame(fe_bulk_melted_regions) -fe_bulk_melted_regions = fe_bulk_melted_regions.assign( - metric_capita=fe_bulk_melted_regions.metric + "__per_capita", - value_capita=fe_bulk_melted_regions.value / fe_bulk_melted_regions.population_per_capita, -) -fe_bulk_melted_countries = pd.DataFrame(fe_bulk_melted_countries) -fe_bulk_melted_countries = fe_bulk_melted_countries.assign( - metric_capita=fe_bulk_melted_countries.metric + "__per_capita", - value_capita=fe_bulk_melted_countries.value / fe_bulk_melted_countries.population, -) - -# %% [markdown] -# Time to pivot back. - -# %% -cols = [ - "product", - "country", - "year", - "metric", - "population", - "value", - "metric_capita", - "value_capita", -] -r = pd.concat([fe_bulk_melted_countries[cols], fe_bulk_melted_regions[cols]], ignore_index=True) - -# %% -# Pivot -fe_bulk_absolute = ( - r.pivot( - index=["product", "country", "year", "population"], - columns="metric", - values="value", - ) - .reset_index() - .set_index(["product", "country", "year"]) -) -fe_bulk_capita = ( - r.pivot( - index=["product", "country", "year", "population"], - columns="metric_capita", - values="value_capita", - ) - .reset_index() - .set_index(["product", "country", "year"]) - .drop(columns=["population"]) -) - -# %% [markdown] -# Build `fe_bulk` back again. - -# %% -fe_bulk = pd.merge(fe_bulk_absolute, fe_bulk_capita, left_index=True, right_index=True, how="outer") - -# %% -# CHECK -# fe_bulk.loc["Maize", "Asia"]["food_available_for_consumption__kcal_per_day__per_capita"] - -# %% [markdown] -# ### 9.6 Remove former countries -# We want the values reported for former states to count towards regions (continents, income groups), but we do not want them to appear on their own in the explorer. Therefore, we eliminate these from the final dataset. - -# %% -fe_bulk = fe_bulk.reset_index() -fe_bulk = fe_bulk.loc[~fe_bulk.country.isin(former_states)] - -# %% [markdown] -# #### Set index - -# %% -fe_bulk = fe_bulk.set_index(["product", "country", "year"], verify_integrity=True).sort_index() - -# %% [markdown] -# ### 9.7 Remove unused columns - -# %% -# Remove unused columns (https://github.com/owid/etl/pull/134#issuecomment-1076883200) -columns_remove = [ - "food_available_for_consumption__fat_g_per_day", - "food_available_for_consumption__kcal_per_day", - "food_available_for_consumption__kg_per_year", - "food_available_for_consumption__protein_g_per_day", - "yield__kg_per_animal__per_capita", - "yield__tonnes_per_ha__per_capita", -] - -fe_bulk = fe_bulk.drop(columns=columns_remove) - -# %% [markdown] -# ### 9.8 Remove all zero series -# Here we detect all `(country, product, metric)` whose time series is all zeros and set them to `NaN`. This way, the metric will be ignored in Grapher for the given country and product.
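One subtlety of the mask used in the next cell: `nunique` ignores `NaN` by default, so a series of zeros interspersed with `NaN`s also satisfies `(nunique == 1) & (sum == 0)`. A hedged sketch with invented data:

```python
import numpy as np
import pandas as pd

s = pd.Series([0.0, 0.0, np.nan, 0.0])  # a toy (country, product, metric) time series
is_all_zero = (s.nunique() == 1) and (s.sum() == 0)  # NaNs are skipped by both calls
print(is_all_zero)  # True -> the whole series would be replaced with NaN
```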
- -# %% -# Unpivot -x = fe_bulk.melt(var_name="metric", ignore_index=False).reset_index() - -# %% -# Find (product, country, metric) with all zeros (or NaNs) -res = x.groupby(["product", "country", "metric"]).agg(value_sum=("value", "sum"), value_nunique=("value", "nunique")) -msk = (res["value_nunique"] == 1) & (res["value_sum"] == 0) & (res.index.get_level_values(2) != "population") -idx = msk[msk].index - -# %% -# Replace with NaNs -xx = x.set_index(["product", "country", "metric"]) -xx.loc[idx, "value"] = np.nan -xx = xx.reset_index() - -# %% -# Pivot back -fe_bulk = xx.pivot(index=["product", "country", "year"], columns="metric", values="value").astype(fe_bulk.dtypes) - -# %% [markdown] -# ## 10. Export -# Time to export the shining brand new dataset! -# -# We export it in two flavours: bulk and file-per-product formats. The former is the standard format, while the latter is intended to power OWID tools such as explorers. - -# %% [markdown] -# ### Define metadata -# Prior to export, we need to create the metadata content for this dataset. It basically propagates the metadata from its building pieces (QCL and FBSC). -# -# For this dataset, we use namespace `explorers`, which is intended for datasets aimed at powering explorers (this may change). - - -# %% -metadata = DatasetMeta( - namespace="explorers", - short_name="food_explorer", - title="Food Explorer: Livestock & Crops, Food Balances - FAO (2017, 2021)", - description=( - "This dataset has been created by Our World in Data, merging existing FAOstat datasets. In particular, we have used 'Crops and livestock products' (QCL) and 'Food Balances' (FBSH and FBS) datasets. Each row contains all the " - "metrics for a specific combination of (country, product, year). The metrics may come from different datasets." - ), - sources=qcl_garden.metadata.sources + fbsc_garden.metadata.sources, - licenses=qcl_garden.metadata.licenses + fbsc_garden.metadata.licenses, -) - -# %% [markdown] -# ### In bulk - -# %% [markdown] -# Preserve the bulk file for QA or manual analysis. - -# %% [markdown] -# #### Create metadata for fields -# Here we create the content for the `field` metadata field, which contains metric-specific information.
- -# %% -# Load table from dataset containing Element information -qcl_elem = qcl_garden["meta_qcl_element"] -fbsc_elem = fbsc_garden["meta_fbs_element"] -qcl_elem["name_std"] = qcl_elem.index.map(map_elem) -fbsc_elem["name_std"] = fbsc_elem.index.map(map_elem) -element_metadata = pd.concat([qcl_elem.dropna().assign(dataset="QCL"), fbsc_elem.dropna().assign(dataset="FBS")]) -# Final patch -patch = { - "food_available_for_consumption__fat_g_per_day_per_capita": "food_available_for_consumption__fat_g_per_day", - "food_available_for_consumption__protein_g_per_day_per_capita": "food_available_for_consumption__protein_g_per_day", - "food_available_for_consumption__kcal_per_day_per_capita": "food_available_for_consumption__kcal_per_day", - "food_available_for_consumption__kg_per_capita_per_year": "food_available_for_consumption__kg_per_year", -} -element_metadata["name_std"] = element_metadata["name_std"].replace(patch) - - -# %% -# Fill 'easy' fields -def _get_source_ids(dataset_code: str) -> List[int]: - res = [i for i, source in enumerate(metadata.sources) if f"{dataset_code}" in source.owid_data_url] - return res - - -def _build_description_extra(fe_bulk: pd.DataFrame, col: str) -> str: - num_products = len(set(fe_bulk[col].dropna().index.get_level_values(0))) - num_countries = len(set(fe_bulk[col].dropna().index.get_level_values(1))) - description = f"This metric is present in {num_products} products and {num_countries} countries." - return description - - -def _get_sources_and_licenses(dataset_code: str) -> Dict[str, Any]: - source_ids = _get_source_ids(dataset_code) - sources = [metadata.sources[i] for i in source_ids] - licenses = [metadata.licenses[i] for i in source_ids] - return {"sources": sources, "licenses": licenses} - - -fields = {} -columns = list(fe_bulk.columns) + fe_bulk.index.names -for col in columns: - msk = element_metadata.name_std == col - if msk.sum() == 0: - if "__per_capita" in col: - msk = element_metadata.name_std == col.replace("__per_capita", "") - if msk.sum() == 0: - msk = element_metadata.name_std == f"{col}_per_capita" - - if msk.sum() == 1: - dataset_code = element_metadata.loc[msk, "dataset"].item() - description = element_metadata.loc[msk, "description"].item() - fields[col] = catalog.VariableMeta( - title="", - description=description, - **_get_sources_and_licenses(dataset_code), - display={"description_extra": _build_description_extra(fe_bulk, col)}, - ) - elif msk.sum() > 1: - dataset_codes = element_metadata.loc[msk, "dataset"] - if dataset_codes.nunique() != 1: - raise ValueError(f"Merged metrics should all be from the same dataset! 
Check {col}") - dataset_code = dataset_codes.unique()[0] - fields[col] = catalog.VariableMeta( - title="", - description="", - **_get_sources_and_licenses(dataset_code), - display={"description_extra": _build_description_extra(fe_bulk, col)}, - ) - else: - fields[col] = catalog.VariableMeta() - -# %% -# Check missing fields -cols_missing = [f for f, v in fields.items() if v.description == ""] -cols_missing_check = { - "exports__tonnes", - "imports__tonnes", - "producing_or_slaughtered_animals__animals", - "yield__kg_per_animal", - "exports__tonnes__per_capita", - "food_available_for_consumption__fat_g_per_day__per_capita", - "food_available_for_consumption__kcal_per_day__per_capita", - "food_available_for_consumption__kg_per_year__per_capita", - "food_available_for_consumption__protein_g_per_day__per_capita", - "imports__tonnes__per_capita", - "producing_or_slaughtered_animals__animals__per_capita", -} -assert set(cols_missing) == cols_missing_check - -# %% -# fields['exports__tonnes']['description'] = -# fields['imports__tonnes']['description'] = -# fields['producing_or_slaughtered_animals__animals']['description'] = -# fields['yield__kg_per_animal']['description'] = "Yield is measured as the quantity produced per unit area of land used to grow it." -# fields['food_available_for_consumption__fat_g_per_day']['description'] = -# fields['food_available_for_consumption__kcal_per_day']['description'] = -# fields['food_available_for_consumption__kg_per_year']['description'] = -# fields['food_available_for_consumption__protein_g_per_day']['description'] = - -# %% [markdown] -# #### Create table - -# %% -table_bulk = catalog.Table(fe_bulk).copy() -table_bulk.metadata.short_name = "bulk" -table_bulk._fields = fields - -# %% [markdown] -# ### One file per product - -# %% [markdown] -# To work in an explorer, we need to add the table in CSV format. To make it more scalable for use, we want -# to split that dataset into many small files, one per product. - - -# %% -def to_short_name(raw: str) -> str: - return raw.lower().replace(" ", "_").replace(",", "").replace("(", "").replace(")", "").replace(".", "") - - -# the index contains values like "Asses" which have already been filtered out from the data, -# let's remove them -fe_bulk.index = fe_bulk.index.remove_unused_levels() # type: ignore - -tables_products = {} - -for product in sorted(fe_bulk.index.levels[0]): # type: ignore - short_name = to_short_name(product) - print(f"{product} --> {short_name}.csv") - - t = catalog.Table(fe_bulk.loc[[product]]) - t.metadata.short_name = short_name - - tables_products[product] = t - - -# %% [markdown] -# ### Create dataset and fill it with tables and metadata - -# %% -### One file per product - - -def run(dest_dir: str) -> None: - # Initialize dataset - fe_garden = catalog.Dataset.create_empty(dest_dir) - fe_garden.metadata = metadata - fe_garden.save() - - # Add bulk table - fe_garden.add(table_bulk) - - # Add products - for _, t in tables_products.items(): - fe_garden.add(t, formats=["csv"]) # <-- note we include CSV format here - - -# %% [markdown] -# Let's check that the biggest files are still an ok size for an explorer. 
- -# %% -# !du -hs {dest_dir}/*.csv | sort -hr | head -n 10 - -# %% [markdown] -# The biggest is 3.1MB (csv), we should be ok ✓ - -# %% -# # Comparison with previous (live) export -# product = 'vegetables' -# df_new = pd.read_csv(f'/tmp/food_explorer/{product}.csv') -# df_old = pd.read_csv(f'https://owid-catalog.nyc3.digitaloceanspaces.com/garden/explorers/2021/food_explorer/{product}.csv') - -# %% -# # Plot metric -# import matplotlib.pyplot as plt -# plt.rcParams['figure.figsize'] = [10, 7] -# plt.rcParams['figure.dpi'] = 100 # 200 e.g. is really fine, but slower -# metric = "food_available_for_consumption__kcal_per_day" -# # country = "Europe" -# country = "High-income countries" -# product = "Total" -# ( -# fe_bulk.loc[(product, country), metric] -# / fe_bulk.loc[(product, country), "population"] -# ).plot(x="year", title=f"Food Supply in {country} ({product})", ylim=[0,3500]) - -# %% -# for former, current in former_to_current.items(): -# print(former) -# for c in current: -# print(c, country2income[c]) -# print('---') diff --git a/etl/steps/archive/explorers/owid/2021/food_explorer.regions.json b/etl/steps/archive/explorers/owid/2021/food_explorer.regions.json deleted file mode 100644 index 78f712531a1..00000000000 --- a/etl/steps/archive/explorers/owid/2021/food_explorer.regions.json +++ /dev/null @@ -1,527 +0,0 @@ -{ - "High-income countries": [ - "Andorra", - "Anguilla", - "Antigua and Barbuda", - "Aruba", - "Australia", - "Austria", - "Bahamas", - "Bahrain", - "Barbados", - "Belgium", - "Bermuda", - "British Virgin Islands", - "Brunei", - "Canada", - "Cayman Islands", - "Channel Islands", - "Chile", - "Croatia", - "Curacao", - "Cyprus", - "Czechia", - "Denmark", - "Estonia", - "Faroe Islands", - "Falkland Islands", - "Finland", - "France", - "French Polynesia", - "Germany", - "Gibraltar", - "Greece", - "Greenland", - "Guam", - "Guernsey", - "Hong Kong", - "Hungary", - "Iceland", - "Ireland", - "Isle of Man", - "Israel", - "Italy", - "Japan", - "Jersey", - "Kuwait", - "Latvia", - "Liechtenstein", - "Lithuania", - "Luxembourg", - "Macao", - "Malta", - "Monaco", - "Montserrat", - "Nauru", - "Netherlands", - "New Caledonia", - "New Zealand", - "Northern Cyprus", - "Northern Mariana Islands", - "Norway", - "Oman", - "Palau", - "Poland", - "Portugal", - "Puerto Rico", - "Qatar", - "Saint Helena", - "Saint Kitts and Nevis", - "Saint Martin (French part)", - "San Marino", - "Saudi Arabia", - "Seychelles", - "Singapore", - "Sint Maarten (Dutch part)", - "Slovakia", - "Slovenia", - "South Korea", - "Spain", - "Sweden", - "Switzerland", - "Taiwan", - "Trinidad and Tobago", - "Turks and Caicos Islands", - "United Arab Emirates", - "United Kingdom", - "United States", - "United States Virgin Islands", - "Uruguay", - "Wallis and Futuna" - ], - "Low-income countries": [ - "Afghanistan", - "Burkina Faso", - "Burundi", - "Central African Republic", - "Chad", - "Democratic Republic of Congo", - "Eritrea", - "Ethiopia", - "Gambia", - "Guinea", - "Guinea-Bissau", - "Liberia", - "Madagascar", - "Malawi", - "Mali", - "Mozambique", - "Niger", - "North Korea", - "Rwanda", - "Sierra Leone", - "Somalia", - "South Sudan", - "Sudan", - "Syria", - "Togo", - "Uganda", - "Yemen" - ], - "Lower-middle-income countries": [ - "Algeria", - "Angola", - "Bangladesh", - "Belize", - "Benin", - "Bhutan", - "Bolivia", - "Cambodia", - "Cameroon", - "Cape Verde", - "Comoros", - "Congo", - "Cote d'Ivoire", - "Djibouti", - "Egypt", - "El Salvador", - "Eswatini", - "Ghana", - "Haiti", - "Honduras", - "India", - "Indonesia", - 
"Iran", - "Kenya", - "Kiribati", - "Kyrgyzstan", - "Laos", - "Lesotho", - "Mauritania", - "Micronesia (country)", - "Mongolia", - "Morocco", - "Myanmar", - "Nepal", - "Nicaragua", - "Nigeria", - "Pakistan", - "Palestine", - "Papua New Guinea", - "Philippines", - "Samoa", - "Sao Tome and Principe", - "Senegal", - "Solomon Islands", - "Sri Lanka", - "Tajikistan", - "Tanzania", - "East Timor", - "Tunisia", - "Ukraine", - "Uzbekistan", - "Vanuatu", - "Vietnam", - "Zambia", - "Zimbabwe" - ], - "Upper-middle-income countries": [ - "Albania", - "American Samoa", - "Argentina", - "Armenia", - "Azerbaijan", - "Belarus", - "Bosnia and Herzegovina", - "Botswana", - "Brazil", - "Bulgaria", - "China", - "Colombia", - "Costa Rica", - "Cuba", - "Dominica", - "Dominican Republic", - "Ecuador", - "Equatorial Guinea", - "Fiji", - "Gabon", - "Georgia", - "Grenada", - "Guatemala", - "Guyana", - "Iraq", - "Jamaica", - "Jordan", - "Kazakhstan", - "Kosovo", - "Lebanon", - "Libya", - "Malaysia", - "Maldives", - "Marshall Islands", - "Mauritius", - "Mexico", - "Moldova", - "Montenegro", - "Namibia", - "North Macedonia", - "Panama", - "Paraguay", - "Peru", - "Romania", - "Russia", - "Saint Lucia", - "Saint Vincent and the Grenadines", - "Serbia", - "South Africa", - "Suriname", - "Thailand", - "Tonga", - "Turkey", - "Turkmenistan", - "Tuvalu" - ], - "Antarctica": [ - "Bouvet Island", - "French Southern Territories", - "Heard Island and McDonald Islands", - "South Georgia and the South Sandwich Islands" - ], - "Africa": [ - "Algeria", - "Angola", - "Benin", - "Botswana", - "Burkina Faso", - "Burundi", - "Cameroon", - "Cape Verde", - "Central African Republic", - "Chad", - "Comoros", - "Congo", - "Cote d'Ivoire", - "Democratic Republic of Congo", - "Djibouti", - "Egypt", - "Equatorial Guinea", - "Eritrea", - "Ethiopia (former)", - "Eswatini", - "Ethiopia", - "Gabon", - "Gambia", - "Ghana", - "Guinea", - "Guinea-Bissau", - "Kenya", - "Lesotho", - "Liberia", - "Libya", - "Madagascar", - "Malawi", - "Mali", - "Mauritania", - "Mauritius", - "Mayotte", - "Morocco", - "Mozambique", - "Namibia", - "Niger", - "Nigeria", - "Reunion", - "Rwanda", - "Saint Helena", - "Sao Tome and Principe", - "Senegal", - "Seychelles", - "Sierra Leone", - "Somalia", - "South Africa", - "South Sudan", - "Sudan", - "Tanzania", - "Togo", - "Tunisia", - "Uganda", - "Western Sahara", - "Zambia", - "Zanzibar", - "Zimbabwe" - ], - "Asia": [ - "Afghanistan", - "Armenia", - "Azerbaijan", - "Bahrain", - "Bangladesh", - "Bhutan", - "British Indian Ocean Territory", - "Brunei", - "Cambodia", - "China", - "Christmas Island", - "Cocos Islands", - "Georgia", - "Hong Kong", - "India", - "Indonesia", - "Iran", - "Iraq", - "Israel", - "Japan", - "Jordan", - "Kazakhstan", - "Korea", - "Kuwait", - "Kyrgyzstan", - "Laos", - "Lebanon", - "Macao", - "Malaysia", - "Maldives", - "Mongolia", - "Myanmar", - "Nepal", - "North Korea", - "Oman", - "Pakistan", - "Palestine", - "Philippines", - "Qatar", - "Republic of Vietnam", - "Saudi Arabia", - "Singapore", - "South Korea", - "Sri Lanka", - "Syria", - "Taiwan", - "Tajikistan", - "Thailand", - "East Timor", - "Turkey", - "Turkmenistan", - "United Arab Emirates", - "Uzbekistan", - "Vietnam", - "Yemen", - "Yemen Arab Republic", - "Yemen People's Republic" - ], - "Europe": [ - "Aland Islands", - "Albania", - "Andorra", - "Austria", - "Austria-Hungary", - "Baden", - "Bavaria", - "Belarus", - "Belgium", - "Bosnia and Herzegovina", - "Bulgaria", - "Channel Islands", - "Croatia", - "Cyprus", - "Czechia", - "Czechoslovakia", - 
"Denmark", - "East Germany", - "Estonia", - "Faroe Islands", - "Finland", - "France", - "Germany", - "Gibraltar", - "Greece", - "Guernsey", - "Hanover", - "Hesse Electoral", - "Hesse Grand Ducal", - "Hungary", - "Iceland", - "Ireland", - "Isle of Man", - "Italy", - "Jersey", - "Kosovo", - "Latvia", - "Liechtenstein", - "Lithuania", - "Luxembourg", - "Malta", - "Mecklenburg Schwerin", - "Modena", - "Moldova", - "Monaco", - "Montenegro", - "Netherlands", - "North Macedonia", - "Northern Cyprus", - "Norway", - "Parma", - "Poland", - "Portugal", - "Romania", - "Russia", - "San Marino", - "Saxony", - "Serbia", - "Serbia and Montenegro", - "Slovakia", - "Slovenia", - "Spain", - "Svalbard and Jan Mayen", - "Sweden", - "Switzerland", - "Tuscany", - "Two Sicilies", - "Ukraine", - "United Kingdom", - "USSR", - "Vatican", - "West Germany", - "Wuerttemburg", - "Yugoslavia" - ], - "South America": [ - "Argentina", - "Bolivia", - "Brazil", - "Caribbean Netherlands", - "Chile", - "Colombia", - "Ecuador", - "Falkland Islands", - "French Guiana", - "Guyana", - "Paraguay", - "Peru", - "Suriname", - "Uruguay", - "Venezuela" - ], - "North America": [ - "Anguilla", - "Antigua and Barbuda", - "Aruba", - "Bahamas", - "Barbados", - "Belize", - "Bermuda", - "Bonaire Sint Eustatius and Saba", - "British Virgin Islands", - "Canada", - "Cayman Islands", - "Costa Rica", - "Cuba", - "Curacao", - "Dominica", - "Dominican Republic", - "El Salvador", - "Greenland", - "Grenada", - "Guadeloupe", - "Guatemala", - "Haiti", - "Honduras", - "Jamaica", - "Martinique", - "Mexico", - "Montserrat", - "Netherlands Antilles", - "Nicaragua", - "Panama", - "Puerto Rico", - "Saint Barthlemy", - "Saint Kitts and Nevis", - "Saint Lucia", - "Saint Martin (French part)", - "Saint Pierre and Miquelon", - "Saint Vincent and the Grenadines", - "Sint Maarten (Dutch part)", - "Trinidad and Tobago", - "Turks and Caicos Islands", - "United States", - "United States Virgin Islands" - ], - "Oceania": [ - "American Samoa", - "Australia", - "Cook Islands", - "Fiji", - "French Polynesia", - "Guam", - "Kiribati", - "Marshall Islands", - "Micronesia (country)", - "Nauru", - "New Caledonia", - "New Zealand", - "Niue", - "Norfolk Island", - "Northern Mariana Islands", - "Palau", - "Papua New Guinea", - "Pitcairn", - "Samoa", - "Solomon Islands", - "Tokelau", - "Tonga", - "Tuvalu", - "United States Minor Outlying Islands", - "Vanuatu", - "Wallis and Futuna" - ] -} diff --git a/etl/steps/archive/explorers/owid/latest/food_explorer.py b/etl/steps/archive/explorers/owid/latest/food_explorer.py deleted file mode 100644 index cc80cae3d62..00000000000 --- a/etl/steps/archive/explorers/owid/latest/food_explorer.py +++ /dev/null @@ -1,89 +0,0 @@ -"""Food explorer data step. - -Loads the latest faostat_food_explorer dataset from garden and stores a table (as a csv file) for each food product. - -NOTE: It will overwrite csv files inside "data/explorers/owid/latest/food_explorer". - -""" - -import sys -from copy import deepcopy - -from owid import catalog -from tqdm.auto import tqdm - -from etl.helpers import PathFinder - -paths = PathFinder(__file__) - -# Rename columns to be used by the food explorer. -# Note: Include here all columns, even if the name is not changed. 
-EXPECTED_COLUMNS = { - "population": "population", - "area_harvested__hectares": "area_harvested__ha", - "area_harvested__hectares_per_capita": "area_harvested__ha__per_capita", - "domestic_supply__tonnes": "domestic_supply__tonnes", - "domestic_supply__tonnes_per_capita": "domestic_supply__tonnes__per_capita", - "exports__tonnes": "exports__tonnes", - "exports__tonnes_per_capita": "exports__tonnes__per_capita", - "feed__tonnes": "feed__tonnes", - "feed__tonnes_per_capita": "feed__tonnes__per_capita", - "food__tonnes": "food__tonnes", - "food__tonnes_per_capita": "food__tonnes__per_capita", - "food_available_for_consumption__grams_of_fat_per_day_per_capita": "food_available_for_consumption__fat_g_per_day__per_capita", - "food_available_for_consumption__kilocalories_per_day_per_capita": "food_available_for_consumption__kcal_per_day__per_capita", - "food_available_for_consumption__kilograms_per_year_per_capita": "food_available_for_consumption__kg_per_year__per_capita", - "food_available_for_consumption__grams_of_protein_per_day_per_capita": "food_available_for_consumption__protein_g_per_day__per_capita", - "imports__tonnes": "imports__tonnes", - "imports__tonnes_per_capita": "imports__tonnes__per_capita", - "other_uses__tonnes": "other_uses__tonnes", - "other_uses__tonnes_per_capita": "other_uses__tonnes__per_capita", - "producing_or_slaughtered_animals__animals": "producing_or_slaughtered_animals__animals", - "producing_or_slaughtered_animals__animals_per_capita": "producing_or_slaughtered_animals__animals__per_capita", - "production__tonnes": "production__tonnes", - "production__tonnes_per_capita": "production__tonnes__per_capita", - "waste_in_supply_chain__tonnes": "waste_in_supply_chain__tonnes", - "waste_in_supply_chain__tonnes_per_capita": "waste_in_supply_chain__tonnes__per_capita", - "yield__kilograms_per_animal": "yield__kg_per_animal", - "yield__tonnes_per_hectare": "yield__tonnes_per_ha", -} - - -def run(dest_dir: str) -> None: - # Load the dataset for FAOSTAT food explorer from garden. - dataset_garden: catalog.Dataset = paths.load_dependency("faostat_food_explorer") - - # Get the table of all food products. - table_garden = dataset_garden["all_products"] - - # Initialize new garden dataset. - dataset = catalog.Dataset.create_empty(dest_dir) - # Add dataset metadata. - dataset.metadata = deepcopy(dataset_garden.metadata) - dataset.metadata.namespace = "owid" - dataset.metadata.short_name = "food_explorer" - dataset.metadata.version = "latest" - # Create new dataset in garden. - dataset.save() - - # List all products in table - products = sorted(table_garden.index.get_level_values("product").unique().tolist()) - - for product in tqdm(products, file=sys.stdout): - # Save a table (as a separate csv file) for each food product. - table_product = table_garden.loc[product].copy() - # Update table metadata. - table_product.title = product - - # Rename columns, select the required ones, and sort columns and rows conveniently. - table_product = table_product[list(EXPECTED_COLUMNS)].rename(columns=EXPECTED_COLUMNS) - table_product = table_product[ - ["population"] + [column for column in sorted(table_product.columns) if column not in ["population"]] - ] - table_product = table_product.sort_index() - - table_product.metadata.short_name = ( - catalog.utils.underscore(name=product, validate=True).replace("__", "_").replace("_e_g_", "_eg_") - ) - # Add table to dataset. Force publication in csv. 
- dataset.add(table_product, formats=["csv"]) diff --git a/etl/steps/archive/garden/agriculture/2023-04-20/long_term_wheat_yields.meta.yml b/etl/steps/archive/garden/agriculture/2023-04-20/long_term_wheat_yields.meta.yml deleted file mode 100644 index dfb8088a6e7..00000000000 --- a/etl/steps/archive/garden/agriculture/2023-04-20/long_term_wheat_yields.meta.yml +++ /dev/null @@ -1,11 +0,0 @@ -dataset: - title: Long-term wheat yields (agriculture, 2023) - description: | - This dataset combines data for European countries from two key sources: - + Data prior to 1961 is sourced from Table 1.2 of Understanding Green Revolutions, by Bayliss-Smith & Wanmali (1984). - + Wheat yields from 1961 onwards are as reported by the Food and Agriculture Organization of the United Nations. - - All values of yield have been converted to tonnes per hectare. - -tables: - {} diff --git a/etl/steps/archive/garden/agriculture/2023-04-20/long_term_wheat_yields.py b/etl/steps/archive/garden/agriculture/2023-04-20/long_term_wheat_yields.py deleted file mode 100644 index 3e9df46bea6..00000000000 --- a/etl/steps/archive/garden/agriculture/2023-04-20/long_term_wheat_yields.py +++ /dev/null @@ -1,99 +0,0 @@ -"""Load a meadow dataset and create a garden dataset.""" - -import pandas as pd -from owid.catalog import Dataset, Table -from owid.datautils.dataframes import combine_two_overlapping_dataframes -from structlog import get_logger - -from etl.helpers import PathFinder, create_dataset - -log = get_logger() - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Item code for "Wheat". -ITEM_CODE_FOR_WHEAT = "00000015" - -# Element code for "Yield". -ELEMENT_CODE_FOR_YIELD = "005419" - - -def run(dest_dir: str) -> None: - log.info("long_term_wheat_yields.start") - - # - # Load inputs. - # - # Load long-term wheat yield data from Bayliss-Smith & Wanmali (1984). - ds_bayliss: Dataset = paths.load_dependency("bayliss_smith_wanmali_1984") - - # Read main table from dataset. - tb_bayliss = ds_bayliss["long_term_wheat_yields"] - - # Create a convenient dataframe. - df_bayliss = pd.DataFrame(tb_bayliss).reset_index() - - # Load faostat data on crops and livestock products. - ds_qcl: Dataset = paths.load_dependency("faostat_qcl") - - # Read main table from dataset. - tb_qcl = ds_qcl["faostat_qcl"] - - # Create a convenient dataframe. - df_qcl = pd.DataFrame(tb_qcl).reset_index() - - # - # Process data. - # - # Select the relevant item and element from faostat data. - # Also, select only countries that appear in the Bayliss-Smith & Wanmali (1984) dataset. - df_qcl = df_qcl[ - (df_qcl["item_code"] == ITEM_CODE_FOR_WHEAT) - & (df_qcl["element_code"] == ELEMENT_CODE_FOR_YIELD) - & (df_qcl["country"].isin(df_bayliss["country"].unique())) - ].reset_index(drop=True) - - # Sanity check. - error = "Units of yield have changed." - assert list(df_qcl["unit"].unique()) == ["tonnes per hectare"], error - - # Prepare variable description. - element_description = df_qcl["element_description"].drop_duplicates().item() - - # Transpose data. - df_qcl = ( - df_qcl.pivot(index=["country", "year"], columns="item", values="value") - .reset_index() - .rename(columns={"Wheat": "wheat_yield"}, errors="raise") - ) - - # Combine Bayliss and faostat data. - combined = combine_two_overlapping_dataframes(df1=df_qcl, df2=df_bayliss, index_columns=["country", "year"]) - - # Set an appropriate index and sort conveniently.
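The combine step above prefers df1 (the FAOSTAT series) wherever both frames have a value, and keeps df2 (the historical series) elsewhere. A sketch of that behavior using pandas combine_first, assuming combine_two_overlapping_dataframes resolves overlaps this way (which is what the "prioritizing" comments in these steps describe), on toy data:

```python
import pandas as pd

# Modern FAOSTAT series (df1) and historical series (df2), overlapping in 1961.
df_faostat = pd.DataFrame({"country": ["France", "France"], "year": [1961, 1962], "wheat_yield": [2.4, 2.5]})
df_history = pd.DataFrame({"country": ["France", "France"], "year": [1950, 1961], "wheat_yield": [1.1, 1.2]})

# combine_first keeps the caller's values wherever both frames have data.
combined = (
    df_faostat.set_index(["country", "year"])
    .combine_first(df_history.set_index(["country", "year"]))
    .reset_index()
)
print(combined)  # 1950 -> 1.1 (history), 1961 -> 2.4 (FAOSTAT wins), 1962 -> 2.5
```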
- combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index() - - # Create a new table with the processed data. - tb_garden = Table(combined, short_name="long_term_wheat_yields") - - # Add variable metadata, using faostat unit and element description (there was no item description). - tb_garden["wheat_yield"].metadata.title = "Wheat yields" - tb_garden["wheat_yield"].metadata.description = f"Long-term wheat yields.\n{element_description}" - tb_garden["wheat_yield"].metadata.unit = "tonnes per hectare" - tb_garden["wheat_yield"].metadata.short_unit = "tonnes/ha" - - # - # Save outputs. - # - # Create a new garden dataset with the same metadata as the meadow dataset. - ds_garden = create_dataset(dest_dir, tables=[tb_garden]) - - # Combine sources and licenses. - ds_garden.metadata.sources = ds_bayliss.metadata.sources + ds_qcl.metadata.sources - ds_garden.metadata.licenses = ds_bayliss.metadata.licenses + ds_qcl.metadata.licenses - - # Save changes in the new garden dataset. - ds_garden.save() - - log.info("long_term_wheat_yields.end") diff --git a/etl/steps/archive/garden/agriculture/2023-04-21/uk_long_term_yields.meta.yml b/etl/steps/archive/garden/agriculture/2023-04-21/uk_long_term_yields.meta.yml deleted file mode 100644 index f6fc66d533f..00000000000 --- a/etl/steps/archive/garden/agriculture/2023-04-21/uk_long_term_yields.meta.yml +++ /dev/null @@ -1,15 +0,0 @@ -dataset: - title: Long-term yields in the United Kingdom (various sources, 2023) - description: | - This dataset on agricultural yields in the United Kingdom was constructed from yield data from three key sources: - - • Data from 1270 to 1870 is taken from Table 3.06 of Broadberry et al. (2015). The data in this table is based on the Medieval Accounts Database, the Early Modern Probate Inventories Database and the Modern Farm Accounts Database. Seed sown per acre from the Medieval and Modern Databases. Pulses for the modern period and all seeds sown for the early modern period are taken from Overton and Campbell (1996), Allen (2005). - This comprises crop yield estimates only for England. For this dataset, we have assumed that yields in England are also representative of average UK yields. The data was given as decadal averages, and we have assumed, for each value, the middle year in each decade. - All values of yield in bushels per acre have been converted to tonnes per hectare, using the conversion factors given by [the USDA](https://www.ers.usda.gov/webdocs/publications/41880/33132_ah697_002.pdf) for the different commodities. - - • Data from 1870 to 1960 is taken from Table 4 of Brassley (2000). The data in this table is based on the book "A hundred Years of British food and farming: a statistical survey", by H. F. Marks (ed. D. K. Britton, 1989). The data is provided over 5-year periods. We have assumed, for each value, the middle year in each 5-year set. - - • Data from 1961 onwards is sourced from the Food and Agriculture Organization of the United Nations. 
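The bushels-per-acre to tonnes-per-hectare conversion mentioned above reduces to one factor per commodity. A sketch assuming the usual USDA test weights (60 lb/bushel for wheat, 48 for barley); these constants are illustrative, not necessarily the exact factors the step used:

```python
# Test weights (lb per bushel) vary by commodity; treat these as illustrative.
LB_PER_BUSHEL = {"wheat": 60, "barley": 48}
LB_TO_TONNES = 0.00045359237
ACRES_TO_HECTARES = 0.40468564


def bushels_per_acre_to_tonnes_per_hectare(value: float, commodity: str) -> float:
    return value * LB_PER_BUSHEL[commodity] * LB_TO_TONNES / ACRES_TO_HECTARES


# Example: 30 bu/ac of wheat is roughly 2.02 t/ha.
print(round(bushels_per_acre_to_tonnes_per_hectare(30, "wheat"), 2))
```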
- -tables: - {} diff --git a/etl/steps/archive/garden/agriculture/2023-04-21/uk_long_term_yields.py b/etl/steps/archive/garden/agriculture/2023-04-21/uk_long_term_yields.py deleted file mode 100644 index a74f5dba6f6..00000000000 --- a/etl/steps/archive/garden/agriculture/2023-04-21/uk_long_term_yields.py +++ /dev/null @@ -1,124 +0,0 @@ -"""Load historical data on UK yields and combine it with the latest FAOSTAT data.""" - -import pandas as pd -from owid.catalog import Dataset, Table -from owid.datautils.dataframes import combine_two_overlapping_dataframes -from structlog import get_logger - -from etl.helpers import PathFinder, create_dataset - -log = get_logger() - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Element code for "Yield". -ELEMENT_CODE_FOR_YIELD = "005419" - -# Item codes for required items. -ITEM_CODE_FOR_WHEAT = "00000015" -ITEM_CODE_FOR_BARLEY = "00000044" -ITEM_CODE_FOR_OATS = "00000075" -ITEM_CODE_FOR_POTATOES = "00000116" -ITEM_CODE_FOR_PULSES = "00001726" -ITEM_CODE_FOR_RYE = "00000071" -ITEM_CODE_FOR_SUGAR_BEET = "00000157" -ITEM_CODES = [ - ITEM_CODE_FOR_WHEAT, - ITEM_CODE_FOR_BARLEY, - ITEM_CODE_FOR_OATS, - ITEM_CODE_FOR_POTATOES, - ITEM_CODE_FOR_PULSES, - ITEM_CODE_FOR_RYE, - ITEM_CODE_FOR_SUGAR_BEET, -] - - -def run(dest_dir: str) -> None: - log.info("uk_long_term_yields.start") - - # - # Load inputs. - # - # Load UK long-term yields data from Broadberry et al. (2015). - ds_broadberry: Dataset = paths.load_dependency("broadberry_et_al_2015") - - # Read main table from dataset. - tb_broadberry = ds_broadberry["broadberry_et_al_2015"] - - # Create a convenient dataframe. - df_broadberry = pd.DataFrame(tb_broadberry).reset_index() - - # Load UK long-term yields data from Brassley (2000). - ds_brassley: Dataset = paths.load_dependency("brassley_2000") - - # Read main table from dataset. - tb_brassley = ds_brassley["brassley_2000"] - - # Create a convenient dataframe. - df_brassley = pd.DataFrame(tb_brassley).reset_index() - - # Load faostat data on crop and livestock production. - ds_qcl: Dataset = paths.load_dependency("faostat_qcl") - - # Read main table from dataset. - tb_qcl = ds_qcl["faostat_qcl"] - - # Create a convenient dataframe. - df_qcl = pd.DataFrame(tb_qcl).reset_index() - - # - # Process data. - # - # Select required country, element and items. - df_qcl = df_qcl[ - (df_qcl["country"] == "United Kingdom") - & (df_qcl["element_code"] == ELEMENT_CODE_FOR_YIELD) - & (df_qcl["item_code"].isin(ITEM_CODES)) - ].reset_index(drop=True) - - # Sanity check. - error = "Units for yield have changed." - assert list(df_qcl["unit"].unique()) == ["tonnes per hectare"], error - - # Transpose data. - df_qcl = df_qcl.pivot(index=["country", "year"], columns=["item"], values=["value"]) - df_qcl.columns = [column[1].lower().replace(" ", "_") + "_yield" for column in df_qcl.columns] - df_qcl = df_qcl.reset_index() - - # Combine historical data. - df_historical = combine_two_overlapping_dataframes( - df1=df_broadberry, df2=df_brassley, index_columns=["country", "year"] - ) - - # Combine historical data with faostat data. - df_combined = combine_two_overlapping_dataframes(df1=df_qcl, df2=df_historical, index_columns=["country", "year"]) - - # Set an appropriate index and sort conveniently. - df_combined = df_combined.set_index(["country", "year"], verify_integrity=True).sort_index() - - # Create a new table with the processed data. - tb_garden = Table(df_combined, short_name=paths.short_name) - - # Add variable metadata. 
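The pivot above turns FAOSTAT's long layout (one row per country, year and item) into one yield column per item, flattening the resulting column MultiIndex. A minimal sketch with toy data:

```python
import pandas as pd

# Long FAOSTAT-style layout: one row per (country, year, item).
df = pd.DataFrame(
    {
        "country": ["United Kingdom"] * 4,
        "year": [1961, 1961, 1962, 1962],
        "item": ["Wheat", "Barley", "Wheat", "Barley"],
        "value": [3.1, 2.9, 3.2, 3.0],
    }
)

# Pivot to wide, then flatten the column MultiIndex the same way the step does.
wide = df.pivot(index=["country", "year"], columns=["item"], values=["value"])
wide.columns = [col[1].lower().replace(" ", "_") + "_yield" for col in wide.columns]
wide = wide.reset_index()
print(wide.columns.tolist())  # ['country', 'year', 'barley_yield', 'wheat_yield']
```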
- for column in tb_garden.columns: - tb_garden[column].metadata.title = column.capitalize().replace("_", " ") - tb_garden[column].metadata.unit = "tonnes per hectare" - tb_garden[column].metadata.short_unit = "tonnes/ha" - - # - # Save outputs. - # - # Create a new garden dataset with the same metadata as the meadow dataset. - ds_garden = create_dataset(dest_dir, tables=[tb_garden]) - - # Combine sources and licenses. - ds_garden.metadata.sources = ds_broadberry.metadata.sources + ds_brassley.metadata.sources + ds_qcl.metadata.sources - ds_garden.metadata.licenses = ( - ds_broadberry.metadata.licenses + ds_brassley.metadata.licenses + ds_qcl.metadata.licenses - ) - - # Save changes in the new garden dataset. - ds_garden.save() - - log.info("uk_long_term_yields.end") diff --git a/etl/steps/archive/garden/agriculture/2023-05-26/attainable_yields.meta.yml b/etl/steps/archive/garden/agriculture/2023-05-26/attainable_yields.meta.yml deleted file mode 100644 index 5095ef94a28..00000000000 --- a/etl/steps/archive/garden/agriculture/2023-05-26/attainable_yields.meta.yml +++ /dev/null @@ -1,6 +0,0 @@ -dataset: - title: Attainable yields (various sources, 2023) - description: | - Attainable yields are estimates of feasible crop yields calculated from high-yielding areas of similar climate. They are more conservative than biophysical 'potential yields', but should be achievable using current technologies and management (e.g. fertilizers and irrigation). Attainable yields are based on assessments for the year 2000. Attainable yield pre-2000 may be lower; and post-2000 may be higher than these values. - - Yield gaps have been calculated by Our World in Data as the attainable yields, as reported by Mueller et al. (2012), minus the actual observed yields, as reported by the UN FAO. Negative values have been clipped to zero (meaning that the attainable yield has been reached). diff --git a/etl/steps/archive/garden/agriculture/2023-05-26/attainable_yields.py b/etl/steps/archive/garden/agriculture/2023-05-26/attainable_yields.py deleted file mode 100644 index d05edb4617c..00000000000 --- a/etl/steps/archive/garden/agriculture/2023-05-26/attainable_yields.py +++ /dev/null @@ -1,173 +0,0 @@ -"""Combine attainable yields from Mueller et al. (2012) with the latest FAOSTAT yields data. - -The resulting dataset contains: -1. item_yield: Yield from FAOSTAT (e.g. barley_yield). -2. item_attainable_yield: Maximum attainable yield from Mueller et al. (2012) (e.g. barley_attainable_yield). -3. item_yield_gap: Yield gap, which is the difference between the previous two (e.g. barley_yield_gap). - -Elements 2 and 3 are provided only for items that were included in Mueller et al. (2012), whereas element 1 is -provided also for other items. - -This dataset will be imported by the crop_yields explorers step, which feeds our Crop Yields explorer: -https://ourworldindata.org/explorers/crop-yields -""" - -import pandas as pd -from owid.catalog import Dataset, Table -from structlog import get_logger - -from etl.helpers import PathFinder, create_dataset - -log = get_logger() - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Element code for "Yield". -ELEMENT_CODE_FOR_YIELD = "005419" - -ITEM_CODES = { - # Items included in Mueller et al. (2012): - # Item code for "Barley". - "barley": "00000044", - # Item code for "Cassava, fresh" - "cassava": "00000125", - # Item code for "Seed cotton, unginned". - "cotton": "00000328", - # Item code for "Groundnuts, excluding shelled". 
- # NOTE: This was wrong, the correct item code should have been "00000242". This is corrected in the new version. - "groundnut": "00000234", - # Item code for "Maize (corn)". - "maize": "00000056", - # Item code for "Millet". - "millet": "00000079", - # Item code for "Oil palm fruit". - "oilpalm": "00000254", - # Item code for "Potatoes". - "potato": "00000116", - # Item code for "Rape or colza seed". - "rapeseed": "00000270", - # Item code for "Rice". - "rice": "00000027", - # Item code for "Rye". - "rye": "00000071", - # Item code for "Sorghum". - "sorghum": "00000083", - # Item code for "Soya beans". - "soybean": "00000236", - # Item code for "Sugar beet". - "sugarbeet": "00000157", - # Item code for "Sugar cane". - "sugarcane": "00000156", - # Item code for "Sunflower seed". - "sunflower": "00000267", - # Item code for "Wheat". - "wheat": "00000015", - # Additional items not included in Mueller et al. (2012): - # Item code for "Almonds, in shell". - "almond": "00000221", - # Item code for "Bananas". - "banana": "00000486", - # Item code for "Beans, dry". - "bean": "00000176", - # Item code for "Cereals, primary". - "cereal": "00001717", - # Item code for "Cocoa beans". - "cocoa": "00000661", - # Item code for "Coffee, green". - "coffee": "00000656", - # Item code for "Lettuce and chicory". - "lettuce": "00000372", - # Item code for "Oranges". - "orange": "00000490", - # Item code for "Peas, dry". - "pea": "00000187", - # Item code for "Tomatoes". - "tomato": "00000388", -} - - -def add_table_and_variable_metadata(tb: Table) -> Table: - # Add a short name to the combined table. - tb.metadata.short_name = "attainable_yields" - - # Update each variable's metadata. - for column in tb.columns: - title = ( - column.capitalize() - .replace("_", " ") - .replace("Oilpalm", "Oil palm") - .replace("Sugarbeet", "Sugar beet") - .replace("Sugarcane", "Sugar cane") - .replace("Sunflower", "Sunflower seed") - ) - tb[column].metadata.title = title - tb[column].metadata.unit = "tonnes per hectare" - tb[column].metadata.short_unit = "tonnes/ha" - - return tb - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load attainable yields data from Mueller et al. (2012). - ds_mueller: Dataset = paths.load_dependency("mueller_et_al_2012") - - # Read main table from dataset. - tb_mueller = ds_mueller["mueller_et_al_2012"].reset_index() - - # Load faostat data on crop and livestock production. - ds_qcl: Dataset = paths.load_dependency("faostat_qcl") - - # Read main table from dataset. - tb_qcl = ds_qcl["faostat_qcl"].reset_index() - - # - # Process data. - # - # Select required country, element and items. - tb_qcl = tb_qcl[ - (tb_qcl["element_code"] == ELEMENT_CODE_FOR_YIELD) & (tb_qcl["item_code"].isin(ITEM_CODES.values())) - ].reset_index(drop=True) - - # Sanity check. - error = "Units for yield have changed." - assert list(tb_qcl["unit"].unique()) == ["tonnes per hectare"], error - - # Transpose data. - tb_qcl = tb_qcl.pivot(index=["country", "year"], columns=["item_code"], values=["value"]) - item_code_to_name = {code: name for name, code in ITEM_CODES.items()} - tb_qcl.columns = [f"{item_code_to_name[column[1]]}_yield" for column in tb_qcl.columns] - tb_qcl = tb_qcl.reset_index() - - # Combine both tables. - tb = pd.merge(tb_qcl, tb_mueller.drop(columns=["year"]), on=["country"], how="inner") - - # Set an appropriate index and sort conveniently. 
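The yield gap computed just below is a subtraction followed by a clip at zero, exactly as the docstring and metadata describe. A minimal sketch with toy numbers:

```python
import pandas as pd

tb = pd.DataFrame(
    {
        "wheat_yield": [2.0, 5.5, 8.0],
        "wheat_attainable_yield": [6.0, 6.0, 6.0],
    }
)

# Gap = attainable - actual; negative values mean the attainable yield was reached,
# so they are clipped to zero (the same convention as in this step).
tb["wheat_yield_gap"] = (tb["wheat_attainable_yield"] - tb["wheat_yield"]).clip(0)
print(tb["wheat_yield_gap"].tolist())  # [4.0, 0.5, 0.0]
```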
- tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index() - - # Add the yield gap (difference between maximum attainable yields minus actual yield). - for item in ITEM_CODES: - if f"{item}_attainable_yield" in tb.columns: - # Clip the series at zero (negative values mean that the yield has been attained). - tb[f"{item}_yield_gap"] = (tb[f"{item}_attainable_yield"] - tb[f"{item}_yield"]).clip(0) - - # Update table and variable metadata. - tb = add_table_and_variable_metadata(tb=tb) - - # - # Save outputs. - # - # Create a new garden dataset with the same metadata as the meadow dataset. - ds_garden = create_dataset(dest_dir, tables=[tb]) - - # Combine sources and licenses. - # Skip FAOSTAT source description (which is long and mostly irrelevant for the topic at hand). - ds_qcl.metadata.sources[0].description = None - ds_garden.metadata.sources = ds_mueller.metadata.sources + ds_qcl.metadata.sources - ds_garden.metadata.licenses = ds_mueller.metadata.licenses + ds_qcl.metadata.licenses - - # Save changes in the new garden dataset. - ds_garden.save() diff --git a/etl/steps/archive/garden/agriculture/2023-05-30/attainable_yields.meta.yml b/etl/steps/archive/garden/agriculture/2023-05-30/attainable_yields.meta.yml deleted file mode 100644 index e110b45cce7..00000000000 --- a/etl/steps/archive/garden/agriculture/2023-05-30/attainable_yields.meta.yml +++ /dev/null @@ -1,6 +0,0 @@ -dataset: - title: Attainable yields (various sources, 2023b) - description: | - Attainable yields are estimates of feasible crop yields calculated from high-yielding areas of similar climate. They are more conservative than biophysical 'potential yields', but should be achievable using current technologies and management (e.g. fertilizers and irrigation). Attainable yields are based on assessments for the year 2000. Attainable yield pre-2000 may be lower; and post-2000 may be higher than these values. - - Yield gaps have been calculated by Our World in Data as the attainable yields, as reported by Mueller et al. (2012), minus the actual observed yields, as reported by the UN FAO. Negative values have been clipped to zero (meaning that the attainable yield has been reached). diff --git a/etl/steps/archive/garden/agriculture/2023-05-30/attainable_yields.py b/etl/steps/archive/garden/agriculture/2023-05-30/attainable_yields.py deleted file mode 100644 index 4b26fce0f12..00000000000 --- a/etl/steps/archive/garden/agriculture/2023-05-30/attainable_yields.py +++ /dev/null @@ -1,115 +0,0 @@ -"""Combine attainable yields from Mueller et al. (2012) with the latest FAOSTAT yields data. - -The resulting dataset contains: -1. item_yield: Yield from the Long-term crop yields dataset (e.g. barley_yield). -2. item_attainable_yield: Maximum attainable yield from Mueller et al. (2012) (e.g. barley_attainable_yield). -3. item_yield_gap: Yield gap, which is the difference between the previous two (e.g. barley_yield_gap). - -Elements 2 and 3 are provided only for items that were included in Mueller et al. (2012), whereas element 1 is -provided also for other items. - -This dataset will be imported by the crop_yields explorers step, which feeds our Crop Yields explorer: -https://ourworldindata.org/explorers/crop-yields -""" - -import pandas as pd -from owid.catalog import Dataset, Table -from structlog import get_logger - -from etl.helpers import PathFinder, create_dataset - -log = get_logger() - -# Get paths and naming conventions for current step. 
-paths = PathFinder(__file__) - -# How to adjust items in the Long-term crop yields dataset to coincide with the names in Mueller et al. (2012). -COLUMNS = { - "almonds_yield": "almond_yield", - "bananas_yield": "banana_yield", - "beans__dry_yield": "bean_yield", - "cereals_yield": "cereal_yield", - "cocoa_beans_yield": "cocoa_yield", - "coffee__green_yield": "coffee_yield", - "oranges_yield": "orange_yield", - "peas__dry_yield": "pea_yield", - "tomatoes_yield": "tomato_yield", - "seed_cotton_yield": "cotton_yield", - "groundnuts_yield": "groundnut_yield", - "palm_fruit_oil_yield": "oilpalm_yield", - "potatoes_yield": "potato_yield", - "soybeans_yield": "soybean_yield", - "sugar_beet_yield": "sugarbeet_yield", - "sugar_cane_yield": "sugarcane_yield", - "sunflower_seed_yield": "sunflower_yield", -} - - -def add_table_and_variable_metadata(tb: Table) -> Table: - # Add a short name to the combined table. - tb.metadata.short_name = "attainable_yields" - - # Update each variable's metadata. - for column in tb.columns: - title = ( - column.capitalize() - .replace("_", " ") - .replace("Oilpalm", "Oil palm") - .replace("Sugarbeet", "Sugar beet") - .replace("Sugarcane", "Sugar cane") - .replace("Sunflower", "Sunflower seed") - ) - tb[column].metadata.title = title - tb[column].metadata.unit = "tonnes per hectare" - tb[column].metadata.short_unit = "tonnes/ha" - - return tb - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - - # Load attainable yields data from Mueller et al. (2012), and read its main table. - ds_mueller: Dataset = paths.load_dependency("mueller_et_al_2012") - tb_mueller = ds_mueller["mueller_et_al_2012"].reset_index() - - # Load long-term crop yields dataset and read its main table. - ds_yields: Dataset = paths.load_dependency("long_term_crop_yields") - tb_yields = ds_yields["long_term_crop_yields"].reset_index() - - # - # Process data. - # - # Rename columns from the long-term crop yields dataset, to coincide with the names of Mueller et al. (2012). - tb_yields = tb_yields.rename(columns=COLUMNS, errors="raise") - - # Combine both tables. - tb = pd.merge(tb_yields, tb_mueller.drop(columns=["year"]), on=["country"], how="inner") - - # Set an appropriate index and sort conveniently. - tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index() - - # Add the yield gap (difference between maximum attainable yields minus actual yield). - for item in [ - column.replace("_attainable_yield", "") for column in tb.columns if column.endswith("_attainable_yield") - ]: - # Clip the series at zero (negative values mean that the yield has been attained). - tb[f"{item}_yield_gap"] = (tb[f"{item}_attainable_yield"] - tb[f"{item}_yield"]).clip(0) - - # Update table and variable metadata. - tb = add_table_and_variable_metadata(tb=tb) - - # - # Save outputs. - # - # Create a new garden dataset with the same metadata as the meadow dataset. - ds_garden = create_dataset(dest_dir, tables=[tb]) - - # Combine sources and licenses. - ds_garden.metadata.sources = ds_mueller.metadata.sources + ds_yields.metadata.sources - ds_garden.metadata.licenses = ds_mueller.metadata.licenses + ds_yields.metadata.licenses - - # Save changes in the new garden dataset. 
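Mueller et al. (2012) report a single year-2000 attainable value per country, which is why the merge above drops the year column first: the constant then broadcasts to every year of the time series. A toy sketch of that merge:

```python
import pandas as pd

tb_yields = pd.DataFrame(
    {"country": ["Spain", "Spain", "Peru"], "year": [2000, 2001, 2000], "wheat_yield": [3.0, 3.2, 1.5]}
)
# One attainable value per country, assessed for the year 2000.
tb_attainable = pd.DataFrame(
    {"country": ["Spain", "Peru"], "year": [2000, 2000], "wheat_attainable_yield": [6.0, 4.0]}
)

# Dropping "year" before the merge broadcasts the constant attainable yield to every year.
tb = pd.merge(tb_yields, tb_attainable.drop(columns=["year"]), on=["country"], how="inner")
print(tb)
```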
- ds_garden.save() diff --git a/etl/steps/archive/garden/agriculture/2023-05-30/long_term_crop_yields.meta.yml b/etl/steps/archive/garden/agriculture/2023-05-30/long_term_crop_yields.meta.yml deleted file mode 100644 index 9e7f1f85f83..00000000000 --- a/etl/steps/archive/garden/agriculture/2023-05-30/long_term_crop_yields.meta.yml +++ /dev/null @@ -1,4 +0,0 @@ -dataset: - title: "Long-term crop yields (various sources, 2023)" - description: | - This is a compilation of all our data on crop yields, extracted from different sources. The majority of the data comes from FAOSTAT, where the earliest information is from 1961. However, where possible, we complement that data with other sources, to have long-run series on crop yields, going back even to the 13th century. diff --git a/etl/steps/archive/garden/agriculture/2023-05-30/long_term_crop_yields.py b/etl/steps/archive/garden/agriculture/2023-05-30/long_term_crop_yields.py deleted file mode 100644 index 1097b40d5af..00000000000 --- a/etl/steps/archive/garden/agriculture/2023-05-30/long_term_crop_yields.py +++ /dev/null @@ -1,165 +0,0 @@ -"""Load a meadow dataset and create a garden dataset.""" - -from typing import List, cast - -import numpy as np -from owid.catalog import Dataset, Table -from owid.catalog.utils import underscore -from owid.datautils.dataframes import combine_two_overlapping_dataframes -from structlog import get_logger - -from etl.helpers import PathFinder, create_dataset - -log = get_logger() - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# FAOSTAT source name (used to be able to put this source first in the list of sources of each variable). -FAOSTAT_SOURCE_NAME = "Food and Agriculture Organization of the United Nations" - -# FAOSTAT element code for "Yield". -ELEMENT_CODE_FOR_YIELD = "005419" - - -def prepare_faostat_data(tb_qcl: Table) -> Table: - # Select the relevant metric in FAOSTAT dataset. - tb_qcl = tb_qcl[tb_qcl["element_code"] == ELEMENT_CODE_FOR_YIELD].reset_index(drop=True) - - # Store FAOSTAT QCL metadata (it will be used later after transforming the table). - metadata_qcl = tb_qcl.metadata - - # Sanity check. - error = "Units of yield may have changed in FAOSTAT QCL." - assert set(tb_qcl["unit"]) == {"tonnes per hectare"}, error - - # Transpose FAOSTAT data. - tb_qcl = tb_qcl.pivot(index=["country", "year"], columns=["item"], values=["value"]) - tb_qcl.columns = [f"{underscore(column[1])}_yield" for column in tb_qcl.columns] - tb_qcl = tb_qcl.reset_index() - tb_qcl.metadata = metadata_qcl - - return tb_qcl - - -def run_sanity_checks_on_inputs(tb_qcl: Table, tb_us: Table, tb_uk: Table, tb_wheat: Table) -> None: - error = "Columns in US long-term corn yields were expected to be found in FAOSTAT QCL." - assert set(tb_us.columns) <= set(tb_qcl.columns), error - error = "Columns in UK long-term yields were expected to be found in FAOSTAT QCL." - assert set(tb_uk.columns) <= set(tb_qcl.columns), error - error = "UK long-term yields were expected to start earlier than FAOSTAT QCL." - assert set(tb_qcl[tb_qcl["country"] == "United Kingdom"]["year"]) <= set(tb_uk["year"]), error - error = "Columns in long-term wheat yields were expected to be found in FAOSTAT QCL." - assert set(tb_wheat.columns) <= set(tb_qcl.columns), error - error = "Long-term wheat yields were expected to start earlier than FAOSTAT QCL."
- assert set(tb_qcl["year"]) <= set(tb_wheat["year"]), error - - -def combine_variables_metadata(combined_table: Table, individual_tables: List[Table]) -> Table: - # Assign sources and licenses of the variables of each individual table to the variables in the combined table. - combined_table = combined_table.copy() - for column in combined_table.columns: - # Initialize sources and licenses for the current variable. - sources = [] - licenses = [] - for table in individual_tables: - if column in table.columns: - # If the current variable was in this table, assign its sources and licenses to the current variable. - for source in table.metadata.dataset.sources: - if source.name not in [known_source.name for known_source in sources]: - sources.append(source) - for license in table.metadata.dataset.licenses: - if license.name not in [known_license.name for known_license in licenses]: - licenses.append(license) - - # Given that FAOSTAT is the main source of data, place this source first in the list of sources. - sources_without_faostat = [source for source in sources if source.name != FAOSTAT_SOURCE_NAME] - # Sort the rest of the sources in alphabetical order. - sources_sorting = np.argsort([source.name for source in sources_without_faostat]) - sources = [source for source in sources if source.name == FAOSTAT_SOURCE_NAME] + np.array( - sources_without_faostat - )[sources_sorting].tolist() - - combined_table[column].metadata.sources = sources - combined_table[column].metadata.licenses = licenses - - # Generate a title for this variable using the column name. - title = column.capitalize().replace("_", " ").replace("  ", " ").replace("n e c", "n.e.c.") - combined_table[column].metadata.title = title - - # Define units. - combined_table[column].metadata.unit = "tonnes per hectare" - combined_table[column].metadata.short_unit = "t/ha" - - return combined_table - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load FAOSTAT QCL dataset and read its main table. - ds_qcl = cast(Dataset, paths.load_dependency("faostat_qcl")) - tb_qcl = ds_qcl["faostat_qcl"].reset_index() - - # Load the UK long-term yields dataset and read its main table. - ds_uk = cast(Dataset, paths.load_dependency("uk_long_term_yields")) - tb_uk = ds_uk["uk_long_term_yields"].reset_index() - - # Load the long-term US corn yields dataset and read its main table. - ds_us = cast(Dataset, paths.load_dependency("us_corn_yields")) - tb_us = ds_us["us_corn_yields"].reset_index() - - # Load the long-term wheat yields dataset and read its main table. - ds_wheat = cast(Dataset, paths.load_dependency("long_term_wheat_yields")) - tb_wheat = ds_wheat["long_term_wheat_yields"].reset_index() - - # - # Process data. - # - # Prepare FAOSTAT QCL data. - tb_qcl = prepare_faostat_data(tb_qcl=tb_qcl) - - # Rename US corn variable to be consistent with FAOSTAT QCL. - tb_us = tb_us.rename(columns={"corn_yield": "maize_yield"}, errors="raise") - - # Sanity checks. - run_sanity_checks_on_inputs(tb_qcl=tb_qcl, tb_us=tb_us, tb_uk=tb_uk, tb_wheat=tb_wheat) - - # Tables tb_uk and tb_wheat share column "wheat_yield" for the UK. - # We should keep the former, since it includes much earlier data. - - # Combine the long-term wheat yields table with FAOSTAT QCL (prioritizing the former). - tb = combine_two_overlapping_dataframes( - df1=tb_wheat, df2=tb_qcl, index_columns=["country", "year"], keep_column_order=True - ) - - # Combine the UK long-term yields with the previous table (prioritizing the former).
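The source ordering in combine_variables_metadata above puts the FAOSTAT source first and sorts the remaining sources alphabetically. A sketch of the same logic applied to plain name strings:

```python
import numpy as np

FAOSTAT_SOURCE_NAME = "Food and Agriculture Organization of the United Nations"
sources = ["Broadberry et al. (2015)", FAOSTAT_SOURCE_NAME, "Brassley (2000)"]

# Same logic as above, on plain names: FAOSTAT first, the rest alphabetically.
others = [s for s in sources if s != FAOSTAT_SOURCE_NAME]
ordered = [s for s in sources if s == FAOSTAT_SOURCE_NAME] + [others[i] for i in np.argsort(others)]
print(ordered)  # FAOSTAT, then Brassley (2000), then Broadberry et al. (2015)
```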
- tb = combine_two_overlapping_dataframes( - df1=tb_uk, df2=tb, index_columns=["country", "year"], keep_column_order=True - ) - - # Combine the US long-term corn yields with the previous table (prioritizing the former). - tb = combine_two_overlapping_dataframes( - df1=tb_us, df2=tb, index_columns=["country", "year"], keep_column_order=True - ) - - # Combine variables metadata and adjust table metadata. - tb = combine_variables_metadata(combined_table=tb, individual_tables=[tb_uk, tb_us, tb_wheat, tb_qcl]) - tb.metadata.short_name = paths.short_name - - # Set an appropriate index and sort conveniently. - tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # - # Save outputs. - # - # Create a new garden dataset with the same metadata as the meadow dataset. - ds_garden = create_dataset(dest_dir, tables=[tb]) - - # Combine sources and licenses from all involved datasets. - ds_garden.metadata.sources = sum([ds.metadata.sources for ds in [ds_uk, ds_us, ds_wheat, ds_qcl]], []) - ds_garden.metadata.licenses = sum([ds.metadata.licenses for ds in [ds_uk, ds_us, ds_wheat, ds_qcl]], []) - - # Save changes in the new garden dataset. - ds_garden.save() diff --git a/etl/steps/archive/garden/bp/2022-07-11/shared.py b/etl/steps/archive/garden/bp/2022-07-11/shared.py deleted file mode 100644 index 6df4f45b1da..00000000000 --- a/etl/steps/archive/garden/bp/2022-07-11/shared.py +++ /dev/null @@ -1,538 +0,0 @@ -from pathlib import Path -from typing import Dict, List, Optional, Union, cast - -import numpy as np -import pandas as pd -from owid import catalog -from structlog import get_logger - -from etl.data_helpers import geo -from etl.paths import DATA_DIR - -log = get_logger() - -CURRENT_DIR = Path(__file__).parent -VERSION = CURRENT_DIR.name - -# Aggregate regions to add, following OWID definitions. -REGIONS_TO_ADD = { - "North America": { - "country_code": "OWID_NAM", - }, - "South America": { - "country_code": "OWID_SAM", - }, - "Europe": { - "country_code": "OWID_EUR", - }, - # The EU27 is already included in the original BP data, with the same definition as OWID. - # "European Union (27)": { - # "country_code": "OWID_EU27", - # }, - "Africa": { - "country_code": "OWID_AFR", - }, - "Asia": { - "country_code": "OWID_ASI", - }, - "Oceania": { - "country_code": "OWID_OCE", - }, - "Low-income countries": { - "country_code": "OWID_LIC", - }, - "Upper-middle-income countries": { - "country_code": "OWID_UMC", - }, - "Lower-middle-income countries": { - "country_code": "OWID_LMC", - }, - "High-income countries": { - "country_code": "OWID_HIC", - }, -} - -# We need to include the 'Other * (BP)' regions, otherwise continents have incomplete data. -# For example, when constructing the aggregate for Africa, we need to include 'Other Africa (BP)'. -# Otherwise we would be underestimating the region's total contribution. -ADDITIONAL_COUNTRIES_IN_REGIONS = { - "Africa": [ - # Additional African regions in BP's data (e.g. 'Other Western Africa (BP)') seem to be included in - # 'Other Africa (BP)', therefore we ignore them when creating aggregates. - "Other Africa (BP)", - ], - "Asia": [ - # Adding 'Other Asia Pacific (BP)' may include areas of Oceania in Asia. - # However, it seems that this region is usually significantly smaller than Asia. - # So, we are possibly overestimating Asia, but not by a significant amount. - "Other Asia Pacific (BP)", - # Similarly, adding 'Other CIS (BP)' in Asia may include areas of Europe in Asia (e.g. Moldova). 
- # However, since most countries in 'Other CIS (BP)' are Asian, adding it is more accurate than not adding it. - "Other CIS (BP)", - # Countries defined by BP in 'Middle East' are fully included in OWID's definition of Asia. - "Other Middle East (BP)", - ], - "Europe": [ - "Other Europe (BP)", - ], - "North America": [ - "Other Caribbean (BP)", - "Other North America (BP)", - ], - "South America": [ - "Other South America (BP)", - ], - # Given that 'Other Asia and Pacific (BP)' is often similar or even larger than Oceania, we avoid including it in - # Oceania (and include it in Asia, see comment above). - # This means that we may be underestimating Oceania by a significant amount, but BP does not provide unambiguous - # data to avoid this. - "Oceania": [], -} - -# When creating region aggregates, decide how to distribute historical regions. -# The following decisions are based on the current location of the countries that succeeded the region, and their income -# group. Continent and income group assigned corresponds to the continent and income group of the majority of the -# population in the member countries. -HISTORIC_TO_CURRENT_REGION: Dict[str, Dict[str, Union[str, List[str]]]] = { - "Netherlands Antilles": { - "continent": "North America", - "income_group": "High-income countries", - "members": [ - # North America - High-income countries. - "Aruba", - "Curacao", - "Sint Maarten (Dutch part)", - ], - }, - "USSR": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "members": [ - # Europe - High-income countries. - "Lithuania", - "Estonia", - "Latvia", - # Europe - Upper-middle-income countries. - "Moldova", - "Belarus", - "Russia", - # Europe - Lower-middle-income countries. - "Ukraine", - # Asia - Upper-middle-income countries. - "Georgia", - "Armenia", - "Azerbaijan", - "Turkmenistan", - "Kazakhstan", - # Asia - Lower-middle-income countries. - "Kyrgyzstan", - "Uzbekistan", - "Tajikistan", - ], - }, -} - - -def load_population() -> pd.DataFrame: - """Load OWID population dataset, and add historical regions to it. - - Returns - ------- - population : pd.DataFrame - Population dataset. - - """ - # Load population dataset. - population = catalog.Dataset(DATA_DIR / "garden/owid/latest/key_indicators/")["population"].reset_index()[ - ["country", "year", "population"] - ] - - # Add data for historical regions (if not in population) by adding the population of its current successors. - countries_with_population = population["country"].unique() - missing_countries = [country for country in HISTORIC_TO_CURRENT_REGION if country not in countries_with_population] - for country in missing_countries: - members = HISTORIC_TO_CURRENT_REGION[country]["members"] - _population = ( - population[population["country"].isin(members)] - .groupby("year") - .agg({"population": "sum", "country": "nunique"}) - .reset_index() - ) - # Select only years for which we have data for all member countries. - _population = _population[_population["country"] == len(members)].reset_index(drop=True) - _population["country"] = country - population = pd.concat([population, _population], ignore_index=True).reset_index(drop=True) - - error = "Duplicate country-years found in population. Check if historical regions changed." - assert population[population.duplicated(subset=["country", "year"])].empty, error - - return cast(pd.DataFrame, population) - - -def load_income_groups() -> pd.DataFrame: - """Load dataset of income groups and add historical regions to it. 
- - Returns - ------- - income_groups : pd.DataFrame - Income groups data. - - """ - # Load the World Bank dataset for income groups. - income_groups = catalog.Dataset(DATA_DIR / "garden/wb/2021-07-01/wb_income")["wb_income_group"].reset_index() - - # Add historical regions to income groups. - for historic_region in HISTORIC_TO_CURRENT_REGION: - historic_region_income_group = HISTORIC_TO_CURRENT_REGION[historic_region]["income_group"] - if historic_region not in income_groups["country"].unique(): - historic_region_df = pd.DataFrame( - { - "country": [historic_region], - "income_group": [historic_region_income_group], - } - ) - income_groups = pd.concat([income_groups, historic_region_df], ignore_index=True) - - return cast(pd.DataFrame, income_groups) - - -def add_population( - df: pd.DataFrame, - country_col: str = "country", - year_col: str = "year", - population_col: str = "population", - warn_on_missing_countries: bool = True, - show_full_warning: bool = True, -) -> pd.DataFrame: - """Add a column of OWID population to the countries in the data, including population of historical regions. - - This function has been adapted from datautils.geo, because population currently does not include historic regions. - We include them in this function. - - Parameters - ---------- - df : pd.DataFrame - Data without a column for population (after harmonizing elements, items and country names). - country_col : str - Name of country column in data. - year_col : str - Name of year column in data. - population_col : str - Name for new population column in data. - warn_on_missing_countries : bool - True to warn if population is not found for any of the countries in the data. - show_full_warning : bool - True to show affected countries if the previous warning is raised. - - Returns - ------- - df_with_population : pd.DataFrame - Data after adding a column for population for all countries in the data. - - """ - - # Load population dataset. - population = load_population().rename( - columns={ - "country": country_col, - "year": year_col, - "population": population_col, - } - )[[country_col, year_col, population_col]] - - # Check if there is any missing country. - missing_countries = set(df[country_col]) - set(population[country_col]) - if len(missing_countries) > 0: - if warn_on_missing_countries: - geo.warn_on_list_of_entities( - list_of_entities=missing_countries, - warning_message=( - f"{len(missing_countries)} countries not found in population" - " dataset. They will remain in the dataset, but have nan" - " population." - ), - show_list=show_full_warning, - ) - - # Add population to original dataframe. - df_with_population = pd.merge(df, population, on=[country_col, year_col], how="left") - - return df_with_population - - -def detect_overlapping_data_for_regions_and_members( - df: pd.DataFrame, - index_columns: List[str], - regions_and_members: Dict[str, Dict[str, Union[str, List[str]]]], - known_overlaps: Optional[List[Dict[str, Union[str, List[int]]]]], - ignore_zeros: bool = True, -) -> None: - """Raise a warning if there is data for a particular region and for a country that is a member of that region. - - For example, if there is data for USSR and Russia on the same years, a warning will be raised. - - Parameters - ---------- - df : pd.DataFrame - Data. - index_columns : list - Names of columns that should be index of the data.
- regions_and_members : dict - Regions and members (where each key corresponds to a region, and each region is a dictionary of various keys, - one of which is 'members', which is a list of member countries). - known_overlaps : list or None - Instances of known overlaps in the data. If this function raises a warning, new instances should be added to the - list. - ignore_zeros : bool - True to consider zeros in the data as missing values. Doing this, if a region has overlapping data with a member - country, but one of their data points is zero, it will not be considered an overlap. - - """ - if known_overlaps is not None: - df = df.copy() - - if ignore_zeros: - # Replace zeros by nans, so that zeros are ignored when looking for overlapping data. - overlapping_values_to_ignore = [0] - else: - overlapping_values_to_ignore = [] - - regions = list(regions_and_members) - for region in regions: - # Create a dataframe with only data for the region, and remove columns that only have nans. - # Optionally, replace zeros by nans, to also remove columns that only have zeros or nans. - region_df = ( - df[df["country"] == region].replace(overlapping_values_to_ignore, np.nan).dropna(axis=1, how="all") - ) - members = regions_and_members[region]["members"] - for member in members: - # Create a dataframe for this particular member country. - member_df = ( - df[df["country"] == member].replace(overlapping_values_to_ignore, np.nan).dropna(axis=1, how="all") - ) - # Find common columns with (non-nan) data between region and member country. - variables = [ - column - for column in (set(region_df.columns) & set(member_df.columns)) - if column not in index_columns - ] - for variable in variables: - # Concatenate region and member country's data for this variable. - combined = ( - pd.concat( - [ - region_df[["year", variable]], - member_df[["year", variable]], - ], - ignore_index=True, - ) - .dropna() - .reset_index(drop=True) - ) - # Find years where region and member country overlap. - overlapping = combined[combined.duplicated(subset="year")] - if not overlapping.empty: - overlapping_years = sorted(set(overlapping["year"])) - new_overlap = { - "region": region, - "member": member, - "years": overlapping_years, - "variable": variable, - } - # Check if the overlap found is already in the list of known overlaps. - # If this overlap is not known, raise a warning. - # Omit the field "entity_to_make_nan" when checking if this overlap is known. - _known_overlaps = [ - {key for key in overlap if key != "entity_to_make_nan"} for overlap in known_overlaps - ] - if new_overlap not in _known_overlaps: # type: ignore - log.warning( - f"Data for '{region}' overlaps with '{member}' on '{variable}' " - f"and years: {overlapping_years}" - ) - - -def remove_overlapping_data_for_regions_and_members( - df: pd.DataFrame, - known_overlaps: Optional[List[Dict[str, Union[str, List[int]]]]], - country_col: str = "country", - year_col: str = "year", - ignore_zeros: bool = True, -) -> pd.DataFrame: - """Check if list of known overlaps between region (e.g. a historical region like the USSR) and a member country (or - a successor country, like Russia) do overlap, and remove them from the data. - - Parameters - ---------- - df : pd.DataFrame - Data. - known_overlaps : list or None - List of known overlaps between region and member country. - country_col : str - Name of country column. - year_col : str - Name of year column. - ignore_zeros : bool - True to ignore columns of zeros when checking if known overlaps are indeed overlaps. 
- - Returns - ------- - df : pd.DataFrame - Data after removing known overlapping rows between a region and a member country. - - """ - if known_overlaps is not None: - df = df.copy() - - if ignore_zeros: - overlapping_values_to_ignore = [0] - else: - overlapping_values_to_ignore = [] - - for i, overlap in enumerate(known_overlaps): - if set([overlap["region"], overlap["member"]]) <= set(df["country"]): - # Check that the known overlap is indeed found in the data. - duplicated_rows = ( - df[(df[country_col].isin([overlap["region"], overlap["member"]]))][ - [country_col, year_col, overlap["variable"]] - ] - .replace(overlapping_values_to_ignore, np.nan) - .dropna(subset=overlap["variable"]) - ) - duplicated_rows = duplicated_rows[duplicated_rows.duplicated(subset="year", keep=False)] - overlapping_years = sorted(set(duplicated_rows["year"])) - if overlapping_years != overlap["years"]: - log.warning(f"Given overlap number {i} is not found in the data; redefine this list.") - # Make nan data points for either the region or the member (which is specified by "entity to make nan"). - indexes_to_make_nan = duplicated_rows[ - duplicated_rows["country"] == overlap[overlap["entity_to_make_nan"]] # type: ignore - ].index.tolist() - df.loc[indexes_to_make_nan, overlap["variable"]] = np.nan - - return df - - -def load_countries_in_regions() -> Dict[str, List[str]]: - """Create a dictionary of regions (continents and income groups) and their member countries. - - Regions to include are defined above, in REGIONS_TO_ADD. - Additional countries are added to regions following the definitions in ADDITIONAL_COUNTRIES_IN_REGIONS. - - Returns - ------- - countries_in_regions : dict - Dictionary of regions, where the value is a list of member countries in the region. - - """ - # Load income groups. - income_groups = load_income_groups() - - countries_in_regions = {} - for region in list(REGIONS_TO_ADD): - # Add default OWID list of countries in region (which includes historical regions). - countries_in_regions[region] = geo.list_countries_in_region(region=region, income_groups=income_groups) - - # Include additional countries in the region (if any given). - for region in ADDITIONAL_COUNTRIES_IN_REGIONS: - countries_in_regions[region] = countries_in_regions[region] + ADDITIONAL_COUNTRIES_IN_REGIONS[region] - - return countries_in_regions - - -def add_region_aggregates( - data: pd.DataFrame, - regions: List[str], - index_columns: List[str], - country_column: str = "country", - year_column: str = "year", - aggregates: Optional[Dict[str, str]] = None, - known_overlaps: Optional[List[Dict[str, Union[str, List[int]]]]] = None, - region_codes: Optional[List[str]] = None, - country_code_column: str = "country_code", -) -> pd.DataFrame: - """Add region aggregates for all regions (which may include continents and income groups). - - Parameters - ---------- - data : pd.DataFrame - Data. - regions : list - Regions to include. - index_columns : list - Name of index columns. - country_column : str - Name of country column. - year_column : str - Name of year column. - aggregates : dict or None - Dictionary of type of aggregation to use for each variable. If None, variables will be aggregated by summing. - known_overlaps : list or None - List of known overlaps between regions and their member countries. - region_codes : list or None - List of country codes for each new region. It must have the same number of elements, and in the same order, as - the 'regions' argument. 
- country_code_column : str - Name of country codes column (only relevant of region_codes is not None). - - Returns - ------- - data : pd.DataFrame - Data after adding aggregate regions. - - """ - data = data.copy() - - if aggregates is None: - # If aggregations are not specified, assume all variables are to be aggregated, by summing. - aggregates = {column: "sum" for column in data.columns if column not in index_columns} - # Get the list of regions to create, and their member countries. - countries_in_regions = load_countries_in_regions() - for region in regions: - # List of countries in region. - countries_in_region = countries_in_regions[region] - # Select rows of data for member countries. - data_region = data[data[country_column].isin(countries_in_region)] - # Remove any known overlaps between regions (e.g. USSR, which is a historical region) in current region (e.g. - # Europe) and their member countries (or successor countries, like Russia). - # If any overlap in known_overlaps is not found, a warning will be raised. - data_region = remove_overlapping_data_for_regions_and_members(df=data_region, known_overlaps=known_overlaps) - - # Check that there are no other overlaps in the data (after having removed the known ones). - detect_overlapping_data_for_regions_and_members( - df=data_region, - regions_and_members=HISTORIC_TO_CURRENT_REGION, - index_columns=index_columns, - known_overlaps=known_overlaps, - ) - - # Add region aggregates. - data_region = geo.add_region_aggregates( - df=data_region, - region=region, - country_col=country_column, - year_col=year_column, - aggregations=aggregates, - countries_in_region=countries_in_region, - countries_that_must_have_data=[], - # Here we allow aggregating even when there are few countries informed (which seems to agree with BP's - # criterion for aggregates). - # However, if absolutely all countries have nan, we want the aggregate to be nan, not zero. - frac_allowed_nans_per_year=0.999, - num_allowed_nans_per_year=None, - ) - data = pd.concat( - [ - data[~(data[country_column].astype(str) == region)], - data_region[data_region[country_column] == region], - ], - ignore_index=True, - ).reset_index(drop=True) - - if region_codes is not None: - # Add region codes to regions. - if data[country_code_column].dtype == "category": - data[country_code_column] = data[country_code_column].cat.add_categories(region_codes) - for i, region in enumerate(regions): - data.loc[data[country_column] == region, country_code_column] = region_codes[i] - - return data diff --git a/etl/steps/archive/garden/bp/2022-07-11/statistical_review.meta.yml b/etl/steps/archive/garden/bp/2022-07-11/statistical_review.meta.yml deleted file mode 100644 index cb5f9df1ae4..00000000000 --- a/etl/steps/archive/garden/bp/2022-07-11/statistical_review.meta.yml +++ /dev/null @@ -1,41 +0,0 @@ -dataset: - namespace: bp - version: 2022-07-11 - title: Statistical Review of World Energy - BP (2021) - short_name: statistical_review - sources: - - - name: Our World in Data based on BP Statistical Review of World Energy (2021) - published_by: BP Statistical Review of World Energy - date_accessed: 2021-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - description: | - BP's region definitions sometimes differ from Our World in Data's definitions. 
For example, BP's North America includes only Canada, Mexico and United States, whereas Our World in Data's North America includes countries in Central America (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like "North America (BP)" to refer to BP's original data using their definition of the region, as well as "North America", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the contributions from the countries in the region. - - [BP's region definitions](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy/using-the-review/definitions-and-explanatory-notes.html#accordion_Regional%20definitions), denoted with "(BP)", are: - * "Asia Pacific (BP)": Brunei, Cambodia, China (Mainland), China Hong Kong SAR (Special Administrative Region), China Macau SAR (Special Administrative Region), Indonesia, Japan, Laos, Malaysia, Mongolia, North Korea, Philippines, Singapore, South Asia (Afghanistan, Bangladesh, India, Myanmar, Nepal, Pakistan and Sri Lanka), South Korea, Taiwan, Thailand, Vietnam, Australia, New Zealand, Papua New Guinea and Oceania. - * "Australasia (BP)": Australia, New Zealand. - * "CIS (BP)" - Commonwealth of Independent States: Armenia, Azerbaijan, Belarus, Kazakhstan, Kyrgyzstan, Moldova, Russian Federation, Tajikistan, Turkmenistan, Uzbekistan. - * "Caribbean (BP)": Atlantic islands between the US Gulf Coast and South America, including Puerto Rico, US Virgin Islands and Bermuda. - * "Central America (BP)": Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama - * "Eastern Africa (BP)": Territories on the east coast of Africa from Sudan to Mozambique. Also Madagascar, Malawi, Uganda, Zambia, Zimbabwe. - * "Europe (BP)": European members of the OECD plus Albania, Bosnia-Herzegovina, Bulgaria, Croatia, Cyprus, Georgia, Gibraltar, Latvia, Lithuania, Malta, Montenegro, North Macedonia, Romania, Serbia and Ukraine. - * "Middle Africa (BP)": Angola, Cameroon, Central African Republic, Chad, Democratic Republic of Congo, Republic of Congo, Equatorial Guinea, Gabon, Sao Tome & Principe. - * "Middle East (BP)": Arabian Peninsula, Iran, Iraq, Israel, Jordan, Lebanon, Syria. - * "Non-OECD (BP)" - Organization for Economic Co-operation and Development: All countries that are not members of the OECD. - * "North America (BP)": US (excluding US territories), Canada, Mexico - * "Northern Africa (BP)": Territories on the north coast of Africa from Egypt to Western Sahara. - * "OECD (BP)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, UK, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, US. - * "OPEC (BP)" - Organization of the Petroleum Exporting Countries: Iran, Iraq, Kuwait, Saudi Arabia, United Arab Emirates, Algeria, Libya, Angola, Equatorial Guinea, Gabon, Nigeria, Republic of Congo, Venezuela. - * "South and Central America (BP)": Caribbean (including Puerto Rico and US Virgin Islands), Bermuda, Central and South America. - * "Southern Africa (BP)": Botswana, Lesotho, Namibia, South Africa, Swaziland. 
- * "Western Africa (BP)": Territories on the west coast of Africa from Mauritania to Nigeria, including Burkina Faso, Cape Verde, Mali and Niger. - - Additionally, BP includes some regions that are not explicitly defined (e.g. "Other Europe (BP)", or "Other CIS (BP)"). We define our regions in the following way: - * "Africa" - All African countries + "Other Africa (BP)". - * "Asia" - All Asian countries + "Other Middle East (BP)" + "Other CIS (BP)" + "Other Asia Pacific (BP)". - * "Europe" - All European countries + "Other Europe (BP)". - * "North America" - All North American countries + "Other Caribbean (BP)" + "Other North America (BP)". - * "Oceania" - All Oceanian countries. - * "South America" - All South American countries + "Other South America (BP)". - Where the individual countries in each region are defined [in this map](https://ourworldindata.org/world-region-map-definitions). Additional BP regions are ignored, since they belong to other regions already included (e.g. the data for "Other Western Africa (BP)" is included in "Other Africa (BP)"). Finally, income groups are constructed following the definitions [in this map](https://ourworldindata.org/grapher/world-banks-income-groups). diff --git a/etl/steps/archive/garden/bp/2022-07-11/statistical_review.py b/etl/steps/archive/garden/bp/2022-07-11/statistical_review.py deleted file mode 100644 index 559071da6ce..00000000000 --- a/etl/steps/archive/garden/bp/2022-07-11/statistical_review.py +++ /dev/null @@ -1,266 +0,0 @@ -"""Process the BP Statistical Review of World Energy 2021. - -This dataset was downloaded and processed by a previous version of -https://github.com/owid/importers/tree/master/bp_statreview - -However, in this additional step we add region aggregates following OWID definitions of regions. -""" - -from copy import deepcopy - -import numpy as np -import pandas as pd -from owid import catalog -from shared import CURRENT_DIR, REGIONS_TO_ADD, add_region_aggregates - -from etl.helpers import PathFinder - -P = PathFinder(__file__) - -# Namespace and short name for output dataset. -NAMESPACE = "bp" -# Path to metadata file for current dataset. -METADATA_FILE_PATH = CURRENT_DIR / "statistical_review.meta.yml" -# Original BP's Statistical Review dataset name in the OWID catalog (without the institution and year). -BP_CATALOG_NAME = "statistical_review_of_world_energy" -BP_BACKPORTED_DATASET_NAME = "dataset_5347_statistical_review_of_world_energy__bp__2021" -BP_NAMESPACE_IN_CATALOG = "bp_statreview" -BP_VERSION = 2021 - -# List of known overlaps between regions and member countries (or successor countries). -OVERLAPPING_DATA_TO_REMOVE_IN_AGGREGATES = [ - { - "region": "USSR", - "member": "Russia", - "entity_to_make_nan": "region", - "years": [1991, 1992, 1993, 1994, 1995, 1996], - "variable": "Gas - Proved reserves", - } -] - -# True to ignore zeros when checking for overlaps between regions and member countries. -# This means that, if a region (e.g. USSR) and a member country or successor country (e.g. Russia) overlap, but in a -# variable that only has zeros, it will not be considered an overlap. -IGNORE_ZEROS_WHEN_CHECKING_FOR_OVERLAPPING_DATA = True - -# Variables that can be summed when constructing region aggregates. -# Biofuels in Africa have a non-zero total, while there is no contribution from African countries. -# This causes that our aggregate for 'Africa' would be zero, while the original 'Africa (BP)' is not. -# Also, biodiesels are only given for continents and a few countries. 
-# For this reason we avoid creating aggregates for biofuels and biodiesels. -AGGREGATES_BY_SUM = [ - "Carbon Dioxide Emissions", - "Coal - Reserves - Anthracite and bituminous", - "Coal - Reserves - Sub-bituminous and lignite", - "Coal - Reserves - Total", - "Coal Consumption - EJ", - "Coal Consumption - TWh", - "Coal Production - EJ", - "Coal Production - TWh", - "Coal Production - Tonnes", - "Cobalt Production-Reserves", - "Elec Gen from Coal", - "Elec Gen from Gas", - "Elec Gen from Oil", - "Electricity Generation", - "Gas - Proved reserves", - "Gas Consumption - Bcf", - "Gas Consumption - Bcm", - "Gas Consumption - EJ", - "Gas Consumption - TWh", - "Gas Production - Bcf", - "Gas Production - Bcm", - "Gas Production - EJ", - "Gas Production - TWh", - "Geo Biomass Other - EJ", - "Geo Biomass Other - TWh", - "Graphite Production-Reserves", - "Hydro Consumption - EJ", - "Hydro Consumption - TWh", - "Hydro Generation - TWh", - "Lithium Production-Reserves", - "Nuclear Consumption - EJ", - "Nuclear Consumption - TWh", - "Nuclear Generation - TWh", - "Oil - Proved reserves", - "Oil - Refinery throughput", - "Oil - Refining capacity", - "Oil Consumption - Barrels", - "Oil Consumption - EJ", - "Oil Consumption - TWh", - "Oil Consumption - Tonnes", - "Oil Production - Barrels", - "Oil Production - Crude Conds", - "Oil Production - NGLs", - "Oil Production - TWh", - "Oil Production - Tonnes", - "Primary Energy Consumption - EJ", - "Primary Energy Consumption - TWh", - "Renewables Consumption - EJ", - "Renewables Consumption - TWh", - "Renewables Power - EJ", - "Renewables power - TWh", - "Solar Capacity", - "Solar Consumption - EJ", - "Solar Consumption - TWh", - "Solar Generation - TWh", - "Total Liquids - Consumption", - "Wind Capacity", - "Wind Consumption - EJ", - "Wind Consumption - TWh", - "Wind Generation - TWh", - # 'Biofuels Consumption - Kboed - Total', - # 'Biofuels Consumption - Kboed - Biodiesel', - # 'Biofuels Consumption - PJ - Total', - # 'Biofuels Consumption - PJ - Biodiesel', - # 'Biofuels Consumption - TWh - Total', - # 'Biofuels Consumption - TWh - Biodiesel', - # 'Biofuels Consumption - TWh - Biodiesel (zero filled)', - # 'Biofuels Consumption - TWh - Total (zero filled)', - # 'Biofuels Production - Kboed - Total', - # 'Biofuels Production - PJ - Total', - # 'Biofuels Production - TWh - Total', - # 'Biofuels Production - Kboed - Biodiesel', - # 'Biofuels Production - PJ - Biodiesel', - # 'Biofuels Production - TWh - Biodiesel', - # 'Coal - Prices', - # 'Coal Consumption - TWh (zero filled)', - # 'Gas - Prices', - # 'Gas Consumption - TWh (zero filled)', - # 'Geo Biomass Other - TWh (zero filled)', - # 'Hydro Consumption - TWh (zero filled)', - # 'Nuclear Consumption - TWh (zero filled)', - # 'Oil - Crude prices since 1861 (2021 $)', - # 'Oil - Crude prices since 1861 (current $)', - # 'Oil - Spot crude prices', - # 'Oil Consumption - TWh (zero filled)', - # 'Primary Energy - Cons capita', - # 'Rare Earth Production-Reserves', - # 'Solar Consumption - TWh (zero filled)', - # 'Wind Consumption - TWh (zero filled)', -] - - -def prepare_output_table(df: pd.DataFrame, bp_table: catalog.Table) -> catalog.Table: - """Create a table with the processed data, ready to be in a garden dataset and to be uploaded to grapher (although - additional metadata may need to be added to the table). - - Parameters - ---------- - df : pd.DataFrame - Processed BP data. - bp_table : catalog.Table - Original table of BP statistical review data (used to transfer its metadata to the new table). 
- - Returns - ------- - table : catalog.Table - Table, ready to be added to a new garden dataset. - - """ - # Create new table. - table = catalog.Table(df).copy() - - # Replace spurious inf values by nan. - table = table.replace([np.inf, -np.inf], np.nan) - - # Sort conveniently and add an index. - table = ( - table.sort_values(["country", "year"]) - .reset_index(drop=True) - .set_index(["country", "year"], verify_integrity=True) - .astype({"country_code": "category"}) - ) - - # Convert column names to lower, snake case. - table = catalog.utils.underscore_table(table) - - # Get the table metadata from the original table. - table.metadata = deepcopy(bp_table.metadata) - - # Get the metadata of each variable from the original table. - for column in table.drop(columns="country_code").columns: - table[column].metadata = deepcopy(bp_table[column].metadata) - - return table - - -def amend_zero_filled_variables_for_region_aggregates(df: pd.DataFrame) -> pd.DataFrame: - """Fill the "* (zero filled)" variables (which were ignored when creating aggregates) with the new aggregate data, - and fill any possible nan with zeros. - - Parameters - ---------- - df : pd.DataFrame - Data after having created region aggregates (which ignore '* (zero filled)' variables). - - Returns - ------- - df : pd.DataFrame - Data after amending zero filled variables for region aggregates. - - """ - df = df.copy() - - zero_filled_variables = [column for column in df.columns if "(zero filled)" in column] - original_variables = [column.replace(" (zero filled)", "") for column in df.columns if "(zero filled)" in column] - select_regions = df["country"].isin(REGIONS_TO_ADD) - df.loc[select_regions, zero_filled_variables] = df[select_regions][original_variables].fillna(0).values - - return df - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load table from latest BP dataset. - bp_ds: catalog.Dataset = P.load_dependency(BP_BACKPORTED_DATASET_NAME) - bp_table = bp_ds[BP_BACKPORTED_DATASET_NAME] - - # - # Process data. - # - # Extract dataframe of BP data from table. - bp_data = ( - pd.DataFrame(bp_table) - .reset_index() - .rename(columns={column: bp_table[column].metadata.title for column in bp_table.columns}) - .rename(columns={"entity_name": "country", "entity_code": "country_code"}) - .drop(columns="entity_id") - ) - - # Add region aggregates. - df = add_region_aggregates( - data=bp_data, - regions=list(REGIONS_TO_ADD), - index_columns=["country", "year", "country_code"], - country_column="country", - year_column="year", - aggregates={column: "sum" for column in AGGREGATES_BY_SUM}, - known_overlaps=OVERLAPPING_DATA_TO_REMOVE_IN_AGGREGATES, # type: ignore - region_codes=[REGIONS_TO_ADD[region]["country_code"] for region in REGIONS_TO_ADD], - ) - - # Fill nans with zeros for "* (zero filled)" variables for region aggregates (which were ignored). - df = amend_zero_filled_variables_for_region_aggregates(df) - - # Prepare output data in a convenient way. - table = prepare_output_table(df, bp_table) - - # - # Save outputs. - # - # Initialize new garden dataset. - dataset = catalog.Dataset.create_empty(dest_dir) - # Add metadata to dataset. - dataset.metadata.update_from_yaml(METADATA_FILE_PATH) - # Create new dataset in garden. - dataset.save() - - # Add table to the dataset. 
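-    # The table inherits the dataset-level title, description and short name below, and
-    # repack=True asks owid-catalog to repack columns into more compact dtypes when saving.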
- table.metadata.title = dataset.metadata.title - table.metadata.description = dataset.metadata.description - table.metadata.short_name = dataset.metadata.short_name - table.metadata.primary_key = list(table.index.names) - dataset.add(table, repack=True) diff --git a/etl/steps/archive/garden/bp/2022-07-14/energy_mix.meta.yml b/etl/steps/archive/garden/bp/2022-07-14/energy_mix.meta.yml deleted file mode 100644 index df15ba6753e..00000000000 --- a/etl/steps/archive/garden/bp/2022-07-14/energy_mix.meta.yml +++ /dev/null @@ -1,58 +0,0 @@ -dataset: - namespace: bp - version: 2022-07-14 - title: Energy mix from BP - short_name: energy_mix - sources: - - - name: Our World in Data based on BP Statistical Review of World Energy (2022) - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - description: | - BP's region definitions sometimes differ from Our World in Data's definitions. For example, BP's North America includes only Canada, Mexico and United States, whereas Our World in Data's North America includes countries in Central America (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like "North America (BP)" to refer to BP's original data using their definition of the region, as well as "North America", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the contributions from the countries in the region. - - [BP's region definitions](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy/using-the-review/definitions-and-explanatory-notes.html#accordion_Regional%20definitions), denoted with "(BP)", are: - * "Asia Pacific (BP)": Brunei, Cambodia, China (Mainland), China Hong Kong SAR (Special Administrative Region), China Macau SAR (Special Administrative Region), Indonesia, Japan, Laos, Malaysia, Mongolia, North Korea, Philippines, Singapore, South Asia (Afghanistan, Bangladesh, India, Myanmar, Nepal, Pakistan and Sri Lanka), South Korea, Taiwan, Thailand, Vietnam, Australia, New Zealand, Papua New Guinea and Oceania. - * "Australasia (BP)": Australia, New Zealand. - * "CIS (BP)" - Commonwealth of Independent States: Armenia, Azerbaijan, Belarus, Kazakhstan, Kyrgyzstan, Moldova, Russian Federation, Tajikistan, Turkmenistan, Uzbekistan. - * "Caribbean (BP)": Atlantic islands between the US Gulf Coast and South America, including Puerto Rico, US Virgin Islands and Bermuda. - * "Central America (BP)": Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama - * "Eastern Africa (BP)": Territories on the east coast of Africa from Sudan to Mozambique. Also Madagascar, Malawi, Uganda, Zambia, Zimbabwe. - * "Europe (BP)": European members of the OECD plus Albania, Bosnia-Herzegovina, Bulgaria, Croatia, Cyprus, Georgia, Gibraltar, Latvia, Lithuania, Malta, Montenegro, North Macedonia, Romania, Serbia and Ukraine. - * "Middle Africa (BP)": Angola, Cameroon, Central African Republic, Chad, Democratic Republic of Congo, Republic of Congo, Equatorial Guinea, Gabon, Sao Tome & Principe. - * "Middle East (BP)": Arabian Peninsula, Iran, Iraq, Israel, Jordan, Lebanon, Syria. - * "Non-OECD (BP)" - Organization for Economic Co-operation and Development: All countries that are not members of the OECD. 
- * "North America (BP)": US (excluding US territories), Canada, Mexico - * "Northern Africa (BP)": Territories on the north coast of Africa from Egypt to Western Sahara. - * "OECD (BP)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, UK, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, US. - * "OPEC (BP)" - Organization of the Petroleum Exporting Countries: Iran, Iraq, Kuwait, Saudi Arabia, United Arab Emirates, Algeria, Libya, Angola, Equatorial Guinea, Gabon, Nigeria, Republic of Congo, Venezuela. - * "South and Central America (BP)": Caribbean (including Puerto Rico and US Virgin Islands), Bermuda, Central and South America. - * "Southern Africa (BP)": Botswana, Lesotho, Namibia, South Africa, Swaziland. - * "Western Africa (BP)": Territories on the west coast of Africa from Mauritania to Nigeria, including Burkina Faso, Cape Verde, Mali and Niger. - - Additionally, BP includes some regions that are not explicitly defined (e.g. "Other Europe (BP)", or "Other CIS (BP)"). We define our regions in the following way: - * "Africa" - All African countries + "Other Africa (BP)". - * "Asia" - All Asian countries + "Other Middle East (BP)" + "Other CIS (BP)" + "Other Asia Pacific (BP)". - * "Europe" - All European countries + "Other Europe (BP)". - * "North America" - All North American countries + "Other Caribbean (BP)" + "Other North America (BP)". - * "Oceania" - All Oceanian countries. - * "South America" - All South American countries + "Other South America (BP)". - Where the individual countries in each region are defined [in this map](https://ourworldindata.org/world-region-map-definitions). Additional BP regions are ignored, since they belong to other regions already included (e.g. the data for "Other Western Africa (BP)" is included in "Other Africa (BP)"). Finally, income groups are constructed following the definitions [in this map](https://ourworldindata.org/grapher/world-banks-income-groups). - description: | - Raw data on energy consumption is sourced from [the BP Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html). - - Primary energy in exajoules (EJ) has been converted to TWh by Our World in Data based on a conversion factor of 1,000,000 / 3,600 (~277.778). - - For non-fossil based electricity sources (nuclear, hydro, wind, solar, geothermal, biomass in power, and other renewable sources), BP's generation (in TWh) corresponds to gross generation and not accounting for cross-border electricity supply. - Also, for non-fossil based electricity, there are two ways to define primary energy: - * One is "direct primary energy", which corresponds to the electricity generation (in TWh). - * The other is "input-equivalent primary energy" (also called "primary energy using the substitution method"). - This is the amount of fuel that would be required by thermal power stations to generate the reported electricity, as explained in [BP's methodology document](https://www.bp.com/content/dam/bp/business-sites/en/global/corporate/pdfs/energy-economics/statistical-review/bp-stats-review-2022-methodology.pdf). 
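In code, this conversion amounts to a single division by the assumed efficiency. Below is a minimal sketch (an illustrative helper, not part of this step; the 38% efficiency and the EJ-to-TWh factor of 1,000,000 / 3,600 match the figures used in this dataset, and the worked example that follows runs through the same numbers):

```python
# Sketch of the input-equivalent ("substitution method") conversion.
EJ_TO_TWH = 1e6 / 3600  # ~277.778, the factor used elsewhere in this dataset.


def input_equivalent_twh(generation_twh: float, efficiency: float = 0.38) -> float:
    """Fuel a thermal plant would need in order to generate the given electricity."""
    return generation_twh / efficiency


equivalent_twh = input_equivalent_twh(100.0)  # 100 TWh of nuclear -> ~263 TWh
equivalent_ej = equivalent_twh / EJ_TO_TWH  # ~0.95 EJ
```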
For example, if a country's nuclear power generated 100 TWh of electricity, and assuming that the efficiency of a standard thermal power plant is 38%, the input equivalent primary energy for this country would be 100 TWh / 0.38 = 263 TWh = 0.95 EJ. This input-equivalent primary energy takes account of the inefficiencies in fossil fuel production and provides a better approximation of each source's share of "final energy" consumption. - - Additional metrics have been calculated by Our World in Data: - – Annual change in energy consumption by source: this is calculated as the difference from the previous year. - – % of total primary energy: calculated as each source's share of primary energy (direct energy and primary energy using the substitution method) from all sources. - – Per capita energy by source: calculated as primary energy consumption by source, divided by population. - - Per capita figures have been calculated using a population dataset that is built and maintained by Our World in Data, based on [different sources](https://ourworldindata.org/population-sources). diff --git a/etl/steps/archive/garden/bp/2022-07-14/energy_mix.py b/etl/steps/archive/garden/bp/2022-07-14/energy_mix.py deleted file mode 100644 index c23a6e599fa..00000000000 --- a/etl/steps/archive/garden/bp/2022-07-14/energy_mix.py +++ /dev/null @@ -1,471 +0,0 @@ -"""Generate BP energy mix 2022 dataset using data from BP's statistical review of the world energy. - -""" - -import numpy as np -import pandas as pd -from owid import catalog -from shared import CURRENT_DIR, add_population - -from etl.paths import DATA_DIR - -# Namespace, dataset short name, and version of the garden dataset of the BP statistical review. -STAT_REVIEW_NAMESPACE = "bp" -STAT_REVIEW_SHORT_NAME = "statistical_review" -STAT_REVIEW_VERSION = "2022-07-14" -# Path to metadata file. -METADATA_FILE_PATH = CURRENT_DIR / "energy_mix.meta.yml" - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 -# Exajoules to terawatt-hours. -EJ_TO_TWH = 1e6 / 3600 -# Petajoules to exajoules. -PJ_TO_EJ = 1e-3 - -# List all energy sources in the data. -ONLY_DIRECT_ENERGY = ["Coal", "Fossil fuels", "Gas", "Oil", "Biofuels"] -DIRECT_AND_EQUIVALENT_ENERGY = [ - "Hydro", - "Low-carbon energy", - "Nuclear", - "Other renewables", - "Renewables", - "Solar", - "Wind", -] -ALL_SOURCES = sorted(ONLY_DIRECT_ENERGY + DIRECT_AND_EQUIVALENT_ENERGY) - - -def get_bp_data(bp_table: catalog.Table) -> pd.DataFrame: - """Extract a simple dataframe of BP statistical review data from the table in the dataset. - - Parameters - ---------- - bp_table : catalog.Table - BP table (from the dataset of BP statistical review). - - Returns - ------- - bp_data : pd.DataFrame - BP statistical review data. - - """ - bp_table = bp_table.copy() - - # Convert table (snake case) column names to human readable names. - bp_table = bp_table.rename( - columns={column: bp_table[column].metadata.title for column in bp_table.columns if column != "country_code"} - ).reset_index() - - # Rename human-readable columns (and select only the ones that will be used). - columns = { - "country": "Country", - "country_code": "Country code", - "year": "Year", - # Fossil fuel primary energy (in EJ). - "Coal Consumption - EJ": "Coal (EJ)", - "Gas Consumption - EJ": "Gas (EJ)", - "Oil Consumption - EJ": "Oil (EJ)", - # Non-fossil based electricity generation (in TWh). 
- "Hydro Generation - TWh": "Hydro (TWh - direct)", - "Nuclear Generation - TWh": "Nuclear (TWh - direct)", - "Solar Generation - TWh": "Solar (TWh - direct)", - "Wind Generation - TWh": "Wind (TWh - direct)", - "Geo Biomass Other - TWh": "Other renewables (TWh - direct)", - # Non-fossil based electricity generation converted into input-equivalent primary energy (in EJ). - "Hydro Consumption - EJ": "Hydro (EJ - equivalent)", - "Nuclear Consumption - EJ": "Nuclear (EJ - equivalent)", - "Solar Consumption - EJ": "Solar (EJ - equivalent)", - "Wind Consumption - EJ": "Wind (EJ - equivalent)", - "Geo Biomass Other - EJ": "Other renewables (EJ - equivalent)", - # Total, input-equivalent primary energy consumption (in EJ). - "Primary Energy Consumption - EJ": "Primary energy (EJ - equivalent)", - # Biofuels consumption (in PJ, that will be converted into EJ). - "Biofuels Consumption - PJ - Total": "Biofuels (PJ)", - } - - # Create a simple dataframe (without metadata and with a dummy index). - assert set(columns) < set(bp_table.columns), "Column names have changed in BP data." - - bp_data = ( - pd.DataFrame(bp_table)[list(columns)].rename(errors="raise", columns=columns).astype({"Country code": str}) - ) - - return bp_data - - -def _check_that_substitution_method_is_well_calculated( - primary_energy: pd.DataFrame, -) -> None: - # Check that the constructed primary energy using the substitution method (in TWh) coincides with the - # input-equivalent primary energy (converted from EJ into TWh) given in the original data. - check = primary_energy[ - [ - "Year", - "Country", - "Primary energy (EJ - equivalent)", - "Primary energy (TWh - equivalent)", - ] - ].reset_index(drop=True) - check["Primary energy (TWh - equivalent) - original"] = check["Primary energy (EJ - equivalent)"] * EJ_TO_TWH - check = check.dropna().reset_index(drop=True) - # They may not coincide exactly, but at least check that they differ (point by point) by less than 10%. - max_deviation = max( - abs( - (check["Primary energy (TWh - equivalent)"] - check["Primary energy (TWh - equivalent) - original"]) - / check["Primary energy (TWh - equivalent) - original"] - ) - ) - assert max_deviation < 0.1 - - -def calculate_direct_primary_energy(primary_energy: pd.DataFrame) -> pd.DataFrame: - """Convert direct primary energy into TWh and create various aggregates (e.g. Fossil fuels and Renewables). - - Parameters - ---------- - primary_energy : pd.DataFrame - BP data. - - Returns - ------- - primary_energy : pd.DataFrame - Data, after adding direct primary energy. - - """ - primary_energy = primary_energy.copy() - - # Convert units of biofuels consumption. - primary_energy["Biofuels (EJ)"] = primary_energy["Biofuels (PJ)"] * PJ_TO_EJ - - # Create column for fossil fuels primary energy (if any of them is nan, the sum will be nan). - primary_energy["Fossil fuels (EJ)"] = ( - primary_energy["Coal (EJ)"] + primary_energy["Oil (EJ)"] + primary_energy["Gas (EJ)"] - ) - - # Convert primary energy of fossil fuels and biofuels into TWh. - for cat in ["Coal", "Oil", "Gas", "Biofuels"]: - primary_energy[f"{cat} (TWh)"] = primary_energy[f"{cat} (EJ)"] * EJ_TO_TWH - - # Create column for primary energy from fossil fuels (in TWh). - primary_energy["Fossil fuels (TWh)"] = ( - primary_energy["Coal (TWh)"] + primary_energy["Oil (TWh)"] + primary_energy["Gas (TWh)"] - ) - - # Create column for direct primary energy from renewable sources in TWh. - # (total renewable electricity generation and biofuels) (in TWh). 
- # By visually inspecting the original data, it seems that many data points that used to be zero are - missing in the 2022 release, so filling nan with zeros seems to be a reasonable approach to avoid losing a - significant amount of data. - primary_energy["Renewables (TWh - direct)"] = ( - primary_energy["Hydro (TWh - direct)"] - + primary_energy["Solar (TWh - direct)"].fillna(0) - + primary_energy["Wind (TWh - direct)"].fillna(0) - + primary_energy["Other renewables (TWh - direct)"].fillna(0) - + primary_energy["Biofuels (TWh)"].fillna(0) - ) - # Create column for direct primary energy from low-carbon sources in TWh. - # (total renewable electricity generation, biofuels, and nuclear power) (in TWh). - primary_energy["Low-carbon energy (TWh - direct)"] = primary_energy["Renewables (TWh - direct)"] + primary_energy[ - "Nuclear (TWh - direct)" - ].fillna(0) - # Create column for total direct primary energy. - primary_energy["Primary energy (TWh - direct)"] = ( - primary_energy["Fossil fuels (TWh)"] + primary_energy["Low-carbon energy (TWh - direct)"] - ) - - return primary_energy - - -def calculate_equivalent_primary_energy(primary_energy: pd.DataFrame) -> pd.DataFrame: - """Convert input-equivalent primary energy into TWh and create various aggregates (e.g. Fossil fuels and - Renewables). - - Parameters - ---------- - primary_energy : pd.DataFrame - BP data. - - Returns - ------- - primary_energy : pd.DataFrame - Data, after adding input-equivalent primary energy. - - """ - primary_energy = primary_energy.copy() - # Create column for total renewable input-equivalent primary energy (in EJ). - # Fill missing values with zeros (see comment above). - primary_energy["Renewables (EJ - equivalent)"] = ( - primary_energy["Hydro (EJ - equivalent)"] - + primary_energy["Solar (EJ - equivalent)"].fillna(0) - + primary_energy["Wind (EJ - equivalent)"].fillna(0) - + primary_energy["Other renewables (EJ - equivalent)"].fillna(0) - + primary_energy["Biofuels (EJ)"].fillna(0) - ) - # Create column for low carbon energy (i.e. renewable plus nuclear energy). - primary_energy["Low-carbon energy (EJ - equivalent)"] = primary_energy[ - "Renewables (EJ - equivalent)" - ] + primary_energy["Nuclear (EJ - equivalent)"].fillna(0) - # Convert input-equivalent primary energy of non-fossil based electricity into TWh. - # The result is primary energy using the "substitution method". - for cat in DIRECT_AND_EQUIVALENT_ENERGY: - primary_energy[f"{cat} (TWh - equivalent)"] = primary_energy[f"{cat} (EJ - equivalent)"] * EJ_TO_TWH - # Create column for primary energy from all sources (which corresponds to input-equivalent primary - # energy for non-fossil based sources). - primary_energy["Primary energy (TWh - equivalent)"] = ( - primary_energy["Fossil fuels (TWh)"] + primary_energy["Low-carbon energy (TWh - equivalent)"] - ) - # Check that the primary energy constructed using the substitution method coincides with the - # input-equivalent primary energy. - _check_that_substitution_method_is_well_calculated(primary_energy) - - return primary_energy - - -def calculate_share_of_primary_energy(primary_energy: pd.DataFrame) -> pd.DataFrame: - """Calculate the share (percentage) of (direct or direct and input-equivalent) primary energy for each energy - source. - - Parameters - ---------- - primary_energy : pd.DataFrame - BP data. - - Returns - ------- - primary_energy : pd.DataFrame - BP data after adding columns for the share of primary energy.
- - """ - primary_energy = primary_energy.copy() - # Check that all sources are included in the data. - expected_sources = sorted( - set( - [ - source.split("(")[0].strip() - for source in primary_energy.columns - if not source.startswith(("Country", "Year", "Primary")) - ] - ) - ) - assert expected_sources == ALL_SOURCES, "Sources may have changed names." - - for source in ONLY_DIRECT_ENERGY: - # Calculate each source as share of direct primary energy. - primary_energy[f"{source} (% direct primary energy)"] = ( - primary_energy[f"{source} (TWh)"] / primary_energy["Primary energy (TWh - direct)"] * 100 - ) - # Calculate each source as share of input-equivalent primary energy (i.e. substitution method). - primary_energy[f"{source} (% equivalent primary energy)"] = ( - primary_energy[f"{source} (EJ)"] / primary_energy["Primary energy (EJ - equivalent)"] * 100 - ) - - for source in DIRECT_AND_EQUIVALENT_ENERGY: - # Calculate each source as share of direct primary energy. - primary_energy[f"{source} (% direct primary energy)"] = ( - primary_energy[f"{source} (TWh - direct)"] / primary_energy["Primary energy (TWh - direct)"] * 100 - ) - # Calculate each source as share of input-equivalent primary energy (i.e. substitution method). - primary_energy[f"{source} (% equivalent primary energy)"] = ( - primary_energy[f"{source} (EJ - equivalent)"] / primary_energy["Primary energy (EJ - equivalent)"] * 100 - ) - - return primary_energy - - -def calculate_primary_energy_annual_change( - primary_energy: pd.DataFrame, -) -> pd.DataFrame: - """Calculate annual change of (direct or direct and input-equivalent) primary energy for each energy source. - - Parameters - ---------- - primary_energy : pd.DataFrame - BP data. - - Returns - ------- - primary_energy : pd.DataFrame - BP data after adding annual changes. - - """ - primary_energy = primary_energy.copy() - - # Calculate annual change in each source. - primary_energy = primary_energy.sort_values(["Country", "Year"]).reset_index(drop=True) - for source in ONLY_DIRECT_ENERGY: - # Create column for source percentage growth as a function of direct primary energy. - primary_energy[f"{source} (% growth)"] = primary_energy.groupby("Country")[f"{source} (TWh)"].pct_change() * 100 - # Create column for source absolute growth as a function of direct primary energy. - primary_energy[f"{source} (TWh growth)"] = primary_energy.groupby("Country")[f"{source} (TWh)"].diff() - - for source in DIRECT_AND_EQUIVALENT_ENERGY: - # Create column for source percentage growth as a function of primary energy - # (as a percentage, it is irrelevant whether it is direct or equivalent). - primary_energy[f"{source} (% growth)"] = ( - primary_energy.groupby("Country")[f"{source} (TWh - direct)"].pct_change() * 100 - ) - # Create column for source absolute growth as a function of direct primary energy. - primary_energy[f"{source} (TWh growth - direct)"] = primary_energy.groupby("Country")[ - f"{source} (TWh - direct)" - ].diff() - # Create column for source absolute growth as a function of input-equivalent primary energy. - primary_energy[f"{source} (TWh growth - equivalent)"] = primary_energy.groupby("Country")[ - f"{source} (TWh - equivalent)" - ].diff() - - return primary_energy - - -def add_per_capita_variables(primary_energy: pd.DataFrame) -> pd.DataFrame: - """Add per-capita variables. - - Parameters - ---------- - primary_energy : pd.DataFrame - BP data. - - Returns - ------- - primary_energy : pd.DataFrame - BP data after adding per-capita variables. 
- - """ - primary_energy = primary_energy.copy() - - primary_energy = add_population( - df=primary_energy, - country_col="Country", - year_col="Year", - population_col="Population", - warn_on_missing_countries=False, - ) - for source in ONLY_DIRECT_ENERGY: - primary_energy[f"{source} per capita (kWh)"] = ( - primary_energy[f"{source} (TWh)"] / primary_energy["Population"] * TWH_TO_KWH - ) - for source in DIRECT_AND_EQUIVALENT_ENERGY: - primary_energy[f"{source} per capita (kWh - direct)"] = ( - primary_energy[f"{source} (TWh - direct)"] / primary_energy["Population"] * TWH_TO_KWH - ) - primary_energy[f"{source} per capita (kWh - equivalent)"] = ( - primary_energy[f"{source} (TWh - equivalent)"] / primary_energy["Population"] * TWH_TO_KWH - ) - - # Drop unnecessary column. - primary_energy = primary_energy.drop(columns=["Population"]) - - return primary_energy - - -def prepare_output_table(primary_energy: pd.DataFrame) -> catalog.Table: - """Create a table with the processed data, ready to be in a garden dataset and to be uploaded to grapher (although - additional metadata may need to be added to the table). - - Parameters - ---------- - primary_energy : pd.DataFrame - Processed BP data. - - Returns - ------- - table : catalog.Table - Table, ready to be added to a new garden dataset. - - """ - # Keep only columns in TWh (and not EJ or PJ). - table = catalog.Table(primary_energy).drop( - errors="raise", - columns=[column for column in primary_energy.columns if (("(EJ" in column) or ("(PJ" in column))], - ) - - # Replace spurious inf values by nan. - table = table.replace([np.inf, -np.inf], np.nan) - - # Sort conveniently and add an index. - table = ( - table.sort_values(["Country", "Year"]) - .reset_index(drop=True) - .set_index(["Country", "Year"], verify_integrity=True) - .astype({"Country code": "category"}) - ) - - # Add metadata (e.g. unit) to each column. - # Define unit names (these are the long and short unit names that will be shown in grapher). - # The keys of the dictionary should correspond to units expected to be found in each of the variable names in table. - short_unit_to_unit = { - "TWh": "terawatt-hours", - "kWh": "kilowatt-hours", - "%": "%", - } - # Define number of decimal places to show (only relevant for grapher, not for the data). - short_unit_to_num_decimals = { - "TWh": 0, - "kWh": 0, - } - for column in table.columns: - table[column].metadata.title = column - for short_unit in ["TWh", "kWh", "%"]: - if short_unit in column: - table[column].metadata.short_unit = short_unit - table[column].metadata.unit = short_unit_to_unit[short_unit] - table[column].metadata.display = {} - if short_unit in short_unit_to_num_decimals: - table[column].metadata.display["numDecimalPlaces"] = short_unit_to_num_decimals[short_unit] - # Add the variable name without unit (only relevant for grapher). - table[column].metadata.display["name"] = column.split(" (")[0] - - table = catalog.utils.underscore_table(table) - - return table - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load the latest BP statistical review. - bp_dataset_path = DATA_DIR / "garden" / STAT_REVIEW_NAMESPACE / STAT_REVIEW_VERSION / STAT_REVIEW_SHORT_NAME - bp_dataset = catalog.Dataset(bp_dataset_path) - bp_table = bp_dataset[bp_dataset.table_names[0]] - - # - # Process data. - # - # Get a dataframe out of the BP table. - primary_energy = get_bp_data(bp_table=bp_table) - - # Calculate direct and primary energy using the substitution method. 
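-    # ("Direct" counts non-fossil electricity as generated, in TWh; "equivalent" scales it
-    # up by a standard thermal efficiency, i.e. the substitution method described in the metadata.)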
- primary_energy = calculate_direct_primary_energy(primary_energy=primary_energy) - primary_energy = calculate_equivalent_primary_energy(primary_energy=primary_energy) - - # Calculate share of (direct and sub-method) primary energy. - primary_energy = calculate_share_of_primary_energy(primary_energy=primary_energy) - - # Calculate annual change of primary energy. - primary_energy = calculate_primary_energy_annual_change(primary_energy) - - # Add per-capita variables. - primary_energy = add_per_capita_variables(primary_energy=primary_energy) - - # Prepare output data in a convenient way. - table = prepare_output_table(primary_energy) - - # - # Save outputs. - # - # Initialize new garden dataset. - dataset = catalog.Dataset.create_empty(dest_dir) - # Add metadata to dataset. - dataset.metadata.update_from_yaml(METADATA_FILE_PATH) - # Create new dataset in garden. - dataset.save() - - # Add table to the dataset. - table.metadata.title = dataset.metadata.title - table.metadata.description = dataset.metadata.description - table.metadata.short_name = dataset.metadata.short_name - table.metadata.primary_key = list(table.index.names) - dataset.add(table, repack=True) diff --git a/etl/steps/archive/garden/bp/2022-07-14/shared.py b/etl/steps/archive/garden/bp/2022-07-14/shared.py deleted file mode 100644 index 309625fdca7..00000000000 --- a/etl/steps/archive/garden/bp/2022-07-14/shared.py +++ /dev/null @@ -1,535 +0,0 @@ -from pathlib import Path -from typing import Dict, List, Optional, Union, cast - -import numpy as np -import pandas as pd -from owid import catalog -from structlog import get_logger - -from etl.data_helpers import geo -from etl.paths import DATA_DIR - -log = get_logger() - -CURRENT_DIR = Path(__file__).parent -VERSION = CURRENT_DIR.name - -# Aggregate regions to add, following OWID definitions. -REGIONS_TO_ADD = { - "North America": { - "country_code": "OWID_NAM", - }, - "South America": { - "country_code": "OWID_SAM", - }, - "Europe": { - "country_code": "OWID_EUR", - }, - # The EU27 is already included in the original BP data, with the same definition as OWID. - # "European Union (27)": { - # "country_code": "OWID_EU27", - # }, - "Africa": { - "country_code": "OWID_AFR", - }, - "Asia": { - "country_code": "OWID_ASI", - }, - "Oceania": { - "country_code": "OWID_OCE", - }, - "Low-income countries": { - "country_code": "OWID_LIC", - }, - "Upper-middle-income countries": { - "country_code": "OWID_UMC", - }, - "Lower-middle-income countries": { - "country_code": "OWID_LMC", - }, - "High-income countries": { - "country_code": "OWID_HIC", - }, -} - -# We need to include the 'Other * (BP)' regions, otherwise continents have incomplete data. -# For example, when constructing the aggregate for Africa, we need to include 'Other Africa (BP)'. -# Otherwise we would be underestimating the region's total contribution. -ADDITIONAL_COUNTRIES_IN_REGIONS = { - "Africa": [ - # Additional African regions in BP's data (e.g. 'Other Western Africa (BP)') seem to be included in - # 'Other Africa (BP)', therefore we ignore them when creating aggregates. - "Other Africa (BP)", - ], - "Asia": [ - # Adding 'Other Asia Pacific (BP)' may include areas of Oceania in Asia. - # However, it seems that this region is usually significantly smaller than Asia. - # So, we are possibly overestimating Asia, but not by a significant amount. - "Other Asia Pacific (BP)", - # Similarly, adding 'Other CIS (BP)' in Asia may include areas of Europe in Asia (e.g. Moldova). 
- # However, since most countries in 'Other CIS (BP)' are Asian, adding it is more accurate than not adding it. - "Other CIS (BP)", - # Countries defined by BP in 'Middle East' are fully included in OWID's definition of Asia. - "Other Middle East (BP)", - ], - "Europe": [ - "Other Europe (BP)", - ], - "North America": [ - "Other Caribbean (BP)", - "Other North America (BP)", - ], - "South America": [ - "Other South America (BP)", - ], - # Given that 'Other Asia and Pacific (BP)' is often similar or even larger than Oceania, we avoid including it in - # Oceania (and include it in Asia, see comment above). - # This means that we may be underestimating Oceania by a significant amount, but BP does not provide unambiguous - # data to avoid this. - "Oceania": [], -} - -# When creating region aggregates, decide how to distribute historical regions. -# The following decisions are based on the current location of the countries that succeeded the region, and their income -# group. Continent and income group assigned corresponds to the continent and income group of the majority of the -# population in the member countries. -HISTORIC_TO_CURRENT_REGION: Dict[str, Dict[str, Union[str, List[str]]]] = { - "Netherlands Antilles": { - "continent": "North America", - "income_group": "High-income countries", - "members": [ - # North America - High-income countries. - "Aruba", - "Curacao", - "Sint Maarten (Dutch part)", - ], - }, - "USSR": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "members": [ - # Europe - High-income countries. - "Lithuania", - "Estonia", - "Latvia", - # Europe - Upper-middle-income countries. - "Moldova", - "Belarus", - "Russia", - # Europe - Lower-middle-income countries. - "Ukraine", - # Asia - Upper-middle-income countries. - "Georgia", - "Armenia", - "Azerbaijan", - "Turkmenistan", - "Kazakhstan", - # Asia - Lower-middle-income countries. - "Kyrgyzstan", - "Uzbekistan", - "Tajikistan", - ], - }, -} - - -def load_population() -> pd.DataFrame: - """Load OWID population dataset, and add historical regions to it. - - Returns - ------- - population : pd.DataFrame - Population dataset. - - """ - # Load population dataset. - population = catalog.Dataset(DATA_DIR / "garden/owid/latest/key_indicators/")["population"].reset_index()[ - ["country", "year", "population"] - ] - - # Add data for historical regions (if not in population) by adding the population of its current successors. - countries_with_population = population["country"].unique() - missing_countries = [country for country in HISTORIC_TO_CURRENT_REGION if country not in countries_with_population] - for country in missing_countries: - members = HISTORIC_TO_CURRENT_REGION[country]["members"] - _population = ( - population[population["country"].isin(members)] - .groupby("year") - .agg({"population": "sum", "country": "nunique"}) - .reset_index() - ) - # Select only years for which we have data for all member countries. - _population = _population[_population["country"] == len(members)].reset_index(drop=True) - _population["country"] = country - population = pd.concat([population, _population], ignore_index=True).reset_index(drop=True) - - error = "Duplicate country-years found in population. Check if historical regions changed." - assert population[population.duplicated(subset=["country", "year"])].empty, error - - return cast(pd.DataFrame, population) - - -def load_income_groups() -> pd.DataFrame: - """Load dataset of income groups and add historical regions to it. 
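- - Income groups follow the World Bank classification; a historical region (e.g. USSR) is - assigned the income group of the majority of the population of its successor countries.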
- - Returns - ------- - income_groups : pd.DataFrame - Income groups data. - - """ - # Load the World Bank dataset for income groups. - income_groups = catalog.Dataset(DATA_DIR / "garden/wb/2021-07-01/wb_income")["wb_income_group"].reset_index() - - # Add historical regions to income groups. - for historic_region in HISTORIC_TO_CURRENT_REGION: - historic_region_income_group = HISTORIC_TO_CURRENT_REGION[historic_region]["income_group"] - if historic_region not in income_groups["country"].values: - historic_region_df = pd.DataFrame( - { - "country": [historic_region], - "income_group": [historic_region_income_group], - } - ) - income_groups = pd.concat([income_groups, historic_region_df], ignore_index=True) - - return cast(pd.DataFrame, income_groups) - - -def add_population( - df: pd.DataFrame, - country_col: str = "country", - year_col: str = "year", - population_col: str = "population", - warn_on_missing_countries: bool = True, - show_full_warning: bool = True, -) -> pd.DataFrame: - """Add a column of OWID population to the countries in the data, including population of historical regions. - - This function has been adapted from datautils.geo, because population currently does not include historic regions. - We include them in this function. - - Parameters - ---------- - df : pd.DataFrame - Data without a column for population (after harmonizing elements, items and country names). - country_col : str - Name of country column in data. - year_col : str - Name of year column in data. - population_col : str - Name for new population column in data. - warn_on_missing_countries : bool - True to warn if population is not found for any of the countries in the data. - show_full_warning : bool - True to show affected countries if the previous warning is raised. - - Returns - ------- - df_with_population : pd.DataFrame - Data after adding a column for population for all countries in the data. - - """ - - # Load population dataset. - population = load_population().rename( - columns={ - "country": country_col, - "year": year_col, - "population": population_col, - } - )[[country_col, year_col, population_col]] - - # Check if there is any missing country. - missing_countries = set(df[country_col]) - set(population[country_col]) - if len(missing_countries) > 0: - if warn_on_missing_countries: - geo.warn_on_list_of_entities( - list_of_entities=missing_countries, - warning_message=( - f"{len(missing_countries)} countries not found in population" - " dataset. They will remain in the dataset, but have nan" - " population." - ), - show_list=show_full_warning, - ) - - # Add population to original dataframe. - df_with_population = pd.merge(df, population, on=[country_col, year_col], how="left") - - return df_with_population - - -def detect_overlapping_data_for_regions_and_members( - df: pd.DataFrame, - index_columns: List[str], - regions_and_members: Dict[str, Dict[str, Union[str, List[str]]]], - known_overlaps: Optional[List[Dict[str, Union[str, List[int]]]]], - ignore_zeros: bool = True, -) -> None: - """Raise a warning if there is data for a particular region and for a country that is a member of that region. - - For example, if there is data for USSR and Russia on the same years, a warning will be raised. - - Parameters - ---------- - df : pd.DataFrame - Data. - index_columns : list - Names of columns that should be index of the data.
- regions_and_members : dict - Regions and members (where each key corresponds to a region, and each region is a dictionary of various keys, - one of which is 'members', which is a list of member countries). - known_overlaps : list or None - Instances of known overlaps in the data. If this function raises a warning, new instances should be added to the - list. - ignore_zeros : bool - True to consider zeros in the data as missing values. Doing this, if a region has overlapping data with a member - country, but one of their data points is zero, it will not be considered an overlap. - - """ - if known_overlaps is not None: - df = df.copy() - - if ignore_zeros: - # Replace zeros by nans, so that zeros are ignored when looking for overlapping data. - overlapping_values_to_ignore = [0] - else: - overlapping_values_to_ignore = [] - - regions = list(regions_and_members) - for region in regions: - # Create a dataframe with only data for the region, and remove columns that only have nans. - # Optionally, replace zeros by nans, to also remove columns that only have zeros or nans. - region_df = ( - df[df["country"] == region].replace(overlapping_values_to_ignore, np.nan).dropna(axis=1, how="all") - ) - members = regions_and_members[region]["members"] - for member in members: - # Create a dataframe for this particular member country. - member_df = ( - df[df["country"] == member].replace(overlapping_values_to_ignore, np.nan).dropna(axis=1, how="all") - ) - # Find common columns with (non-nan) data between region and member country. - variables = [ - column - for column in (set(region_df.columns) & set(member_df.columns)) - if column not in index_columns - ] - for variable in variables: - # Concatenate region and member country's data for this variable. - combined = ( - pd.concat( - [ - region_df[["year", variable]], - member_df[["year", variable]], - ], - ignore_index=True, - ) - .dropna() - .reset_index(drop=True) - ) - # Find years where region and member country overlap. - overlapping = combined[combined.duplicated(subset="year")] - if not overlapping.empty: - overlapping_years = sorted(set(overlapping["year"])) - new_overlap = { - "region": region, - "member": member, - "years": overlapping_years, - "variable": variable, - } - # Check if the overlap found is already in the list of known overlaps. - # If this overlap is not known, raise a warning. - # Omit the field "entity_to_make_nan" when checking if this overlap is known. - _known_overlaps = [ - {key for key in overlap if key != "entity_to_make_nan"} for overlap in known_overlaps - ] - if new_overlap not in _known_overlaps: # type: ignore - log.warning( - f"Data for '{region}' overlaps with '{member}' on '{variable}' " - f"and years: {overlapping_years}" - ) - - -def remove_overlapping_data_for_regions_and_members( - df: pd.DataFrame, - known_overlaps: Optional[List[Dict[str, Union[str, List[int]]]]], - country_col: str = "country", - year_col: str = "year", - ignore_zeros: bool = True, -) -> pd.DataFrame: - """Check if list of known overlaps between region (e.g. a historical region like the USSR) and a member country (or - a successor country, like Russia) do overlap, and remove them from the data. - - Parameters - ---------- - df : pd.DataFrame - Data. - known_overlaps : list or None - List of known overlaps between region and member country. - country_col : str - Name of country column. - year_col : str - Name of year column. - ignore_zeros : bool - True to ignore columns of zeros when checking if known overlaps are indeed overlaps. 
- - Returns - ------- - df : pd.DataFrame - Data after removing known overlapping rows between a region and a member country. - - """ - if known_overlaps is not None: - df = df.copy() - - if ignore_zeros: - overlapping_values_to_ignore = [0] - else: - overlapping_values_to_ignore = [] - - for i, overlap in enumerate(known_overlaps): - if set([overlap["region"], overlap["member"]]) <= set(df["country"]): - # Check that the known overlap is indeed found in the data. - duplicated_rows = ( - df[(df[country_col].isin([overlap["region"], overlap["member"]]))][ - [country_col, year_col, overlap["variable"]] - ] - .replace(overlapping_values_to_ignore, np.nan) - .dropna(subset=overlap["variable"]) - ) - duplicated_rows = duplicated_rows[duplicated_rows.duplicated(subset="year", keep=False)] - overlapping_years = sorted(set(duplicated_rows["year"])) - if overlapping_years != overlap["years"]: - log.warning(f"Given overlap number {i} is not found in the data; redefine this list.") - # Make nan data points for either the region or the member (which is specified by "entity to make nan"). - indexes_to_make_nan = duplicated_rows[ - duplicated_rows["country"] == overlap[overlap["entity_to_make_nan"]] # type: ignore - ].index.tolist() - df.loc[indexes_to_make_nan, overlap["variable"]] = np.nan - - return df - - -def load_countries_in_regions() -> Dict[str, List[str]]: - """Create a dictionary of regions (continents and income groups) and their member countries. - - Regions to include are defined above, in REGIONS_TO_ADD. - Additional countries are added to regions following the definitions in ADDITIONAL_COUNTRIES_IN_REGIONS. - - Returns - ------- - countries_in_regions : dict - Dictionary of regions, where the value is a list of member countries in the region. - - """ - # Load income groups. - income_groups = load_income_groups() - - countries_in_regions = {} - for region in list(REGIONS_TO_ADD): - # Add default OWID list of countries in region (which includes historical regions). - countries_in_regions[region] = geo.list_countries_in_region(region=region, income_groups=income_groups) - - # Include additional countries in the region (if any given). - for region in ADDITIONAL_COUNTRIES_IN_REGIONS: - countries_in_regions[region] = countries_in_regions[region] + ADDITIONAL_COUNTRIES_IN_REGIONS[region] - - return countries_in_regions - - -def add_region_aggregates( - data: pd.DataFrame, - regions: List[str], - index_columns: List[str], - country_column: str = "country", - year_column: str = "year", - aggregates: Optional[Dict[str, str]] = None, - known_overlaps: Optional[List[Dict[str, Union[str, List[int]]]]] = None, - region_codes: Optional[List[str]] = None, - country_code_column: str = "country_code", -) -> pd.DataFrame: - """Add region aggregates for all regions (which may include continents and income groups). - - Parameters - ---------- - data : pd.DataFrame - Data. - regions : list - Regions to include. - index_columns : list - Name of index columns. - country_column : str - Name of country column. - year_column : str - Name of year column. - aggregates : dict or None - Dictionary of type of aggregation to use for each variable. If None, variables will be aggregated by summing. - known_overlaps : list or None - List of known overlaps between regions and their member countries. - region_codes : list or None - List of country codes for each new region. It must have the same number of elements, and in the same order, as - the 'regions' argument. 
- country_code_column : str - Name of country codes column (only relevant if region_codes is not None). - - Returns - ------- - data : pd.DataFrame - Data after adding aggregate regions. - - """ - data = data.copy() - - if aggregates is None: - # If aggregations are not specified, assume all variables are to be aggregated, by summing. - aggregates = {column: "sum" for column in data.columns if column not in index_columns} - # Get the list of regions to create, and their member countries. - countries_in_regions = load_countries_in_regions() - for region in regions: - # List of countries in region. - countries_in_region = countries_in_regions[region] - # Select rows of data for member countries. - data_region = data[data[country_column].isin(countries_in_region)] - # Remove any known overlaps between regions (e.g. USSR, which is a historical region) in current region (e.g. - # Europe) and their member countries (or successor countries, like Russia). - # If any overlap in known_overlaps is not found, a warning will be raised. - data_region = remove_overlapping_data_for_regions_and_members(df=data_region, known_overlaps=known_overlaps) - - # Check that there are no other overlaps in the data (after having removed the known ones). - detect_overlapping_data_for_regions_and_members( - df=data_region, - regions_and_members=HISTORIC_TO_CURRENT_REGION, - index_columns=index_columns, - known_overlaps=known_overlaps, - ) - - # Add region aggregates. - data_region = geo.add_region_aggregates( - df=data_region, - region=region, - country_col=country_column, - year_col=year_column, - aggregations=aggregates, - countries_in_region=countries_in_region, - countries_that_must_have_data=[], - # Here we allow aggregating even when only a few countries have data (which seems to agree with BP's - # criterion for aggregates). - # However, if absolutely all countries have nan, we want the aggregate to be nan, not zero. - frac_allowed_nans_per_year=0.999, - num_allowed_nans_per_year=None, - ) - data = pd.concat( - [data, data_region[data_region[country_column] == region]], - ignore_index=True, - ).reset_index(drop=True) - - if region_codes is not None: - # Add region codes to regions. - if data[country_code_column].dtype == "category": - data[country_code_column] = data[country_code_column].cat.add_categories(region_codes) - for i, region in enumerate(regions): - data.loc[data[country_column] == region, country_code_column] = region_codes[i] - - return data diff --git a/etl/steps/archive/garden/bp/2022-07-14/statistical_review.meta.yml b/etl/steps/archive/garden/bp/2022-07-14/statistical_review.meta.yml deleted file mode 100644 index 66ca1356bab..00000000000 --- a/etl/steps/archive/garden/bp/2022-07-14/statistical_review.meta.yml +++ /dev/null @@ -1,41 +0,0 @@ -dataset: - namespace: bp - version: 2022-07-14 - title: Statistical Review of World Energy - BP (2022) - short_name: statistical_review - sources: - - - name: Our World in Data based on BP Statistical Review of World Energy (2022) - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - description: | - BP's region definitions sometimes differ from Our World in Data's definitions.
For example, BP's North America includes only Canada, Mexico and United States, whereas Our World in Data's North America includes countries in Central America (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like "North America (BP)" to refer to BP's original data using their definition of the region, as well as "North America", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the contributions from the countries in the region. - - [BP's region definitions](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy/using-the-review/definitions-and-explanatory-notes.html#accordion_Regional%20definitions), denoted with "(BP)", are: - * "Asia Pacific (BP)": Brunei, Cambodia, China (Mainland), China Hong Kong SAR (Special Administrative Region), China Macau SAR (Special Administrative Region), Indonesia, Japan, Laos, Malaysia, Mongolia, North Korea, Philippines, Singapore, South Asia (Afghanistan, Bangladesh, India, Myanmar, Nepal, Pakistan and Sri Lanka), South Korea, Taiwan, Thailand, Vietnam, Australia, New Zealand, Papua New Guinea and Oceania. - * "Australasia (BP)": Australia, New Zealand. - * "CIS (BP)" - Commonwealth of Independent States: Armenia, Azerbaijan, Belarus, Kazakhstan, Kyrgyzstan, Moldova, Russian Federation, Tajikistan, Turkmenistan, Uzbekistan. - * "Caribbean (BP)": Atlantic islands between the US Gulf Coast and South America, including Puerto Rico, US Virgin Islands and Bermuda. - * "Central America (BP)": Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama - * "Eastern Africa (BP)": Territories on the east coast of Africa from Sudan to Mozambique. Also Madagascar, Malawi, Uganda, Zambia, Zimbabwe. - * "Europe (BP)": European members of the OECD plus Albania, Bosnia-Herzegovina, Bulgaria, Croatia, Cyprus, Georgia, Gibraltar, Latvia, Lithuania, Malta, Montenegro, North Macedonia, Romania, Serbia and Ukraine. - * "Middle Africa (BP)": Angola, Cameroon, Central African Republic, Chad, Democratic Republic of Congo, Republic of Congo, Equatorial Guinea, Gabon, Sao Tome & Principe. - * "Middle East (BP)": Arabian Peninsula, Iran, Iraq, Israel, Jordan, Lebanon, Syria. - * "Non-OECD (BP)" - Organization for Economic Co-operation and Development: All countries that are not members of the OECD. - * "North America (BP)": US (excluding US territories), Canada, Mexico - * "Northern Africa (BP)": Territories on the north coast of Africa from Egypt to Western Sahara. - * "OECD (BP)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, UK, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, US. - * "OPEC (BP)" - Organization of the Petroleum Exporting Countries: Iran, Iraq, Kuwait, Saudi Arabia, United Arab Emirates, Algeria, Libya, Angola, Equatorial Guinea, Gabon, Nigeria, Republic of Congo, Venezuela. - * "South and Central America (BP)": Caribbean (including Puerto Rico and US Virgin Islands), Bermuda, Central and South America. - * "Southern Africa (BP)": Botswana, Lesotho, Namibia, South Africa, Swaziland. 
- * "Western Africa (BP)": Territories on the west coast of Africa from Mauritania to Nigeria, including Burkina Faso, Cape Verde, Mali and Niger. - - Additionally, BP includes some regions that are not explicitly defined (e.g. "Other Europe (BP)", or "Other CIS (BP)"). We define our regions in the following way: - * "Africa" - All African countries + "Other Africa (BP)". - * "Asia" - All Asian countries + "Other Middle East (BP)" + "Other CIS (BP)" + "Other Asia Pacific (BP)". - * "Europe" - All European countries + "Other Europe (BP)". - * "North America" - All North American countries + "Other Caribbean (BP)" + "Other North America (BP)". - * "Oceania" - All Oceanian countries. - * "South America" - All South American countries + "Other South America (BP)". - Where the individual countries in each region are defined [in this map](https://ourworldindata.org/world-region-map-definitions). Additional BP regions are ignored, since they belong to other regions already included (e.g. the data for "Other Western Africa (BP)" is included in "Other Africa (BP)"). Finally, income groups are constructed following the definitions [in this map](https://ourworldindata.org/grapher/world-banks-income-groups). diff --git a/etl/steps/archive/garden/bp/2022-07-14/statistical_review.py b/etl/steps/archive/garden/bp/2022-07-14/statistical_review.py deleted file mode 100644 index eeba1407a14..00000000000 --- a/etl/steps/archive/garden/bp/2022-07-14/statistical_review.py +++ /dev/null @@ -1,366 +0,0 @@ -"""Process the BP Statistical Review of World Energy 2022. - -For the moment, this dataset is downloaded and processed by -https://github.com/owid/importers/tree/master/bp_statreview - -However, in this additional step we add region aggregates following OWID definitions of regions. - -""" - -from copy import deepcopy - -import numpy as np -import pandas as pd -from owid import catalog -from shared import ( - CURRENT_DIR, - HISTORIC_TO_CURRENT_REGION, - REGIONS_TO_ADD, - add_region_aggregates, -) - -from etl.helpers import PathFinder -from etl.paths import DATA_DIR - -P = PathFinder(__file__) - -# Namespace and short name for output dataset. -NAMESPACE = "bp" -# Path to metadata file for current dataset. -METADATA_FILE_PATH = CURRENT_DIR / "statistical_review.meta.yml" -# Original BP's Statistical Review dataset name in the OWID catalog (without the institution and year). -BP_CATALOG_NAME = "statistical_review_of_world_energy" -BP_BACKPORTED_DATASET_NAME = "dataset_5650_statistical_review_of_world_energy__bp__2022" -BP_NAMESPACE_IN_CATALOG = "bp_statreview" -BP_VERSION = 2022 -# Path to previous (processed) Statistical Review dataset. -BP_DATASET_OLD_PATH = DATA_DIR / "garden" / "bp" / "2022-07-11" / "statistical_review" - -# List of known overlaps between regions and member countries (or successor countries). -OVERLAPPING_DATA_TO_REMOVE_IN_AGGREGATES = [ - { - "region": "USSR", - "member": "Russia", - "entity_to_make_nan": "region", - "years": [1991, 1992, 1993, 1994, 1995, 1996], - "variable": "Gas - Proved reserves", - } -] - -# In the new dataset there are many missing values that were not missing in the previous release. -# We use data from the previous dataset to fill those gaps. -# However, we avoid certain countries in regions: -# * All region aggregates, since aggregates each year may include data from different countries. -# * USSR and their successor countries, since, in the old dataset, there is plenty of overlap between them (which does -# not happen in the new release). 
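The gap-filling rule described in the comment above can be sketched as follows. This is an illustrative helper, not the step's actual code: it fills missing cells in the new release with values from the 2022-07-11 release, while skipping the entities listed in COUNTRIES_TO_AVOID_WHEN_FILLING_NANS_WITH_PREVIOUS_RELEASE (defined next).

```python
import pandas as pd


def fill_nans_from_previous_release(
    new: pd.DataFrame, old: pd.DataFrame, countries_to_avoid: list
) -> pd.DataFrame:
    """Fill gaps in `new` with values from `old`, except for excluded entities."""
    # Never backfill region aggregates or USSR-related entities (see comment above).
    old = old[~old["country"].isin(countries_to_avoid)]
    new = new.set_index(["country", "year"])
    old = old.set_index(["country", "year"])
    # Restrict to country-years present in the new release, then fill cell-wise,
    # letting values from the new release take precedence.
    old = old.reindex(new.index)
    return new.combine_first(old).reset_index()
```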
-COUNTRIES_TO_AVOID_WHEN_FILLING_NANS_WITH_PREVIOUS_RELEASE = ( - list(REGIONS_TO_ADD) + ["USSR"] + HISTORIC_TO_CURRENT_REGION["USSR"]["members"] # type: ignore -) - -# True to ignore zeros when checking for overlaps between regions and member countries. -# This means that, if a region (e.g. USSR) and a member country or successor country (e.g. Russia) overlap, but in a -# variable that only has zeros, it will not be considered an overlap. -IGNORE_ZEROS_WHEN_CHECKING_FOR_OVERLAPPING_DATA = True - -# Variables that can be summed when constructing region aggregates. -# Biofuels in Africa have a non-zero total, while there is no contribution from African countries. -# This would cause our aggregate for 'Africa' to be zero, while the original 'Africa (BP)' is not. -# Also, biodiesels are only given for continents and a few countries. -# For this reason we avoid creating aggregates for biofuels and biodiesels. -AGGREGATES_BY_SUM = [ - "Carbon Dioxide Emissions", - "Coal - Reserves - Anthracite and bituminous", - "Coal - Reserves - Sub-bituminous and lignite", - "Coal - Reserves - Total", - "Coal Consumption - EJ", - "Coal Consumption - TWh", - "Coal Production - EJ", - "Coal Production - TWh", - "Coal Production - Tonnes", - "Cobalt Production-Reserves", - "Elec Gen from Coal", - "Elec Gen from Gas", - "Elec Gen from Oil", - "Electricity Generation", - "Gas - Proved reserves", - "Gas Consumption - Bcf", - "Gas Consumption - Bcm", - "Gas Consumption - EJ", - "Gas Consumption - TWh", - "Gas Production - Bcf", - "Gas Production - Bcm", - "Gas Production - EJ", - "Gas Production - TWh", - "Geo Biomass Other - EJ", - "Geo Biomass Other - TWh", - "Graphite Production-Reserves", - "Hydro Consumption - EJ", - "Hydro Consumption - TWh", - "Hydro Generation - TWh", - "Lithium Production-Reserves", - "Nuclear Consumption - EJ", - "Nuclear Consumption - TWh", - "Nuclear Generation - TWh", - "Oil - Proved reserves", - "Oil - Refinery throughput", - "Oil - Refining capacity", - "Oil Consumption - Barrels", - "Oil Consumption - EJ", - "Oil Consumption - TWh", - "Oil Consumption - Tonnes", - "Oil Production - Barrels", - "Oil Production - Crude Conds", - "Oil Production - NGLs", - "Oil Production - TWh", - "Oil Production - Tonnes", - "Primary Energy Consumption - EJ", - "Primary Energy Consumption - TWh", - "Renewables Consumption - EJ", - "Renewables Consumption - TWh", - "Renewables Power - EJ", - "Renewables power - TWh", - "Solar Capacity", - "Solar Consumption - EJ", - "Solar Consumption - TWh", - "Solar Generation - TWh", - "Total Liquids - Consumption", - "Wind Capacity", - "Wind Consumption - EJ", - "Wind Consumption - TWh", - "Wind Generation - TWh", - # 'Biofuels Consumption - Kboed - Total', - # 'Biofuels Consumption - Kboed - Biodiesel', - # 'Biofuels Consumption - PJ - Total', - # 'Biofuels Consumption - PJ - Biodiesel', - # 'Biofuels Consumption - TWh - Total', - # 'Biofuels Consumption - TWh - Biodiesel', - # 'Biofuels Consumption - TWh - Biodiesel (zero filled)', - # 'Biofuels Consumption - TWh - Total (zero filled)', - # 'Biofuels Production - Kboed - Total', - # 'Biofuels Production - PJ - Total', - # 'Biofuels Production - TWh - Total', - # 'Biofuels Production - Kboed - Biodiesel', - # 'Biofuels Production - PJ - Biodiesel', - # 'Biofuels Production - TWh - Biodiesel', - # 'Coal - Prices', - # 'Coal Consumption - TWh (zero filled)', - # 'Gas - Prices', - # 'Gas Consumption - TWh (zero filled)', - # 'Geo Biomass Other - TWh (zero filled)', - # 'Hydro Consumption - TWh (zero filled)', - # 
'Nuclear Consumption - TWh (zero filled)', - # 'Oil - Crude prices since 1861 (2021 $)', - # 'Oil - Crude prices since 1861 (current $)', - # 'Oil - Spot crude prices', - # 'Oil Consumption - TWh (zero filled)', - # 'Primary Energy - Cons capita', - # 'Rare Earth Production-Reserves', - # 'Solar Consumption - TWh (zero filled)', - # 'Wind Consumption - TWh (zero filled)', -] - - -def prepare_output_table(df: pd.DataFrame, bp_table: catalog.Table) -> catalog.Table: - """Create a table with the processed data, ready to be in a garden dataset and to be uploaded to grapher (although - additional metadata may need to be added to the table). - - Parameters - ---------- - df : pd.DataFrame - Processed BP data. - bp_table : catalog.Table - Original table of BP statistical review data (used to transfer its metadata to the new table). - - Returns - ------- - table : catalog.Table - Table, ready to be added to a new garden dataset. - - """ - # Create new table. - table = catalog.Table(df).copy() - - # Replace spurious inf values by nan. - table = table.replace([np.inf, -np.inf], np.nan) - - # Sort conveniently and add an index. - table = ( - table.sort_values(["country", "year"]) - .reset_index(drop=True) - .set_index(["country", "year"], verify_integrity=True) - .astype({"country_code": "category"}) - ) - - # Convert column names to lower, snake case. - table = catalog.utils.underscore_table(table) - - # Get the table metadata from the original table. - table.metadata = deepcopy(bp_table.metadata) - - # Get the metadata of each variable from the original table. - for column in table.drop(columns="country_code").columns: - table[column].metadata = deepcopy(bp_table[column].metadata) - - return table - - -def fill_missing_values_with_previous_version(table: catalog.Table, table_old: catalog.Table) -> catalog.Table: - """Fill missing values in current data with values from the previous version of the dataset. - - Parameters - ---------- - table : catalog.Table - Processed data from current dataset. - table_old : catalog.Table - Processed data from previous dataset. - - Returns - ------- - combined : catalog.Table - Combined table, with data from the current dataset, but after filling missing values with data from the previous - version of the dataset. - - """ - # Remove region aggregates from the old table. - table_old = table_old.reset_index().drop(columns="country_code") - table_old = ( - table_old[~table_old["country"].isin(COUNTRIES_TO_AVOID_WHEN_FILLING_NANS_WITH_PREVIOUS_RELEASE)] - .reset_index(drop=True) - .set_index(["country", "year"]) - ) - - # We should only merge on columns that exist in the new dataset. - # If we merge on all columns, we would get old columns from the old dataset that do not exist in the current one. - # This could be thought of as a positive outcome, but we avoid it because: - # * It would be misleading to claim that this data is from BP Statistical Review (2022), since it would be - # a mix of different releases. - # * By doing an outer join, some countries in the old dataset that may not be present in the current dataset - # may be added (e.g. Kenya and Ethiopia were present in the 2021 release because they had data for - # geothermal_capacity, but they are not included in the 2022 release, since they don't have data for any other - # variable). This could lead to unharmonized country names appearing in the current dataset. - # Combine the current output table with the table from the previous version of the dataset. 
- combined = pd.merge( - table, - table_old[[column for column in table_old.columns if column in table.columns]], - left_index=True, - right_index=True, - how="left", - suffixes=("", "_old"), - ) - - # List the common columns that can be filled with values from the previous version. - columns = [column for column in combined.columns if column.endswith("_old")] - - # Fill missing values in the current table with values from the old table. - for column_old in columns: - column = column_old.replace("_old", "") - combined[column] = combined[column].fillna(combined[column_old]) - - # Remove columns from the old table. - combined = combined.drop(columns=columns) - - # Transfer metadata from the table of the current dataset into the combined table. - combined.metadata = deepcopy(table.metadata) - # When that is not possible (for columns that were only in the old but not in the new table), - # get the metadata from the old table. - - for column in combined.columns: - try: - combined[column].metadata = deepcopy(table[column].metadata) - except KeyError: - combined[column].metadata = deepcopy(table_old[column].metadata) - - # Sanity checks. - assert len(combined) == len(table) - assert set(table.columns) <= set(combined.columns) - - return combined - - -def amend_zero_filled_variables_for_region_aggregates(df: pd.DataFrame) -> pd.DataFrame: - """Fill the "* (zero filled)" variables (which were ignored when creating aggregates) with the new aggregate data, - and fill any possible nan with zeros. - - Parameters - ---------- - df : pd.DataFrame - Data after having created region aggregates (which ignore '* (zero filled)' variables). - - Returns - ------- - df : pd.DataFrame - Data after amending zero filled variables for region aggregates. - - """ - df = df.copy() - - zero_filled_variables = [column for column in df.columns if "(zero filled)" in column] - original_variables = [column.replace(" (zero filled)", "") for column in df.columns if "(zero filled)" in column] - select_regions = df["country"].isin(REGIONS_TO_ADD) - df.loc[select_regions, zero_filled_variables] = df[select_regions][original_variables].fillna(0).values - - return df - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load table from latest BP dataset. - bp_ds: catalog.Dataset = P.load_dependency(BP_BACKPORTED_DATASET_NAME) - bp_table = bp_ds[BP_BACKPORTED_DATASET_NAME] - - # Load previous version of the BP statistical review dataset, that will be used at the end to fill missing values in the - # current dataset. - bp_dataset_old = catalog.Dataset(BP_DATASET_OLD_PATH) - bp_table_old = bp_dataset_old[bp_dataset_old.table_names[0]] - - # - # Process data. - # - # Extract dataframe of BP data from table. - bp_data = ( - pd.DataFrame(bp_table) - .reset_index() - .rename(columns={column: bp_table[column].metadata.title for column in bp_table.columns}) - .rename(columns={"entity_name": "country", "entity_code": "country_code"}) - .drop(columns="entity_id") - ) - - # Add region aggregates. - df = add_region_aggregates( - data=bp_data, - regions=list(REGIONS_TO_ADD), - index_columns=["country", "year", "country_code"], - country_column="country", - year_column="year", - aggregates={column: "sum" for column in AGGREGATES_BY_SUM}, - known_overlaps=OVERLAPPING_DATA_TO_REMOVE_IN_AGGREGATES, # type: ignore - region_codes=[REGIONS_TO_ADD[region]["country_code"] for region in REGIONS_TO_ADD], - ) - - # Fill nans with zeros for "* (zero filled)" variables for region aggregates (which were ignored). 
- df = amend_zero_filled_variables_for_region_aggregates(df) - - # Prepare output data in a convenient way. - table = prepare_output_table(df, bp_table) - - # Fill missing values in current table with values from the previous dataset, when possible. - table = fill_missing_values_with_previous_version(table=table, table_old=bp_table_old) - - # - # Save outputs. - # - # Initialize new garden dataset. - dataset = catalog.Dataset.create_empty(dest_dir) - # Add metadata to dataset. - dataset.metadata.update_from_yaml(METADATA_FILE_PATH) - # Create new dataset in garden. - dataset.save() - - # Add table to the dataset. - table.metadata.title = dataset.metadata.title - table.metadata.description = dataset.metadata.description - table.metadata.short_name = dataset.metadata.short_name - table.metadata.primary_key = list(table.index.names) - dataset.add(table, repack=True) diff --git a/etl/steps/archive/garden/bp/2022-09-19/fossil_fuel_reserves_production_ratio.meta.yml b/etl/steps/archive/garden/bp/2022-09-19/fossil_fuel_reserves_production_ratio.meta.yml deleted file mode 100644 index e6184da3aa4..00000000000 --- a/etl/steps/archive/garden/bp/2022-09-19/fossil_fuel_reserves_production_ratio.meta.yml +++ /dev/null @@ -1,53 +0,0 @@ -dataset: - namespace: bp - version: 2022-09-19 - title: Fossil fuel reserves/production ratio - short_name: fossil_fuel_reserves_production_ratio - description: | - The Reserves-to-Production (R/P) Ratio measures the number of years of fuel supplies left based on current annual consumption rates. Note that this can change through time through the discovery of new fuel reserves, and increases in annual consumption. - sources: - - - name: Our World in Data based on BP Statistical Review of World Energy (2022) - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - -tables: - fossil_fuel_reserves_production_ratio: - variables: - coal_left: - title: Coal reserves/production ratio - short_unit: years - unit: years - coal_production: - title: Coal annual production - short_unit: t - unit: tonnes - coal_reserves: - title: Global reserves of coal - short_unit: t - unit: tonnes - gas_left: - title: Gas reserves/production ratio - short_unit: years - unit: years - gas_production: - title: Gas annual production - short_unit: cubic meters - unit: cubic meters - gas_reserves: - title: Global reserves of gas - short_unit: cubic meters - unit: cubic meters - oil_left: - title: Oil reserves/production ratio - short_unit: years - unit: years - oil_production: - title: Oil annual production - short_unit: t - unit: tonnes - oil_reserves: - title: Global reserves of oil - short_unit: t - unit: tonnes diff --git a/etl/steps/archive/garden/bp/2022-09-19/fossil_fuel_reserves_production_ratio.py b/etl/steps/archive/garden/bp/2022-09-19/fossil_fuel_reserves_production_ratio.py deleted file mode 100644 index ce02dc64d85..00000000000 --- a/etl/steps/archive/garden/bp/2022-09-19/fossil_fuel_reserves_production_ratio.py +++ /dev/null @@ -1,100 +0,0 @@ -import pandas as pd -from owid import catalog -from shared import CURRENT_DIR - -from etl.paths import DATA_DIR - -# Details for dataset to export. -DATASET_SHORT_NAME = "fossil_fuel_reserves_production_ratio" -DATASET_TITLE = "Fossil fuel reserves/production ratio" -METADATA_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.meta.yml" -# Path to BP statistical review dataset to import. 
-BP_DATASET_PATH = DATA_DIR / "garden/bp/2022-07-14/statistical_review" - - -def prepare_bp_data(tb_bp: catalog.Table) -> catalog.Table: - # Prepare BP data. - columns = { - "country": "country", - "year": "year", - "coal__reserves__total": "coal_reserves", - "coal_production__tonnes": "coal_production", - "oil__proved_reserves": "oil_reserves", - "oil_production__barrels": "oil_production", - "gas__proved_reserves": "gas_reserves", - "gas_production__bcm": "gas_production", - } - df_bp = pd.DataFrame(tb_bp).reset_index()[list(columns)].rename(columns=columns) - - # Select only global data. - df_bp = df_bp[df_bp["country"] == "World"].reset_index(drop=True) - - # Check that the units are the expected ones. - assert tb_bp["coal__reserves__total"].metadata.unit == "Million tonnes" - assert tb_bp["coal_production__tonnes"].metadata.unit == "Million tonnes" - # WARNING: Here the "unit" metadata field seems to be wrong, it should be billion barrels. - assert tb_bp["oil__proved_reserves"].metadata.unit == "Barrels" - assert tb_bp["oil_production__barrels"].metadata.unit == "Thousand barrels per day" - assert tb_bp["gas__proved_reserves"].metadata.unit == "Trillion cubic metres" - assert tb_bp["gas_production__bcm"].metadata.unit == "Billion cubic metres" - - # Convert to tonnes. - # Million tonnes to tonnes. - df_bp["coal_reserves"] *= 1e6 - # Million tonnes to tonnes. - df_bp["coal_production"] *= 1e6 - # Billion barrels to tonnes. - df_bp["oil_reserves"] *= 1e9 * 0.1364 - # Thousand barrels per day to tonnes per year. - df_bp["oil_production"] *= 1e3 * 365 * 0.1364 - # Trillion cubic meters to cubic meters. - df_bp["gas_reserves"] *= 1e12 - # Billion cubic meters to cubic meters. - df_bp["gas_production"] *= 1e9 - - # Create columns for reserves-production ratio (measured in years of fossil fuels left). - df_bp["coal_left"] = df_bp["coal_reserves"] / df_bp["coal_production"] - df_bp["oil_left"] = df_bp["oil_reserves"] / df_bp["oil_production"] - df_bp["gas_left"] = df_bp["gas_reserves"] / df_bp["gas_production"] - - # Set index, drop rows that only have nans, and sort conveniently. - df_bp = df_bp.set_index(["country", "year"]).dropna(how="all").sort_index().sort_index(axis=1) - - # Create a new table. - tb = catalog.Table(df_bp) - - return tb - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read all required datasets. - ds_bp = catalog.Dataset(BP_DATASET_PATH) - - # Gather all required tables from all datasets. - tb_bp = ds_bp[ds_bp.table_names[0]] - - # - # Process data. - # - # Prepare BP data. - tb = prepare_bp_data(tb_bp=tb_bp) - - # - # Save outputs. - # - ds_garden = catalog.Dataset.create_empty(dest_dir) - # Get the rest of the metadata from the yaml file. - ds_garden.metadata.update_from_yaml(METADATA_PATH) - # Create dataset. - ds_garden.save() - - # Add other metadata fields to table. - tb.metadata.short_name = DATASET_SHORT_NAME - tb.metadata.title = DATASET_TITLE - tb.update_metadata_from_yaml(METADATA_PATH, DATASET_SHORT_NAME) - - # Add combined tables to the new dataset. 
- ds_garden.add(tb) diff --git a/etl/steps/archive/garden/bp/2022-09-19/shared.py b/etl/steps/archive/garden/bp/2022-09-19/shared.py deleted file mode 100644 index 7e7f4d18c5b..00000000000 --- a/etl/steps/archive/garden/bp/2022-09-19/shared.py +++ /dev/null @@ -1,3 +0,0 @@ -from pathlib import Path - -CURRENT_DIR = Path(__file__).parent diff --git a/etl/steps/archive/garden/bp/2022-12-28/energy_mix.meta.yml b/etl/steps/archive/garden/bp/2022-12-28/energy_mix.meta.yml deleted file mode 100644 index 2d10be74f44..00000000000 --- a/etl/steps/archive/garden/bp/2022-12-28/energy_mix.meta.yml +++ /dev/null @@ -1,58 +0,0 @@ -dataset: - namespace: bp - version: 2022-12-28 - title: Energy mix (BP, 2022) - short_name: energy_mix - sources: - - - name: Our World in Data based on BP Statistical Review of World Energy (2022) - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - description: | - BP's region definitions sometimes differ from Our World in Data's definitions. For example, BP's North America includes only Canada, Mexico and United States, whereas Our World in Data's North America includes countries in Central America (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like "North America (BP)" to refer to BP's original data using their definition of the region, as well as "North America", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the contributions from the countries in the region. - - [BP's region definitions](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy/using-the-review/definitions-and-explanatory-notes.html#accordion_Regional%20definitions), denoted with "(BP)", are: - * "Asia Pacific (BP)": Brunei, Cambodia, China (Mainland), China Hong Kong SAR (Special Administrative Region), China Macau SAR (Special Administrative Region), Indonesia, Japan, Laos, Malaysia, Mongolia, North Korea, Philippines, Singapore, South Asia (Afghanistan, Bangladesh, India, Myanmar, Nepal, Pakistan and Sri Lanka), South Korea, Taiwan, Thailand, Vietnam, Australia, New Zealand, Papua New Guinea and Oceania. - * "Australasia (BP)": Australia, New Zealand. - * "CIS (BP)" - Commonwealth of Independent States: Armenia, Azerbaijan, Belarus, Kazakhstan, Kyrgyzstan, Moldova, Russian Federation, Tajikistan, Turkmenistan, Uzbekistan. - * "Caribbean (BP)": Atlantic islands between the US Gulf Coast and South America, including Puerto Rico, US Virgin Islands and Bermuda. - * "Central America (BP)": Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama - * "Eastern Africa (BP)": Territories on the east coast of Africa from Sudan to Mozambique. Also Madagascar, Malawi, Uganda, Zambia, Zimbabwe. - * "Europe (BP)": European members of the OECD plus Albania, Bosnia-Herzegovina, Bulgaria, Croatia, Cyprus, Georgia, Gibraltar, Latvia, Lithuania, Malta, Montenegro, North Macedonia, Romania, Serbia and Ukraine. - * "Middle Africa (BP)": Angola, Cameroon, Central African Republic, Chad, Democratic Republic of Congo, Republic of Congo, Equatorial Guinea, Gabon, Sao Tome & Principe. - * "Middle East (BP)": Arabian Peninsula, Iran, Iraq, Israel, Jordan, Lebanon, Syria. 
- * "Non-OECD (BP)" - Organization for Economic Co-operation and Development: All countries that are not members of the OECD. - * "North America (BP)": US (excluding US territories), Canada, Mexico - * "Northern Africa (BP)": Territories on the north coast of Africa from Egypt to Western Sahara. - * "OECD (BP)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, UK, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, US. - * "OPEC (BP)" - Organization of the Petroleum Exporting Countries: Iran, Iraq, Kuwait, Saudi Arabia, United Arab Emirates, Algeria, Libya, Angola, Equatorial Guinea, Gabon, Nigeria, Republic of Congo, Venezuela. - * "South and Central America (BP)": Caribbean (including Puerto Rico and US Virgin Islands), Bermuda, Central and South America. - * "Southern Africa (BP)": Botswana, Lesotho, Namibia, South Africa, Swaziland. - * "Western Africa (BP)": Territories on the west coast of Africa from Mauritania to Nigeria, including Burkina Faso, Cape Verde, Mali and Niger. - - Additionally, BP includes some regions that are not explicitly defined (e.g. "Other Europe", or "Other CIS"). We define our regions in the following way: - * "Africa" - All African countries + "Other Africa". - * "Asia" - All Asian countries + "Other Middle East" + "Other CIS" + "Other Asia Pacific". - * "Europe" - All European countries + "Other Europe". - * "North America" - All North American countries + "Other Caribbean" + "Other North America". - * "Oceania" - All Oceanian countries. - * "South America" - All South American countries + "Other South America". - Where the individual countries in each region are defined [in this map](https://ourworldindata.org/world-region-map-definitions). Additional BP regions are ignored, since they belong to other regions already included (e.g. the data for "Other Western Africa" is included in "Other Africa"). Finally, income groups are constructed following the definitions [in this map](https://ourworldindata.org/grapher/world-banks-income-groups). - description: | - Raw data on energy consumption is sourced from [the BP Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html). - - Primary energy in exajoules (EJ) has been converted to TWh by Our World in Data based on a conversion factor of 1,000,000 / 3,600 (~277.778). - - For non-fossil based electricity sources (nuclear, hydro, wind, solar, geothermal, biomass in power, and other renewable sources), BP's generation (in TWh) corresponds to gross generation and not accounting for cross-border electricity supply. - Also, for non-fossil based electricity, there are two ways to define primary energy: - * One is "direct primary energy", which corresponds to the electricity generation (in TWh). - * The other is "input-equivalent primary energy" (also called "primary energy using the substitution method"). - This is the amount of fuel that would be required by thermal power stations to generate the reported electricity, as explained in [BP's methodology document](https://www.bp.com/content/dam/bp/business-sites/en/global/corporate/pdfs/energy-economics/statistical-review/bp-stats-review-2022-methodology.pdf). 
For example, if a country's nuclear power generated 100 TWh of electricity, and assuming that the efficiency of a standard thermal power plant is 38%, the input equivalent primary energy for this country would be 100 TWh / 0.38 = 263 TWh = 0.95 EJ. This input-equivalent primary energy takes account of the inefficiencies in fossil fuel production and provides a better approximation of each source's share of "final energy" consumption. - - Additional metrics have been calculated by Our World in Data: - - Annual change in energy consumption by source: this is calculated as the difference from the previous year. - - % of total primary energy: calculated as each source's share of primary energy (direct energy and primary energy using the substitution method) from all sources. - - Per capita energy by source: calculated as primary energy consumption by source, divided by population. - - Per capita figures have been calculated using a population dataset that is built and maintained by Our World in Data, based on [different sources](https://ourworldindata.org/population-sources). diff --git a/etl/steps/archive/garden/bp/2022-12-28/energy_mix.py b/etl/steps/archive/garden/bp/2022-12-28/energy_mix.py deleted file mode 100644 index 3ac06563796..00000000000 --- a/etl/steps/archive/garden/bp/2022-12-28/energy_mix.py +++ /dev/null @@ -1,468 +0,0 @@ -"""Generate BP energy mix 2022 dataset using data from BP's statistical review of the world energy. - -""" - -import numpy as np -import pandas as pd -from owid import catalog -from shared import CURRENT_DIR, add_population - -from etl.paths import DATA_DIR - -# Namespace, dataset short name, and version of the garden dataset of the BP statistical review. -STAT_REVIEW_NAMESPACE = "bp" -STAT_REVIEW_SHORT_NAME = "statistical_review" -STAT_REVIEW_VERSION = "2022-12-28" -# Path to metadata file. -METADATA_FILE_PATH = CURRENT_DIR / "energy_mix.meta.yml" - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 -# Exajoules to terawatt-hours. -EJ_TO_TWH = 1e6 / 3600 -# Petajoules to exajoules. -PJ_TO_EJ = 1e-3 - -# List all energy sources in the data. -ONLY_DIRECT_ENERGY = ["Coal", "Fossil fuels", "Gas", "Oil", "Biofuels"] -DIRECT_AND_EQUIVALENT_ENERGY = [ - "Hydro", - "Low-carbon energy", - "Nuclear", - "Other renewables", - "Renewables", - "Solar", - "Wind", -] -ALL_SOURCES = sorted(ONLY_DIRECT_ENERGY + DIRECT_AND_EQUIVALENT_ENERGY) - - -def get_bp_data(bp_table: catalog.Table) -> pd.DataFrame: - """Extract a simple dataframe of BP statistical review data from the table in the dataset. - - Parameters - ---------- - bp_table : catalog.Table - BP table (from the dataset of BP statistical review). - - Returns - ------- - bp_data : pd.DataFrame - BP statistical review data. - - """ - bp_table = bp_table.copy() - - # Convert table (snake case) column names to human readable names. - bp_table = bp_table.rename( - columns={column: bp_table[column].metadata.title for column in bp_table.columns if column != "country_code"} - ).reset_index() - - # Rename human-readable columns (and select only the ones that will be used). - columns = { - "country": "Country", - "country_code": "Country code", - "year": "Year", - # Fossil fuel primary energy (in EJ). - "Coal Consumption - EJ": "Coal (EJ)", - "Gas Consumption - EJ": "Gas (EJ)", - "Oil Consumption - EJ": "Oil (EJ)", - # Non-fossil based electricity generation (in TWh). 
- "Hydro Generation - TWh": "Hydro (TWh - direct)", - "Nuclear Generation - TWh": "Nuclear (TWh - direct)", - "Solar Generation - TWh": "Solar (TWh - direct)", - "Wind Generation - TWh": "Wind (TWh - direct)", - "Geo Biomass Other - TWh": "Other renewables (TWh - direct)", - # Non-fossil based electricity generation converted into input-equivalent primary energy (in EJ). - "Hydro Consumption - EJ": "Hydro (EJ - equivalent)", - "Nuclear Consumption - EJ": "Nuclear (EJ - equivalent)", - "Solar Consumption - EJ": "Solar (EJ - equivalent)", - "Wind Consumption - EJ": "Wind (EJ - equivalent)", - "Geo Biomass Other - EJ": "Other renewables (EJ - equivalent)", - # Total, input-equivalent primary energy consumption (in EJ). - "Primary Energy Consumption - EJ": "Primary energy (EJ - equivalent)", - # Biofuels consumption (in PJ, that will be converted into EJ). - "Biofuels Consumption - PJ - Total": "Biofuels (PJ)", - } - - # Create a simple dataframe (without metadata and with a dummy index). - assert set(columns) < set(bp_table.columns), "Column names have changed in BP data." - - bp_data = pd.DataFrame(bp_table)[list(columns)].rename(errors="raise", columns=columns) - - return bp_data - - -def _check_that_substitution_method_is_well_calculated( - primary_energy: pd.DataFrame, -) -> None: - # Check that the constructed primary energy using the substitution method (in TWh) coincides with the - # input-equivalent primary energy (converted from EJ into TWh) given in the original data. - check = primary_energy[ - [ - "Year", - "Country", - "Primary energy (EJ - equivalent)", - "Primary energy (TWh - equivalent)", - ] - ].reset_index(drop=True) - check["Primary energy (TWh - equivalent) - original"] = check["Primary energy (EJ - equivalent)"] * EJ_TO_TWH - check = check.dropna().reset_index(drop=True) - # They may not coincide exactly, but at least check that they differ (point by point) by less than 10%. - max_deviation = max( - abs( - (check["Primary energy (TWh - equivalent)"] - check["Primary energy (TWh - equivalent) - original"]) - / check["Primary energy (TWh - equivalent) - original"] - ) - ) - assert max_deviation < 0.1 - - -def calculate_direct_primary_energy(primary_energy: pd.DataFrame) -> pd.DataFrame: - """Convert direct primary energy into TWh and create various aggregates (e.g. Fossil fuels and Renewables). - - Parameters - ---------- - primary_energy : pd.DataFrame - BP data. - - Returns - ------- - primary_energy : pd.DataFrame - Data, after adding direct primary energy. - - """ - primary_energy = primary_energy.copy() - - # Convert units of biofuels consumption. - primary_energy["Biofuels (EJ)"] = primary_energy["Biofuels (PJ)"] * PJ_TO_EJ - - # Create column for fossil fuels primary energy (if any of them is nan, the sum will be nan). - primary_energy["Fossil fuels (EJ)"] = ( - primary_energy["Coal (EJ)"] + primary_energy["Oil (EJ)"] + primary_energy["Gas (EJ)"] - ) - - # Convert primary energy of fossil fuels and biofuels into TWh. - for cat in ["Coal", "Oil", "Gas", "Biofuels"]: - primary_energy[f"{cat} (TWh)"] = primary_energy[f"{cat} (EJ)"] * EJ_TO_TWH - - # Create column for primary energy from fossil fuels (in TWh). - primary_energy["Fossil fuels (TWh)"] = ( - primary_energy["Coal (TWh)"] + primary_energy["Oil (TWh)"] + primary_energy["Gas (TWh)"] - ) - - # Create column for direct primary energy from renewable sources in TWh. - # (total renewable electricity generation and biofuels) (in TWh). 
- # By visually inspecting the original data, it seems that many data points that used to be zero are - # missing in the 2022 release, so filling nan with zeros seems to be a reasonable approach to avoid losing a - # significant amount of data. - primary_energy["Renewables (TWh - direct)"] = ( - primary_energy["Hydro (TWh - direct)"] - + primary_energy["Solar (TWh - direct)"].fillna(0) - + primary_energy["Wind (TWh - direct)"].fillna(0) - + primary_energy["Other renewables (TWh - direct)"].fillna(0) - + primary_energy["Biofuels (TWh)"].fillna(0) - ) - # Create column for direct primary energy from low-carbon sources in TWh. - # (total renewable electricity generation, biofuels, and nuclear power) (in TWh). - primary_energy["Low-carbon energy (TWh - direct)"] = primary_energy["Renewables (TWh - direct)"] + primary_energy[ - "Nuclear (TWh - direct)" - ].fillna(0) - # Create column for total direct primary energy. - primary_energy["Primary energy (TWh - direct)"] = ( - primary_energy["Fossil fuels (TWh)"] + primary_energy["Low-carbon energy (TWh - direct)"] - ) - - return primary_energy - - -def calculate_equivalent_primary_energy(primary_energy: pd.DataFrame) -> pd.DataFrame: - """Convert input-equivalent primary energy into TWh and create various aggregates (e.g. Fossil fuels and - Renewables). - - Parameters - ---------- - primary_energy : pd.DataFrame - BP data. - - Returns - ------- - primary_energy : pd.DataFrame - Data, after adding input-equivalent primary energy. - - """ - primary_energy = primary_energy.copy() - # Create column for total renewable input-equivalent primary energy (in EJ). - # Fill missing values with zeros (see comment above). - primary_energy["Renewables (EJ - equivalent)"] = ( - primary_energy["Hydro (EJ - equivalent)"] - + primary_energy["Solar (EJ - equivalent)"].fillna(0) - + primary_energy["Wind (EJ - equivalent)"].fillna(0) - + primary_energy["Other renewables (EJ - equivalent)"].fillna(0) - + primary_energy["Biofuels (EJ)"].fillna(0) - ) - # Create column for low carbon energy (i.e. renewable plus nuclear energy). - primary_energy["Low-carbon energy (EJ - equivalent)"] = primary_energy[ - "Renewables (EJ - equivalent)" - ] + primary_energy["Nuclear (EJ - equivalent)"].fillna(0) - # Convert input-equivalent primary energy of non-fossil based electricity into TWh. - # The result is primary energy using the "substitution method". - for cat in DIRECT_AND_EQUIVALENT_ENERGY: - primary_energy[f"{cat} (TWh - equivalent)"] = primary_energy[f"{cat} (EJ - equivalent)"] * EJ_TO_TWH - # Create column for primary energy from all sources (which corresponds to input-equivalent primary - # energy for non-fossil based sources). - primary_energy["Primary energy (TWh - equivalent)"] = ( - primary_energy["Fossil fuels (TWh)"] + primary_energy["Low-carbon energy (TWh - equivalent)"] - ) - # Check that the primary energy constructed using the substitution method coincides with the - # input-equivalent primary energy. - _check_that_substitution_method_is_well_calculated(primary_energy) - - return primary_energy - - -def calculate_share_of_primary_energy(primary_energy: pd.DataFrame) -> pd.DataFrame: - """Calculate the share (percentage) of (direct or direct and input-equivalent) primary energy for each energy - source. - - Parameters - ---------- - primary_energy : pd.DataFrame - BP data. - - Returns - ------- - primary_energy : pd.DataFrame - BP data after adding columns for the share of primary energy. 
- - """ - primary_energy = primary_energy.copy() - # Check that all sources are included in the data. - expected_sources = sorted( - set( - [ - source.split("(")[0].strip() - for source in primary_energy.columns - if not source.startswith(("Country", "Year", "Primary")) - ] - ) - ) - assert expected_sources == ALL_SOURCES, "Sources may have changed names." - - for source in ONLY_DIRECT_ENERGY: - # Calculate each source as share of direct primary energy. - primary_energy[f"{source} (% direct primary energy)"] = ( - primary_energy[f"{source} (TWh)"] / primary_energy["Primary energy (TWh - direct)"] * 100 - ) - # Calculate each source as share of input-equivalent primary energy (i.e. substitution method). - primary_energy[f"{source} (% equivalent primary energy)"] = ( - primary_energy[f"{source} (EJ)"] / primary_energy["Primary energy (EJ - equivalent)"] * 100 - ) - - for source in DIRECT_AND_EQUIVALENT_ENERGY: - # Calculate each source as share of direct primary energy. - primary_energy[f"{source} (% direct primary energy)"] = ( - primary_energy[f"{source} (TWh - direct)"] / primary_energy["Primary energy (TWh - direct)"] * 100 - ) - # Calculate each source as share of input-equivalent primary energy (i.e. substitution method). - primary_energy[f"{source} (% equivalent primary energy)"] = ( - primary_energy[f"{source} (EJ - equivalent)"] / primary_energy["Primary energy (EJ - equivalent)"] * 100 - ) - - return primary_energy - - -def calculate_primary_energy_annual_change( - primary_energy: pd.DataFrame, -) -> pd.DataFrame: - """Calculate annual change of (direct or direct and input-equivalent) primary energy for each energy source. - - Parameters - ---------- - primary_energy : pd.DataFrame - BP data. - - Returns - ------- - primary_energy : pd.DataFrame - BP data after adding annual changes. - - """ - primary_energy = primary_energy.copy() - - # Calculate annual change in each source. - primary_energy = primary_energy.sort_values(["Country", "Year"]).reset_index(drop=True) - for source in ONLY_DIRECT_ENERGY: - # Create column for source percentage growth as a function of direct primary energy. - primary_energy[f"{source} (% growth)"] = primary_energy.groupby("Country")[f"{source} (TWh)"].pct_change() * 100 - # Create column for source absolute growth as a function of direct primary energy. - primary_energy[f"{source} (TWh growth)"] = primary_energy.groupby("Country")[f"{source} (TWh)"].diff() - - for source in DIRECT_AND_EQUIVALENT_ENERGY: - # Create column for source percentage growth as a function of primary energy - # (as a percentage, it is irrelevant whether it is direct or equivalent). - primary_energy[f"{source} (% growth)"] = ( - primary_energy.groupby("Country")[f"{source} (TWh - direct)"].pct_change() * 100 - ) - # Create column for source absolute growth as a function of direct primary energy. - primary_energy[f"{source} (TWh growth - direct)"] = primary_energy.groupby("Country")[ - f"{source} (TWh - direct)" - ].diff() - # Create column for source absolute growth as a function of input-equivalent primary energy. - primary_energy[f"{source} (TWh growth - equivalent)"] = primary_energy.groupby("Country")[ - f"{source} (TWh - equivalent)" - ].diff() - - return primary_energy - - -def add_per_capita_variables(primary_energy: pd.DataFrame) -> pd.DataFrame: - """Add per-capita variables. - - Parameters - ---------- - primary_energy : pd.DataFrame - BP data. - - Returns - ------- - primary_energy : pd.DataFrame - BP data after adding per-capita variables. 
- - """ - primary_energy = primary_energy.copy() - - primary_energy = add_population( - df=primary_energy, - country_col="Country", - year_col="Year", - population_col="Population", - warn_on_missing_countries=False, - ) - for source in ONLY_DIRECT_ENERGY: - primary_energy[f"{source} per capita (kWh)"] = ( - primary_energy[f"{source} (TWh)"] / primary_energy["Population"] * TWH_TO_KWH - ) - for source in DIRECT_AND_EQUIVALENT_ENERGY: - primary_energy[f"{source} per capita (kWh - direct)"] = ( - primary_energy[f"{source} (TWh - direct)"] / primary_energy["Population"] * TWH_TO_KWH - ) - primary_energy[f"{source} per capita (kWh - equivalent)"] = ( - primary_energy[f"{source} (TWh - equivalent)"] / primary_energy["Population"] * TWH_TO_KWH - ) - - # Drop unnecessary column. - primary_energy = primary_energy.drop(columns=["Population"]) - - return primary_energy - - -def prepare_output_table(primary_energy: pd.DataFrame) -> catalog.Table: - """Create a table with the processed data, ready to be in a garden dataset and to be uploaded to grapher (although - additional metadata may need to be added to the table). - - Parameters - ---------- - primary_energy : pd.DataFrame - Processed BP data. - - Returns - ------- - table : catalog.Table - Table, ready to be added to a new garden dataset. - - """ - # Keep only columns in TWh (and not EJ or PJ). - table = catalog.Table(primary_energy).drop( - errors="raise", - columns=[column for column in primary_energy.columns if (("(EJ" in column) or ("(PJ" in column))], - ) - - # Replace spurious inf values by nan. - table = table.replace([np.inf, -np.inf], np.nan) - - # Sort conveniently and add an index. - table = ( - table.sort_values(["Country", "Year"]) - .reset_index(drop=True) - .set_index(["Country", "Year"], verify_integrity=True) - .astype({"Country code": "category"}) - ) - - # Add metadata (e.g. unit) to each column. - # Define unit names (these are the long and short unit names that will be shown in grapher). - # The keys of the dictionary should correspond to units expected to be found in each of the variable names in table. - short_unit_to_unit = { - "TWh": "terawatt-hours", - "kWh": "kilowatt-hours", - "%": "%", - } - # Define number of decimal places to show (only relevant for grapher, not for the data). - short_unit_to_num_decimals = { - "TWh": 0, - "kWh": 0, - } - for column in table.columns: - table[column].metadata.title = column - for short_unit in ["TWh", "kWh", "%"]: - if short_unit in column: - table[column].metadata.short_unit = short_unit - table[column].metadata.unit = short_unit_to_unit[short_unit] - table[column].metadata.display = {} - if short_unit in short_unit_to_num_decimals: - table[column].metadata.display["numDecimalPlaces"] = short_unit_to_num_decimals[short_unit] - # Add the variable name without unit (only relevant for grapher). - table[column].metadata.display["name"] = column.split(" (")[0] - - table = catalog.utils.underscore_table(table) - - return table - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load the latest BP statistical review. - bp_dataset_path = DATA_DIR / "garden" / STAT_REVIEW_NAMESPACE / STAT_REVIEW_VERSION / STAT_REVIEW_SHORT_NAME - bp_dataset = catalog.Dataset(bp_dataset_path) - bp_table = bp_dataset[bp_dataset.table_names[0]] - - # - # Process data. - # - # Get a dataframe out of the BP table. - primary_energy = get_bp_data(bp_table=bp_table) - - # Calculate direct and primary energy using the substitution method. 
- primary_energy = calculate_direct_primary_energy(primary_energy=primary_energy) - primary_energy = calculate_equivalent_primary_energy(primary_energy=primary_energy) - - # Calculate share of (direct and sub-method) primary energy. - primary_energy = calculate_share_of_primary_energy(primary_energy=primary_energy) - - # Calculate annual change of primary energy. - primary_energy = calculate_primary_energy_annual_change(primary_energy) - - # Add per-capita variables. - primary_energy = add_per_capita_variables(primary_energy=primary_energy) - - # Prepare output data in a convenient way. - table = prepare_output_table(primary_energy) - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = catalog.Dataset.create_empty(dest_dir) - - # Add table to dataset. - table.metadata.short_name = "energy_mix" - ds_garden.add(table) - - # Update dataset and table metadata using yaml file. - ds_garden.update_metadata(METADATA_FILE_PATH) - - # Save dataset. - ds_garden.save() diff --git a/etl/steps/archive/garden/bp/2022-12-28/fossil_fuel_reserves_production_ratio.meta.yml b/etl/steps/archive/garden/bp/2022-12-28/fossil_fuel_reserves_production_ratio.meta.yml deleted file mode 100644 index 4ba5778392c..00000000000 --- a/etl/steps/archive/garden/bp/2022-12-28/fossil_fuel_reserves_production_ratio.meta.yml +++ /dev/null @@ -1,53 +0,0 @@ -dataset: - namespace: bp - version: 2022-12-28 - title: Fossil fuel reserves/production ratio (BP, 2022) - short_name: fossil_fuel_reserves_production_ratio - description: | - The Reserves-to-Production (R/P) Ratio measures the number of years of fuel supplies left based on current annual consumption rates. Note that this can change through time through the discovery of new fuel reserves, and increases in annual consumption. - sources: - - - name: Our World in Data based on BP Statistical Review of World Energy (2022) - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - -tables: - fossil_fuel_reserves_production_ratio: - variables: - coal_left: - title: Coal reserves/production ratio - short_unit: years - unit: years - coal_production: - title: Coal annual production - short_unit: t - unit: tonnes - coal_reserves: - title: Global reserves of coal - short_unit: t - unit: tonnes - gas_left: - title: Gas reserves/production ratio - short_unit: years - unit: years - gas_production: - title: Gas annual production - short_unit: cubic meters - unit: cubic meters - gas_reserves: - title: Global reserves of gas - short_unit: cubic meters - unit: cubic meters - oil_left: - title: Oil reserves/production ratio - short_unit: years - unit: years - oil_production: - title: Oil annual production - short_unit: t - unit: tonnes - oil_reserves: - title: Global reserves of oil - short_unit: t - unit: tonnes diff --git a/etl/steps/archive/garden/bp/2022-12-28/fossil_fuel_reserves_production_ratio.py b/etl/steps/archive/garden/bp/2022-12-28/fossil_fuel_reserves_production_ratio.py deleted file mode 100644 index ade2a36f011..00000000000 --- a/etl/steps/archive/garden/bp/2022-12-28/fossil_fuel_reserves_production_ratio.py +++ /dev/null @@ -1,101 +0,0 @@ -import pandas as pd -from owid import catalog -from shared import CURRENT_DIR - -from etl.paths import DATA_DIR - -# Details for dataset to export. 
-DATASET_SHORT_NAME = "fossil_fuel_reserves_production_ratio" -DATASET_TITLE = "Fossil fuel reserves/production ratio" -METADATA_FILE_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.meta.yml" -# Path to BP statistical review datset to import. -BP_DATASET_PATH = DATA_DIR / "garden/bp/2022-12-28/statistical_review" - - -def prepare_bp_data(tb_bp: catalog.Table) -> catalog.Table: - # Prepare BP data. - columns = { - "country": "country", - "year": "year", - "coal__reserves__total": "coal_reserves", - "coal_production__tonnes": "coal_production", - "oil__proved_reserves": "oil_reserves", - "oil_production__barrels": "oil_production", - "gas__proved_reserves": "gas_reserves", - "gas_production__bcm": "gas_production", - } - df_bp = pd.DataFrame(tb_bp).reset_index()[list(columns)].rename(columns=columns) - - # Select only global data. - df_bp = df_bp[df_bp["country"] == "World"].reset_index(drop=True) - - # Check that the units are the expected ones. - assert tb_bp["coal__reserves__total"].metadata.unit == "Million tonnes" - assert tb_bp["coal_production__tonnes"].metadata.unit == "Million tonnes" - # WARNING: Here the "unit" metadata field seems to be wrong, it should be billion barrels. - assert tb_bp["oil__proved_reserves"].metadata.unit == "Barrels" - assert tb_bp["oil_production__barrels"].metadata.unit == "Thousand barrels per day" - assert tb_bp["gas__proved_reserves"].metadata.unit == "Trillion cubic metres" - assert tb_bp["gas_production__bcm"].metadata.unit == "Billion cubic metres" - - # Convert to tonnes. - # Million tonnes to tonnes. - df_bp["coal_reserves"] *= 1e6 - # Million tonnes to tonnes. - df_bp["coal_production"] *= 1e6 - # Billion barrels to tonnes. - df_bp["oil_reserves"] *= 1e9 * 0.1364 - # Thousand barrels per day to tonnes per year. - df_bp["oil_production"] *= 1e3 * 365 * 0.1364 - # Trillion cubic meters to cubic meters. - df_bp["gas_reserves"] *= 1e12 - # Billion cubic meters to cubic meters. - df_bp["gas_production"] *= 1e9 - - # Create columns for reserves-production ratio (measured in years of fossil fuels left). - df_bp["coal_left"] = df_bp["coal_reserves"] / df_bp["coal_production"] - df_bp["oil_left"] = df_bp["oil_reserves"] / df_bp["oil_production"] - df_bp["gas_left"] = df_bp["gas_reserves"] / df_bp["gas_production"] - - # Set index, drop rows that only have nans, and sort conveniently. - df_bp = ( - df_bp.set_index(["country", "year"], verify_integrity=True).dropna(how="all").sort_index().sort_index(axis=1) - ) - - # Create a new table. - tb = catalog.Table(df_bp, underscore=True) - - return tb - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read all required datasets. - ds_bp = catalog.Dataset(BP_DATASET_PATH) - - # Gather all required tables from all datasets. - tb_bp = ds_bp[ds_bp.table_names[0]] - - # - # Process data. - # - # Prepare BP data. - tb = prepare_bp_data(tb_bp=tb_bp) - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = catalog.Dataset.create_empty(dest_dir) - - # Add table to dataset. - tb.metadata.short_name = "fossil_fuel_reserves_production_ratio" - ds_garden.add(tb) - - # Update dataset and table metadata using yaml file. - ds_garden.update_metadata(METADATA_FILE_PATH) - - # Save dataset. 
- ds_garden.save() diff --git a/etl/steps/archive/garden/bp/2022-12-28/shared.py b/etl/steps/archive/garden/bp/2022-12-28/shared.py deleted file mode 100644 index 309625fdca7..00000000000 --- a/etl/steps/archive/garden/bp/2022-12-28/shared.py +++ /dev/null @@ -1,535 +0,0 @@ -from pathlib import Path -from typing import Dict, List, Optional, Union, cast - -import numpy as np -import pandas as pd -from owid import catalog -from structlog import get_logger - -from etl.data_helpers import geo -from etl.paths import DATA_DIR - -log = get_logger() - -CURRENT_DIR = Path(__file__).parent -VERSION = CURRENT_DIR.name - -# Aggregate regions to add, following OWID definitions. -REGIONS_TO_ADD = { - "North America": { - "country_code": "OWID_NAM", - }, - "South America": { - "country_code": "OWID_SAM", - }, - "Europe": { - "country_code": "OWID_EUR", - }, - # The EU27 is already included in the original BP data, with the same definition as OWID. - # "European Union (27)": { - # "country_code": "OWID_EU27", - # }, - "Africa": { - "country_code": "OWID_AFR", - }, - "Asia": { - "country_code": "OWID_ASI", - }, - "Oceania": { - "country_code": "OWID_OCE", - }, - "Low-income countries": { - "country_code": "OWID_LIC", - }, - "Upper-middle-income countries": { - "country_code": "OWID_UMC", - }, - "Lower-middle-income countries": { - "country_code": "OWID_LMC", - }, - "High-income countries": { - "country_code": "OWID_HIC", - }, -} - -# We need to include the 'Other * (BP)' regions, otherwise continents have incomplete data. -# For example, when constructing the aggregate for Africa, we need to include 'Other Africa (BP)'. -# Otherwise we would be underestimating the region's total contribution. -ADDITIONAL_COUNTRIES_IN_REGIONS = { - "Africa": [ - # Additional African regions in BP's data (e.g. 'Other Western Africa (BP)') seem to be included in - # 'Other Africa (BP)', therefore we ignore them when creating aggregates. - "Other Africa (BP)", - ], - "Asia": [ - # Adding 'Other Asia Pacific (BP)' may include areas of Oceania in Asia. - # However, it seems that this region is usually significantly smaller than Asia. - # So, we are possibly overestimating Asia, but not by a significant amount. - "Other Asia Pacific (BP)", - # Similarly, adding 'Other CIS (BP)' in Asia may include areas of Europe in Asia (e.g. Moldova). - # However, since most countries in 'Other CIS (BP)' are Asian, adding it is more accurate than not adding it. - "Other CIS (BP)", - # Countries defined by BP in 'Middle East' are fully included in OWID's definition of Asia. - "Other Middle East (BP)", - ], - "Europe": [ - "Other Europe (BP)", - ], - "North America": [ - "Other Caribbean (BP)", - "Other North America (BP)", - ], - "South America": [ - "Other South America (BP)", - ], - # Given that 'Other Asia and Pacific (BP)' is often similar or even larger than Oceania, we avoid including it in - # Oceania (and include it in Asia, see comment above). - # This means that we may be underestimating Oceania by a significant amount, but BP does not provide unambiguous - # data to avoid this. - "Oceania": [], -} - -# When creating region aggregates, decide how to distribute historical regions. -# The following decisions are based on the current location of the countries that succeeded the region, and their income -# group. Continent and income group assigned corresponds to the continent and income group of the majority of the -# population in the member countries. 
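As a rough illustration of the majority-population rule described in the comment above, the following sketch (with made-up population figures and a hypothetical helper, not values or code from any dataset) picks the continent and income group that account for most of the successors' combined population:

from collections import Counter

# Hypothetical successor countries: (continent, income group, population); illustrative numbers only.
SUCCESSORS = {
    "Russia": ("Europe", "Upper-middle-income countries", 144e6),
    "Ukraine": ("Europe", "Lower-middle-income countries", 44e6),
    "Uzbekistan": ("Asia", "Lower-middle-income countries", 34e6),
}

def majority_assignment(successors):
    continent_population = Counter()
    income_group_population = Counter()
    for continent, income_group, population in successors.values():
        continent_population[continent] += population
        income_group_population[income_group] += population
    # The historical region inherits whichever continent and income group hold most of the combined population.
    return continent_population.most_common(1)[0][0], income_group_population.most_common(1)[0][0]

print(majority_assignment(SUCCESSORS))  # -> ('Europe', 'Upper-middle-income countries')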
-HISTORIC_TO_CURRENT_REGION: Dict[str, Dict[str, Union[str, List[str]]]] = { - "Netherlands Antilles": { - "continent": "North America", - "income_group": "High-income countries", - "members": [ - # North America - High-income countries. - "Aruba", - "Curacao", - "Sint Maarten (Dutch part)", - ], - }, - "USSR": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "members": [ - # Europe - High-income countries. - "Lithuania", - "Estonia", - "Latvia", - # Europe - Upper-middle-income countries. - "Moldova", - "Belarus", - "Russia", - # Europe - Lower-middle-income countries. - "Ukraine", - # Asia - Upper-middle-income countries. - "Georgia", - "Armenia", - "Azerbaijan", - "Turkmenistan", - "Kazakhstan", - # Asia - Lower-middle-income countries. - "Kyrgyzstan", - "Uzbekistan", - "Tajikistan", - ], - }, -} - - -def load_population() -> pd.DataFrame: - """Load OWID population dataset, and add historical regions to it. - - Returns - ------- - population : pd.DataFrame - Population dataset. - - """ - # Load population dataset. - population = catalog.Dataset(DATA_DIR / "garden/owid/latest/key_indicators/")["population"].reset_index()[ - ["country", "year", "population"] - ] - - # Add data for historical regions (if not in population) by adding the population of their current successors. - countries_with_population = population["country"].unique() - missing_countries = [country for country in HISTORIC_TO_CURRENT_REGION if country not in countries_with_population] - for country in missing_countries: - members = HISTORIC_TO_CURRENT_REGION[country]["members"] - _population = ( - population[population["country"].isin(members)] - .groupby("year") - .agg({"population": "sum", "country": "nunique"}) - .reset_index() - ) - # Select only years for which we have data for all member countries. - _population = _population[_population["country"] == len(members)].reset_index(drop=True) - _population["country"] = country - population = pd.concat([population, _population], ignore_index=True).reset_index(drop=True) - - error = "Duplicate country-years found in population. Check if historical regions changed." - assert population[population.duplicated(subset=["country", "year"])].empty, error - - return cast(pd.DataFrame, population) - - -def load_income_groups() -> pd.DataFrame: - """Load dataset of income groups and add historical regions to it. - - Returns - ------- - income_groups : pd.DataFrame - Income groups data. - - """ - # Load the World Bank dataset for income groups. - income_groups = catalog.Dataset(DATA_DIR / "garden/wb/2021-07-01/wb_income")["wb_income_group"].reset_index() - - # Add historical regions to income groups. - for historic_region in HISTORIC_TO_CURRENT_REGION: - historic_region_income_group = HISTORIC_TO_CURRENT_REGION[historic_region]["income_group"] - if historic_region not in income_groups["country"].values: - historic_region_df = pd.DataFrame( - { - "country": [historic_region], - "income_group": [historic_region_income_group], - } - ) - income_groups = pd.concat([income_groups, historic_region_df], ignore_index=True) - - return cast(pd.DataFrame, income_groups) - - -def add_population( - df: pd.DataFrame, - country_col: str = "country", - year_col: str = "year", - population_col: str = "population", - warn_on_missing_countries: bool = True, - show_full_warning: bool = True, -) -> pd.DataFrame: - """Add a column of OWID population to the countries in the data, including population of historical regions. 
- - This function has been adapted from datautils.geo, because population currently does not include historic regions. - We include them in this function. - - Parameters - ---------- - df : pd.DataFrame - Data without a column for population (after harmonizing elements, items and country names). - country_col : str - Name of country column in data. - year_col : str - Name of year column in data. - population_col : str - Name for new population column in data. - warn_on_missing_countries : bool - True to warn if population is not found for any of the countries in the data. - show_full_warning : bool - True to show affected countries if the previous warning is raised. - - Returns - ------- - df_with_population : pd.DataFrame - Data after adding a column for population for all countries in the data. - - """ - - # Load population dataset. - population = load_population().rename( - columns={ - "country": country_col, - "year": year_col, - "population": population_col, - } - )[[country_col, year_col, population_col]] - - # Check if there is any missing country. - missing_countries = set(df[country_col]) - set(population[country_col]) - if len(missing_countries) > 0: - if warn_on_missing_countries: - geo.warn_on_list_of_entities( - list_of_entities=missing_countries, - warning_message=( - f"{len(missing_countries)} countries not found in population" - " dataset. They will remain in the dataset, but have nan" - " population." - ), - show_list=show_full_warning, - ) - - # Add population to original dataframe. - df_with_population = pd.merge(df, population, on=[country_col, year_col], how="left") - - return df_with_population - - -def detect_overlapping_data_for_regions_and_members( - df: pd.DataFrame, - index_columns: List[str], - regions_and_members: Dict[str, Dict[str, Union[str, List[str]]]], - known_overlaps: Optional[List[Dict[str, Union[str, List[int]]]]], - ignore_zeros: bool = True, -) -> None: - """Raise a warning if there is data for a particular region and for a country that is a member of that region. - - For example, if there is data for USSR and Russia on the same years, a warning will be raised. - - Parameters - ---------- - df : pd.DataFrame - Data. - index_columns : list - Names of columns that should be index of the data. - regions_and_members : dict - Regions and members (where each key corresponds to a region, and each region is a dictionary of various keys, - one of which is 'members', which is a list of member countries). - known_overlaps : list or None - Instances of known overlaps in the data. If this function raises a warning, new instances should be added to the - list. - ignore_zeros : bool - True to consider zeros in the data as missing values. Doing this, if a region has overlapping data with a member - country, but one of their data points is zero, it will not be considered an overlap. - - """ - if known_overlaps is not None: - df = df.copy() - - if ignore_zeros: - # Replace zeros by nans, so that zeros are ignored when looking for overlapping data. - overlapping_values_to_ignore = [0] - else: - overlapping_values_to_ignore = [] - - regions = list(regions_and_members) - for region in regions: - # Create a dataframe with only data for the region, and remove columns that only have nans. - # Optionally, replace zeros by nans, to also remove columns that only have zeros or nans. 
- region_df = ( - df[df["country"] == region].replace(overlapping_values_to_ignore, np.nan).dropna(axis=1, how="all") - ) - members = regions_and_members[region]["members"] - for member in members: - # Create a dataframe for this particular member country. - member_df = ( - df[df["country"] == member].replace(overlapping_values_to_ignore, np.nan).dropna(axis=1, how="all") - ) - # Find common columns with (non-nan) data between region and member country. - variables = [ - column - for column in (set(region_df.columns) & set(member_df.columns)) - if column not in index_columns - ] - for variable in variables: - # Concatenate region and member country's data for this variable. - combined = ( - pd.concat( - [ - region_df[["year", variable]], - member_df[["year", variable]], - ], - ignore_index=True, - ) - .dropna() - .reset_index(drop=True) - ) - # Find years where region and member country overlap. - overlapping = combined[combined.duplicated(subset="year")] - if not overlapping.empty: - overlapping_years = sorted(set(overlapping["year"])) - new_overlap = { - "region": region, - "member": member, - "years": overlapping_years, - "variable": variable, - } - # Check if the overlap found is already in the list of known overlaps. - # If this overlap is not known, raise a warning. - # Omit the field "entity_to_make_nan" when checking if this overlap is known. - _known_overlaps = [ - {key: value for key, value in overlap.items() if key != "entity_to_make_nan"} for overlap in known_overlaps - ] - if new_overlap not in _known_overlaps: - log.warning( - f"Data for '{region}' overlaps with '{member}' on '{variable}' " - f"and years: {overlapping_years}" - ) - - -def remove_overlapping_data_for_regions_and_members( - df: pd.DataFrame, - known_overlaps: Optional[List[Dict[str, Union[str, List[int]]]]], - country_col: str = "country", - year_col: str = "year", - ignore_zeros: bool = True, -) -> pd.DataFrame: - """Check that the known overlaps between a region (e.g. a historical region like the USSR) and a member country (or - a successor country, like Russia) are indeed present in the data, and remove them from the data. - - Parameters - ---------- - df : pd.DataFrame - Data. - known_overlaps : list or None - List of known overlaps between region and member country. - country_col : str - Name of country column. - year_col : str - Name of year column. - ignore_zeros : bool - True to ignore columns of zeros when checking if known overlaps are indeed overlaps. - - Returns - ------- - df : pd.DataFrame - Data after removing known overlapping rows between a region and a member country. - - """ - if known_overlaps is not None: - df = df.copy() - - if ignore_zeros: - overlapping_values_to_ignore = [0] - else: - overlapping_values_to_ignore = [] - - for i, overlap in enumerate(known_overlaps): - if set([overlap["region"], overlap["member"]]) <= set(df["country"]): - # Check that the known overlap is indeed found in the data. - duplicated_rows = ( - df[(df[country_col].isin([overlap["region"], overlap["member"]]))][ - [country_col, year_col, overlap["variable"]] - ] - .replace(overlapping_values_to_ignore, np.nan) - .dropna(subset=overlap["variable"]) - ) - duplicated_rows = duplicated_rows[duplicated_rows.duplicated(subset="year", keep=False)] - overlapping_years = sorted(set(duplicated_rows["year"])) - if overlapping_years != overlap["years"]: - log.warning(f"Known overlap number {i} was not found in the data; redefine this list.") - # Make nan data points for either the region or the member (which is specified by "entity to make nan").
- indexes_to_make_nan = duplicated_rows[ - duplicated_rows["country"] == overlap[overlap["entity_to_make_nan"]] # type: ignore - ].index.tolist() - df.loc[indexes_to_make_nan, overlap["variable"]] = np.nan - - return df - - -def load_countries_in_regions() -> Dict[str, List[str]]: - """Create a dictionary of regions (continents and income groups) and their member countries. - - Regions to include are defined above, in REGIONS_TO_ADD. - Additional countries are added to regions following the definitions in ADDITIONAL_COUNTRIES_IN_REGIONS. - - Returns - ------- - countries_in_regions : dict - Dictionary of regions, where the value is a list of member countries in the region. - - """ - # Load income groups. - income_groups = load_income_groups() - - countries_in_regions = {} - for region in list(REGIONS_TO_ADD): - # Add default OWID list of countries in region (which includes historical regions). - countries_in_regions[region] = geo.list_countries_in_region(region=region, income_groups=income_groups) - - # Include additional countries in the region (if any are given). - for region in ADDITIONAL_COUNTRIES_IN_REGIONS: - countries_in_regions[region] = countries_in_regions[region] + ADDITIONAL_COUNTRIES_IN_REGIONS[region] - - return countries_in_regions - - -def add_region_aggregates( - data: pd.DataFrame, - regions: List[str], - index_columns: List[str], - country_column: str = "country", - year_column: str = "year", - aggregates: Optional[Dict[str, str]] = None, - known_overlaps: Optional[List[Dict[str, Union[str, List[int]]]]] = None, - region_codes: Optional[List[str]] = None, - country_code_column: str = "country_code", -) -> pd.DataFrame: - """Add region aggregates for all regions (which may include continents and income groups). - - Parameters - ---------- - data : pd.DataFrame - Data. - regions : list - Regions to include. - index_columns : list - Names of index columns. - country_column : str - Name of country column. - year_column : str - Name of year column. - aggregates : dict or None - Dictionary of the type of aggregation to use for each variable. If None, variables will be aggregated by summing. - known_overlaps : list or None - List of known overlaps between regions and their member countries. - region_codes : list or None - List of country codes for each new region. It must have the same number of elements, and in the same order, as - the 'regions' argument. - country_code_column : str - Name of country codes column (only relevant if region_codes is not None). - - Returns - ------- - data : pd.DataFrame - Data after adding aggregate regions. - - """ - data = data.copy() - - if aggregates is None: - # If aggregations are not specified, assume all variables are to be aggregated, by summing. - aggregates = {column: "sum" for column in data.columns if column not in index_columns} - # Get the list of regions to create, and their member countries. - countries_in_regions = load_countries_in_regions() - for region in regions: - # List of countries in region. - countries_in_region = countries_in_regions[region] - # Select rows of data for member countries. - data_region = data[data[country_column].isin(countries_in_region)] - # Remove any known overlaps between regions (e.g. USSR, which is a historical region) in the current region (e.g. - # Europe) and their member countries (or successor countries, like Russia). - # If any overlap in known_overlaps is not found, a warning will be raised.
- data_region = remove_overlapping_data_for_regions_and_members(df=data_region, known_overlaps=known_overlaps) - - # Check that there are no other overlaps in the data (after having removed the known ones). - detect_overlapping_data_for_regions_and_members( - df=data_region, - regions_and_members=HISTORIC_TO_CURRENT_REGION, - index_columns=index_columns, - known_overlaps=known_overlaps, - ) - - # Add region aggregates. - data_region = geo.add_region_aggregates( - df=data_region, - region=region, - country_col=country_column, - year_col=year_column, - aggregations=aggregates, - countries_in_region=countries_in_region, - countries_that_must_have_data=[], - # Here we allow aggregating even when there are few countries informed (which seems to agree with BP's - # criterion for aggregates). - # However, if absolutely all countries have nan, we want the aggregate to be nan, not zero. - frac_allowed_nans_per_year=0.999, - num_allowed_nans_per_year=None, - ) - data = pd.concat( - [data, data_region[data_region[country_column] == region]], - ignore_index=True, - ).reset_index(drop=True) - - if region_codes is not None: - # Add region codes to regions. - if data[country_code_column].dtype == "category": - data[country_code_column] = data[country_code_column].cat.add_categories(region_codes) - for i, region in enumerate(regions): - data.loc[data[country_column] == region, country_code_column] = region_codes[i] - - return data diff --git a/etl/steps/archive/garden/bp/2022-12-28/statistical_review.meta.yml b/etl/steps/archive/garden/bp/2022-12-28/statistical_review.meta.yml deleted file mode 100644 index dc37f21437f..00000000000 --- a/etl/steps/archive/garden/bp/2022-12-28/statistical_review.meta.yml +++ /dev/null @@ -1,44 +0,0 @@ -dataset: - namespace: bp - version: 2022-12-28 - title: Statistical Review of World Energy (BP, 2022) - short_name: statistical_review - sources: - - - name: Our World in Data based on BP Statistical Review of World Energy (2022) - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - description: | - BP's region definitions sometimes differ from Our World in Data's definitions. For example, BP's North America includes only Canada, Mexico and United States, whereas Our World in Data's North America includes countries in Central America (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like "North America (BP)" to refer to BP's original data using their definition of the region, as well as "North America", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the contributions from the countries in the region. - - [BP's region definitions](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy/using-the-review/definitions-and-explanatory-notes.html#accordion_Regional%20definitions), denoted with "(BP)", are: - * "Asia Pacific (BP)": Brunei, Cambodia, China (Mainland), China Hong Kong SAR (Special Administrative Region), China Macau SAR (Special Administrative Region), Indonesia, Japan, Laos, Malaysia, Mongolia, North Korea, Philippines, Singapore, South Asia (Afghanistan, Bangladesh, India, Myanmar, Nepal, Pakistan and Sri Lanka), South Korea, Taiwan, Thailand, Vietnam, Australia, New Zealand, Papua New Guinea and Oceania. 
- * "Australasia (BP)": Australia, New Zealand. - * "CIS (BP)" - Commonwealth of Independent States: Armenia, Azerbaijan, Belarus, Kazakhstan, Kyrgyzstan, Moldova, Russian Federation, Tajikistan, Turkmenistan, Uzbekistan. - * "Caribbean (BP)": Atlantic islands between the US Gulf Coast and South America, including Puerto Rico, US Virgin Islands and Bermuda. - * "Central America (BP)": Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama - * "Eastern Africa (BP)": Territories on the east coast of Africa from Sudan to Mozambique. Also Madagascar, Malawi, Uganda, Zambia, Zimbabwe. - * "Europe (BP)": European members of the OECD plus Albania, Bosnia-Herzegovina, Bulgaria, Croatia, Cyprus, Georgia, Gibraltar, Latvia, Lithuania, Malta, Montenegro, North Macedonia, Romania, Serbia and Ukraine. - * "Middle Africa (BP)": Angola, Cameroon, Central African Republic, Chad, Democratic Republic of Congo, Republic of Congo, Equatorial Guinea, Gabon, Sao Tome & Principe. - * "Middle East (BP)": Arabian Peninsula, Iran, Iraq, Israel, Jordan, Lebanon, Syria. - * "Non-OECD (BP)" - Organization for Economic Co-operation and Development: All countries that are not members of the OECD. - * "North America (BP)": US (excluding US territories), Canada, Mexico - * "Northern Africa (BP)": Territories on the north coast of Africa from Egypt to Western Sahara. - * "OECD (BP)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, UK, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, US. - * "OPEC (BP)" - Organization of the Petroleum Exporting Countries: Iran, Iraq, Kuwait, Saudi Arabia, United Arab Emirates, Algeria, Libya, Angola, Equatorial Guinea, Gabon, Nigeria, Republic of Congo, Venezuela. - * "South and Central America (BP)": Caribbean (including Puerto Rico and US Virgin Islands), Bermuda, Central and South America. - * "Southern Africa (BP)": Botswana, Lesotho, Namibia, South Africa, Swaziland. - * "Western Africa (BP)": Territories on the west coast of Africa from Mauritania to Nigeria, including Burkina Faso, Cape Verde, Mali and Niger. - - Additionally, BP includes some regions that are not explicitly defined (e.g. "Other Europe", or "Other CIS"). We define our regions in the following way: - * "Africa" - All African countries + "Other Africa". - * "Asia" - All Asian countries + "Other Middle East" + "Other CIS" + "Other Asia Pacific". - * "Europe" - All European countries + "Other Europe". - * "North America" - All North American countries + "Other Caribbean" + "Other North America". - * "Oceania" - All Oceanian countries. - * "South America" - All South American countries + "Other South America". - Where the individual countries in each region are defined [in this map](https://ourworldindata.org/world-region-map-definitions). Additional BP regions are ignored, since they belong to other regions already included (e.g. the data for "Other Western Africa" is included in "Other Africa"). Finally, income groups are constructed following the definitions [in this map](https://ourworldindata.org/grapher/world-banks-income-groups). 
- -tables: - {} diff --git a/etl/steps/archive/garden/bp/2022-12-28/statistical_review.py b/etl/steps/archive/garden/bp/2022-12-28/statistical_review.py deleted file mode 100644 index 010e34edb55..00000000000 --- a/etl/steps/archive/garden/bp/2022-12-28/statistical_review.py +++ /dev/null @@ -1,370 +0,0 @@ -"""Process the BP Statistical Review of World Energy 2022. - -For the moment, this dataset is downloaded and processed by -https://github.com/owid/importers/tree/master/bp_statreview - -However, in this additional step we add region aggregates following OWID definitions of regions. - -We use BP's "Other *" regions to create continent aggregates of each variable, but remove them afterwards, given that -these regions correspond to a different set of countries for different variables. - -""" - -from copy import deepcopy - -import numpy as np -import pandas as pd -from owid import catalog -from shared import ( - CURRENT_DIR, - HISTORIC_TO_CURRENT_REGION, - REGIONS_TO_ADD, - add_region_aggregates, -) - -from etl.helpers import PathFinder, create_dataset -from etl.paths import DATA_DIR - -P = PathFinder(__file__) - -# Namespace and short name for output dataset. -NAMESPACE = "bp" -# Path to metadata file for current dataset. -METADATA_FILE_PATH = CURRENT_DIR / "statistical_review.meta.yml" -# Original BP's Statistical Review dataset name in the OWID catalog (without the institution and year). -BP_CATALOG_NAME = "statistical_review_of_world_energy" -BP_BACKPORTED_DATASET_NAME = "dataset_5650_statistical_review_of_world_energy__bp__2022" -BP_NAMESPACE_IN_CATALOG = "bp_statreview" -BP_VERSION = 2022 -# Path to previous (processed) Statistical Review dataset. -BP_DATASET_OLD_PATH = DATA_DIR / "garden" / "bp" / "2022-07-11" / "statistical_review" - -# List of known overlaps between regions and member countries (or successor countries). -OVERLAPPING_DATA_TO_REMOVE_IN_AGGREGATES = [ - { - "region": "USSR", - "member": "Russia", - "entity_to_make_nan": "region", - "years": [1991, 1992, 1993, 1994, 1995, 1996], - "variable": "Gas - Proved reserves", - } -] - -# In the new dataset there are many missing values that were not missing in the previous release. -# We use data from the previous dataset to fill those gaps. -# However, we avoid certain countries and regions: -# * All region aggregates, since aggregates each year may include data from different countries. -# * USSR and its successor countries, since, in the old dataset, there is plenty of overlap between them (which does -# not happen in the new release). -COUNTRIES_TO_AVOID_WHEN_FILLING_NANS_WITH_PREVIOUS_RELEASE = ( - list(REGIONS_TO_ADD) + ["USSR"] + HISTORIC_TO_CURRENT_REGION["USSR"]["members"] # type: ignore -) - -# True to ignore zeros when checking for overlaps between regions and member countries. -# This means that, if a region (e.g. USSR) and a member country or successor country (e.g. Russia) overlap, but in a -# variable that only has zeros, it will not be considered an overlap. -IGNORE_ZEROS_WHEN_CHECKING_FOR_OVERLAPPING_DATA = True - -# Variables that can be summed when constructing region aggregates. -# Biofuels in Africa have a non-zero total, while there is no contribution from African countries. -# This would cause our aggregate for 'Africa' to be zero, while the original 'Africa (BP)' is not. -# Also, biodiesels are only given for continents and a few countries. -# For this reason we avoid creating aggregates for biofuels and biodiesels.
-AGGREGATES_BY_SUM = [ - "Carbon Dioxide Emissions", - "Coal - Reserves - Anthracite and bituminous", - "Coal - Reserves - Sub-bituminous and lignite", - "Coal - Reserves - Total", - "Coal Consumption - EJ", - "Coal Consumption - TWh", - "Coal Production - EJ", - "Coal Production - TWh", - "Coal Production - Tonnes", - "Cobalt Production-Reserves", - "Elec Gen from Coal", - "Elec Gen from Gas", - "Elec Gen from Oil", - "Electricity Generation", - "Gas - Proved reserves", - "Gas Consumption - Bcf", - "Gas Consumption - Bcm", - "Gas Consumption - EJ", - "Gas Consumption - TWh", - "Gas Production - Bcf", - "Gas Production - Bcm", - "Gas Production - EJ", - "Gas Production - TWh", - "Geo Biomass Other - EJ", - "Geo Biomass Other - TWh", - "Graphite Production-Reserves", - "Hydro Consumption - EJ", - "Hydro Consumption - TWh", - "Hydro Generation - TWh", - "Lithium Production-Reserves", - "Nuclear Consumption - EJ", - "Nuclear Consumption - TWh", - "Nuclear Generation - TWh", - "Oil - Proved reserves", - "Oil - Refinery throughput", - "Oil - Refining capacity", - "Oil Consumption - Barrels", - "Oil Consumption - EJ", - "Oil Consumption - TWh", - "Oil Consumption - Tonnes", - "Oil Production - Barrels", - "Oil Production - Crude Conds", - "Oil Production - NGLs", - "Oil Production - TWh", - "Oil Production - Tonnes", - "Primary Energy Consumption - EJ", - "Primary Energy Consumption - TWh", - "Renewables Consumption - EJ", - "Renewables Consumption - TWh", - "Renewables Power - EJ", - "Renewables power - TWh", - "Solar Capacity", - "Solar Consumption - EJ", - "Solar Consumption - TWh", - "Solar Generation - TWh", - "Total Liquids - Consumption", - "Wind Capacity", - "Wind Consumption - EJ", - "Wind Consumption - TWh", - "Wind Generation - TWh", - # 'Biofuels Consumption - Kboed - Total', - # 'Biofuels Consumption - Kboed - Biodiesel', - # 'Biofuels Consumption - PJ - Total', - # 'Biofuels Consumption - PJ - Biodiesel', - # 'Biofuels Consumption - TWh - Total', - # 'Biofuels Consumption - TWh - Biodiesel', - # 'Biofuels Consumption - TWh - Biodiesel (zero filled)', - # 'Biofuels Consumption - TWh - Total (zero filled)', - # 'Biofuels Production - Kboed - Total', - # 'Biofuels Production - PJ - Total', - # 'Biofuels Production - TWh - Total', - # 'Biofuels Production - Kboed - Biodiesel', - # 'Biofuels Production - PJ - Biodiesel', - # 'Biofuels Production - TWh - Biodiesel', - # 'Coal - Prices', - # 'Coal Consumption - TWh (zero filled)', - # 'Gas - Prices', - # 'Gas Consumption - TWh (zero filled)', - # 'Geo Biomass Other - TWh (zero filled)', - # 'Hydro Consumption - TWh (zero filled)', - # 'Nuclear Consumption - TWh (zero filled)', - # 'Oil - Crude prices since 1861 (2021 $)', - # 'Oil - Crude prices since 1861 (current $)', - # 'Oil - Spot crude prices', - # 'Oil Consumption - TWh (zero filled)', - # 'Primary Energy - Cons capita', - # 'Rare Earth Production-Reserves', - # 'Solar Consumption - TWh (zero filled)', - # 'Wind Consumption - TWh (zero filled)', -] - - -def prepare_output_table(df: pd.DataFrame, bp_table: catalog.Table) -> catalog.Table: - """Create a table with the processed data, ready to be in a garden dataset and to be uploaded to grapher (although - additional metadata may need to be added to the table). - - Parameters - ---------- - df : pd.DataFrame - Processed BP data. - bp_table : catalog.Table - Original table of BP statistical review data (used to transfer its metadata to the new table). 
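For reference, a rough sketch of the core transformation (hypothetical one-row frame; mirrors
the function body below; the exact snake-case names produced by underscore_table are approximate):

    df = pd.DataFrame({"country": ["Spain"], "year": [2020], "country_code": ["ESP"], "Gas - Proved reserves": [1.0]})
    table = catalog.Table(df)
    table = table.sort_values(["country", "year"]).reset_index(drop=True).set_index(["country", "year"], verify_integrity=True)
    table = catalog.utils.underscore_table(table)  # column names become snake case, e.g. roughly "gas__proved_reserves"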
- - Returns - ------- - table : catalog.Table - Table, ready to be added to a new garden dataset. - - """ - # Create new table. - table = catalog.Table(df).copy() - - # Replace spurious inf values by nan. - table = table.replace([np.inf, -np.inf], np.nan) - - # Sort conveniently and add an index. - table = ( - table.sort_values(["country", "year"]) - .reset_index(drop=True) - .set_index(["country", "year"], verify_integrity=True) - .astype({"country_code": "category"}) - ) - - # Convert column names to lower, snake case. - table = catalog.utils.underscore_table(table) - - # Get the table metadata from the original table. - table.metadata = deepcopy(bp_table.metadata) - - # Update table metadata. - table.metadata.title = "Statistical Review of World Energy" - table.metadata.short_name = "statistical_review" - - # Get the metadata of each variable from the original table. - for column in table.drop(columns="country_code").columns: - table[column].metadata = deepcopy(bp_table[column].metadata) - - return table - - -def fill_missing_values_with_previous_version(table: catalog.Table, table_old: catalog.Table) -> catalog.Table: - """Fill missing values in current data with values from the previous version of the dataset. - - Parameters - ---------- - table : catalog.Table - Processed data from current dataset. - table_old : catalog.Table - Processed data from previous dataset. - - Returns - ------- - combined : catalog.Table - Combined table, with data from the current release, after filling missing values with data from the previous - version of the dataset. - - """ - # Remove region aggregates from the old table. - table_old = table_old.reset_index().drop(columns="country_code") - table_old = ( - table_old[~table_old["country"].isin(COUNTRIES_TO_AVOID_WHEN_FILLING_NANS_WITH_PREVIOUS_RELEASE)] - .reset_index(drop=True) - .set_index(["country", "year"]) - ) - - # We should only merge on columns that exist in the new dataset. - # If we merge on all columns, we would get old columns from the old dataset that do not exist in the current one. - # This could be thought of as a positive outcome, but we avoid it because: - # * It would be misleading to claim that this data is from BP Statistical Review (2022), since it would be - # a mix of different releases. - # * By doing an outer join, some countries in the old dataset that may not be present in the current dataset - # may be added (e.g. Kenya and Ethiopia were present in the 2021 release because they had data for - # geothermal_capacity, but they are not included in the 2022 release, since they don't have data for any other - # variable). This could lead to unharmonized country names appearing in the current dataset. - # Combine the current output table with the table from the previous version of the dataset. - combined = pd.merge( - table, - table_old[[column for column in table_old.columns if column in table.columns]], - left_index=True, - right_index=True, - how="left", - suffixes=("", "_old"), - ) - - # List the common columns that can be filled with values from the previous version. - columns = [column for column in combined.columns if column.endswith("_old")] - - # Fill missing values in the current table with values from the old table. - for column_old in columns: - column = column_old.replace("_old", "") - combined[column] = combined[column].fillna(combined[column_old]) - - # Remove columns from the old table. - combined = combined.drop(columns=columns) - - # Transfer metadata from the table of the current dataset into the combined table.
- combined.metadata = deepcopy(table.metadata) - # When that is not possible (for columns that were only in the old but not in the new table), - # get the metadata from the old table. - - for column in combined.columns: - try: - combined[column].metadata = deepcopy(table[column].metadata) - except KeyError: - combined[column].metadata = deepcopy(table_old[column].metadata) - - # Sanity checks. - assert len(combined) == len(table) - assert set(table.columns) <= set(combined.columns) - - return combined - - -def amend_zero_filled_variables_for_region_aggregates(df: pd.DataFrame) -> pd.DataFrame: - """Fill the "* (zero filled)" variables (which were ignored when creating aggregates) with the new aggregate data, - and fill any remaining nans with zeros. - - Parameters - ---------- - df : pd.DataFrame - Data after having created region aggregates (which ignore '* (zero filled)' variables). - - Returns - ------- - df : pd.DataFrame - Data after amending zero filled variables for region aggregates. - - """ - df = df.copy() - - zero_filled_variables = [column for column in df.columns if "(zero filled)" in column] - original_variables = [column.replace(" (zero filled)", "") for column in df.columns if "(zero filled)" in column] - select_regions = df["country"].isin(REGIONS_TO_ADD) - df.loc[select_regions, zero_filled_variables] = df[select_regions][original_variables].fillna(0).values - - return df - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load table from latest BP dataset. - bp_ds: catalog.Dataset = P.load_dependency(BP_BACKPORTED_DATASET_NAME) - bp_table = bp_ds[BP_BACKPORTED_DATASET_NAME] - - # Load the previous version of the BP statistical review dataset, which will be used at the end to fill missing - # values in the current dataset. - bp_dataset_old = catalog.Dataset(BP_DATASET_OLD_PATH) - bp_table_old = bp_dataset_old[bp_dataset_old.table_names[0]] - - # - # Process data. - # - # Extract dataframe of BP data from table. - bp_data = ( - pd.DataFrame(bp_table) - .reset_index() - .rename(columns={column: bp_table[column].metadata.title for column in bp_table.columns}) - .rename(columns={"entity_name": "country", "entity_code": "country_code"}) - .drop(columns="entity_id") - ) - - # Add region aggregates. - df = add_region_aggregates( - data=bp_data, - regions=list(REGIONS_TO_ADD), - index_columns=["country", "year", "country_code"], - country_column="country", - year_column="year", - aggregates={column: "sum" for column in AGGREGATES_BY_SUM}, - known_overlaps=OVERLAPPING_DATA_TO_REMOVE_IN_AGGREGATES, # type: ignore - region_codes=[REGIONS_TO_ADD[region]["country_code"] for region in REGIONS_TO_ADD], - ) - - # Fill nans with zeros for "* (zero filled)" variables for region aggregates (which were ignored). - df = amend_zero_filled_variables_for_region_aggregates(df) - - # Remove "Other *" regions. They have been properly taken into account to create continent aggregates, but they - # represent different sets of countries for different variables. - # Therefore we remove them to avoid issues when combining different variables from these regions. - df = df[~df["country"].str.startswith("Other")].reset_index(drop=True) - - # Prepare output data in a convenient way. - table = prepare_output_table(df, bp_table) - - # Fill missing values in current table with values from the previous dataset, when possible. - table = fill_missing_values_with_previous_version(table=table, table_old=bp_table_old) - - # - # Save outputs. - # - # Create a new garden dataset.
- ds_garden = create_dataset(dest_dir, tables=[table], default_metadata=bp_ds.metadata) - - # Save dataset. - ds_garden.save() diff --git a/etl/steps/archive/garden/bp/2023-02-20/energy_mix.meta.yml b/etl/steps/archive/garden/bp/2023-02-20/energy_mix.meta.yml deleted file mode 100644 index 0ded5ea0272..00000000000 --- a/etl/steps/archive/garden/bp/2023-02-20/energy_mix.meta.yml +++ /dev/null @@ -1,89 +0,0 @@ -dataset: - namespace: bp - title: Energy mix (BP, 2023) - short_name: energy_mix - sources: - - name: Our World in Data based on BP Statistical Review of World Energy (2022) - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - description: >- - BP's region definitions sometimes differ from Our World in Data's definitions. For example, BP's North America includes only Canada, Mexico and United States, whereas Our World in Data's North America includes - countries in Central America (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like "North America (BP)" - to refer to BP's original data using their definition of the region, as well as "North America", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding - up (when possible) the contributions from the countries in the region. - - - [BP's region definitions](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy/using-the-review/definitions-and-explanatory-notes.html#accordion_Regional%20definitions), denoted with "(BP)", are: - - * "Asia Pacific (BP)": Brunei, Cambodia, China (Mainland), China Hong Kong SAR (Special Administrative Region), China Macau SAR (Special Administrative Region), Indonesia, Japan, Laos, Malaysia, Mongolia, - North Korea, Philippines, Singapore, South Asia (Afghanistan, Bangladesh, India, Myanmar, Nepal, Pakistan and Sri Lanka), South Korea, Taiwan, Thailand, Vietnam, Australia, New Zealand, Papua New Guinea and Oceania. - - * "Australasia (BP)": Australia, New Zealand. - - * "CIS (BP)" - Commonwealth of Independent States: Armenia, Azerbaijan, Belarus, Kazakhstan, Kyrgyzstan, Moldova, Russian Federation, Tajikistan, Turkmenistan, Uzbekistan. - - * "Caribbean (BP)": Atlantic islands between the US Gulf Coast and South America, including Puerto Rico, US Virgin Islands and Bermuda. - - * "Central America (BP)": Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama - - * "Eastern Africa (BP)": Territories on the east coast of Africa from Sudan to Mozambique. Also Madagascar, Malawi, Uganda, Zambia, Zimbabwe. - - * "Europe (BP)": European members of the OECD plus Albania, Bosnia-Herzegovina, Bulgaria, Croatia, Cyprus, Georgia, Gibraltar, Latvia, Lithuania, Malta, Montenegro, North Macedonia, Romania, Serbia and Ukraine. - - * "Middle Africa (BP)": Angola, Cameroon, Central African Republic, Chad, Democratic Republic of Congo, Republic of Congo, Equatorial Guinea, Gabon, Sao Tome & Principe. - - * "Middle East (BP)": Arabian Peninsula, Iran, Iraq, Israel, Jordan, Lebanon, Syria. - - * "Non-OECD (BP)" - Organization for Economic Co-operation and Development: All countries that are not members of the OECD. - - * "North America (BP)": US (excluding US territories), Canada, Mexico - - * "Northern Africa (BP)": Territories on the north coast of Africa from Egypt to Western Sahara. 
- - * "OECD (BP)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, - Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, UK, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, US. - - * "OPEC (BP)" - Organization of the Petroleum Exporting Countries: Iran, Iraq, Kuwait, Saudi Arabia, United Arab Emirates, Algeria, Libya, Angola, Equatorial Guinea, Gabon, Nigeria, Republic of Congo, Venezuela. - - * "South and Central America (BP)": Caribbean (including Puerto Rico and US Virgin Islands), Bermuda, Central and South America. - - * "Southern Africa (BP)": Botswana, Lesotho, Namibia, South Africa, Swaziland. - - * "Western Africa (BP)": Territories on the west coast of Africa from Mauritania to Nigeria, including Burkina Faso, Cape Verde, Mali and Niger. - - - Additionally, BP includes some regions that are not explicitly defined (e.g. "Other Europe", or "Other CIS"). We define our regions in the following way: - - * "Africa" - All African countries + "Other Africa". - - * "Asia" - All Asian countries + "Other Middle East" + "Other CIS" + "Other Asia Pacific". - - * "Europe" - All European countries + "Other Europe". - - * "North America" - All North American countries + "Other Caribbean" + "Other North America". - - * "Oceania" - All Oceanian countries. - - * "South America" - All South American countries + "Other South America". - - Where the individual countries in each region are defined [in this map](https://ourworldindata.org/world-region-map-definitions). Additional BP regions are ignored, since they belong to other - regions already included (e.g. the data for "Other Western Africa" is included in "Other Africa"). Finally, income groups are constructed following the definitions - [in this map](https://ourworldindata.org/grapher/world-banks-income-groups). - description: | - Raw data on energy consumption is sourced from [the BP Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html). - - Primary energy in exajoules (EJ) has been converted to TWh by Our World in Data based on a conversion factor of 1,000,000 / 3,600 (~277.778). - - For non-fossil based electricity sources (nuclear, hydro, wind, solar, geothermal, biomass in power, and other renewable sources), BP's generation (in TWh) corresponds to gross generation and not accounting for cross-border electricity supply. - Also, for non-fossil based electricity, there are two ways to define primary energy: - * One is "direct primary energy", which corresponds to the electricity generation (in TWh). - * The other is "input-equivalent primary energy" (also called "primary energy using the substitution method"). - This is the amount of fuel that would be required by thermal power stations to generate the reported electricity, as explained in [BP's methodology document](https://www.bp.com/content/dam/bp/business-sites/en/global/corporate/pdfs/energy-economics/statistical-review/bp-stats-review-2022-methodology.pdf). For example, if a country's nuclear power generated 100 TWh of electricity, and assuming that the efficiency of a standard thermal power plant is 38%, the input equivalent primary energy for this country would be 100 TWh / 0.38 = 263 TWh = 0.95 EJ. 
This input-equivalent primary energy takes account of the inefficiencies in fossil fuel production and provides a better approximation of each source's share of "final energy" consumption. - - Additional metrics have been calculated by Our World in Data: - - Annual change in energy consumption by source: this is calculated as the difference from the previous year. - - % of total primary energy: calculated as each source's share of primary energy (direct energy and primary energy using the substitution method) from all sources. - - Per capita energy by source: calculated as primary energy consumption by source, divided by population. - - Per capita figures have been calculated using a population dataset that is built and maintained by Our World in Data, based on [different sources](https://ourworldindata.org/population-sources). diff --git a/etl/steps/archive/garden/bp/2023-02-20/energy_mix.py b/etl/steps/archive/garden/bp/2023-02-20/energy_mix.py deleted file mode 100644 index 536c1f196eb..00000000000 --- a/etl/steps/archive/garden/bp/2023-02-20/energy_mix.py +++ /dev/null @@ -1,471 +0,0 @@ -"""Generate BP energy mix dataset using data from BP's statistical review of the world energy. - -""" - -import numpy as np -import pandas as pd -from owid.catalog import Dataset, Table -from shared import add_population - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 -# Exajoules to terawatt-hours. -EJ_TO_TWH = 1e6 / 3600 -# Petajoules to exajoules. -PJ_TO_EJ = 1e-3 - -# List all energy sources in the data. -ONLY_DIRECT_ENERGY = ["Coal", "Fossil fuels", "Gas", "Oil", "Biofuels"] -DIRECT_AND_EQUIVALENT_ENERGY = [ - "Hydro", - "Low-carbon energy", - "Nuclear", - "Other renewables", - "Renewables", - "Solar", - "Wind", - "Solar and wind", -] -ALL_SOURCES = sorted(ONLY_DIRECT_ENERGY + DIRECT_AND_EQUIVALENT_ENERGY) - - -def get_bp_data(bp_table: Table) -> pd.DataFrame: - """Extract a simple dataframe of BP statistical review data from the table in the dataset. - - Parameters - ---------- - bp_table : Table - BP table (from the dataset of BP statistical review). - - Returns - ------- - bp_data : pd.DataFrame - BP statistical review data. - - """ - bp_table = bp_table.copy() - - # Convert table (snake case) column names to human readable names. - bp_table = bp_table.rename( - columns={column: bp_table[column].metadata.title for column in bp_table.columns if column != "country_code"} - ).reset_index() - - # Rename human-readable columns (and select only the ones that will be used). - columns = { - "country": "Country", - "country_code": "Country code", - "year": "Year", - # Fossil fuel primary energy (in EJ). - "Coal Consumption - EJ": "Coal (EJ)", - "Gas Consumption - EJ": "Gas (EJ)", - "Oil Consumption - EJ": "Oil (EJ)", - # Non-fossil based electricity generation (in TWh). - "Hydro Generation - TWh": "Hydro (TWh - direct)", - "Nuclear Generation - TWh": "Nuclear (TWh - direct)", - "Solar Generation - TWh": "Solar (TWh - direct)", - "Wind Generation - TWh": "Wind (TWh - direct)", - "Geo Biomass Other - TWh": "Other renewables (TWh - direct)", - # Non-fossil based electricity generation converted into input-equivalent primary energy (in EJ). 
- "Hydro Consumption - EJ": "Hydro (EJ - equivalent)", - "Nuclear Consumption - EJ": "Nuclear (EJ - equivalent)", - "Solar Consumption - EJ": "Solar (EJ - equivalent)", - "Wind Consumption - EJ": "Wind (EJ - equivalent)", - "Geo Biomass Other - EJ": "Other renewables (EJ - equivalent)", - # Total, input-equivalent primary energy consumption (in EJ). - "Primary Energy Consumption - EJ": "Primary energy (EJ - equivalent)", - # Biofuels consumption (in PJ, that will be converted into EJ). - "Biofuels Consumption - PJ - Total": "Biofuels (PJ)", - } - - # Create a simple dataframe (without metadata and with a dummy index). - assert set(columns) < set(bp_table.columns), "Column names have changed in BP data." - - bp_data = pd.DataFrame(bp_table)[list(columns)].rename(errors="raise", columns=columns) - - return bp_data - - -def _check_that_substitution_method_is_well_calculated( - primary_energy: pd.DataFrame, -) -> None: - # Check that the constructed primary energy using the substitution method (in TWh) coincides with the - # input-equivalent primary energy (converted from EJ into TWh) given in the original data. - check = primary_energy[ - [ - "Year", - "Country", - "Primary energy (EJ - equivalent)", - "Primary energy (TWh - equivalent)", - ] - ].reset_index(drop=True) - check["Primary energy (TWh - equivalent) - original"] = check["Primary energy (EJ - equivalent)"] * EJ_TO_TWH - check = check.dropna().reset_index(drop=True) - # They may not coincide exactly, but at least check that they differ (point by point) by less than 10%. - max_deviation = max( - abs( - (check["Primary energy (TWh - equivalent)"] - check["Primary energy (TWh - equivalent) - original"]) - / check["Primary energy (TWh - equivalent) - original"] - ) - ) - assert max_deviation < 0.1 - - -def calculate_direct_primary_energy(primary_energy: pd.DataFrame) -> pd.DataFrame: - """Convert direct primary energy into TWh and create various aggregates (e.g. Fossil fuels and Renewables). - - Parameters - ---------- - primary_energy : pd.DataFrame - BP data. - - Returns - ------- - primary_energy : pd.DataFrame - Data, after adding direct primary energy. - - """ - primary_energy = primary_energy.copy() - - # Convert units of biofuels consumption. - primary_energy["Biofuels (EJ)"] = primary_energy["Biofuels (PJ)"] * PJ_TO_EJ - - # Create column for fossil fuels primary energy (if any of them is nan, the sum will be nan). - primary_energy["Fossil fuels (EJ)"] = ( - primary_energy["Coal (EJ)"] + primary_energy["Oil (EJ)"] + primary_energy["Gas (EJ)"] - ) - - # Convert primary energy of fossil fuels and biofuels into TWh. - for cat in ["Coal", "Oil", "Gas", "Biofuels"]: - primary_energy[f"{cat} (TWh)"] = primary_energy[f"{cat} (EJ)"] * EJ_TO_TWH - - # Create column for primary energy from fossil fuels (in TWh). - primary_energy["Fossil fuels (TWh)"] = ( - primary_energy["Coal (TWh)"] + primary_energy["Oil (TWh)"] + primary_energy["Gas (TWh)"] - ) - - # Create column for direct primary energy from renewable sources in TWh. - # (total renewable electricity generation and biofuels) (in TWh). - # By visually inspecting the original data, it seems that many data points that used to be zero are - # missing in the 2022 release, so filling nan with zeros seems to be a reasonable approach to avoids losing a - # significant amount of data. 
- primary_energy["Renewables (TWh - direct)"] = ( - primary_energy["Hydro (TWh - direct)"] - + primary_energy["Solar (TWh - direct)"].fillna(0) - + primary_energy["Wind (TWh - direct)"].fillna(0) - + primary_energy["Other renewables (TWh - direct)"].fillna(0) - + primary_energy["Biofuels (TWh)"].fillna(0) - ) - # Create column for direct primary energy from low-carbon sources in TWh. - # (total renewable electricity generation, biofuels, and nuclear power) (in TWh). - primary_energy["Low-carbon energy (TWh - direct)"] = primary_energy["Renewables (TWh - direct)"] + primary_energy[ - "Nuclear (TWh - direct)" - ].fillna(0) - # Create column for direct primary energy from solar and wind in TWh. - primary_energy["Solar and wind (TWh - direct)"] = primary_energy["Solar (TWh - direct)"].fillna(0) + primary_energy[ - "Wind (TWh - direct)" - ].fillna(0) - # Create column for total direct primary energy. - primary_energy["Primary energy (TWh - direct)"] = ( - primary_energy["Fossil fuels (TWh)"] + primary_energy["Low-carbon energy (TWh - direct)"] - ) - - return primary_energy - - -def calculate_equivalent_primary_energy(primary_energy: pd.DataFrame) -> pd.DataFrame: - """Convert input-equivalent primary energy into TWh and create various aggregates (e.g. Fossil fuels and - Renewables). - - Parameters - ---------- - primary_energy : pd.DataFrame - BP data. - - Returns - ------- - primary_energy : pd.DataFrame - Data, after adding input-equivalent primary energy. - - """ - primary_energy = primary_energy.copy() - # Create column for total renewable input-equivalent primary energy (in EJ). - # Fill missing values with zeros (see comment above). - primary_energy["Renewables (EJ - equivalent)"] = ( - primary_energy["Hydro (EJ - equivalent)"] - + primary_energy["Solar (EJ - equivalent)"].fillna(0) - + primary_energy["Wind (EJ - equivalent)"].fillna(0) - + primary_energy["Other renewables (EJ - equivalent)"].fillna(0) - + primary_energy["Biofuels (EJ)"].fillna(0) - ) - # Create column for low carbon energy (i.e. renewable plus nuclear energy). - primary_energy["Low-carbon energy (EJ - equivalent)"] = primary_energy[ - "Renewables (EJ - equivalent)" - ] + primary_energy["Nuclear (EJ - equivalent)"].fillna(0) - # Create column for solar and wind. - primary_energy["Solar and wind (EJ - equivalent)"] = primary_energy["Solar (EJ - equivalent)"].fillna( - 0 - ) + primary_energy["Wind (EJ - equivalent)"].fillna(0) - # Convert input-equivalent primary energy of non-fossil based electricity into TWh. - # The result is primary energy using the "substitution method". - for cat in DIRECT_AND_EQUIVALENT_ENERGY: - primary_energy[f"{cat} (TWh - equivalent)"] = primary_energy[f"{cat} (EJ - equivalent)"] * EJ_TO_TWH - # Create column for primary energy from all sources (which corresponds to input-equivalent primary - # energy for non-fossil based sources). - primary_energy["Primary energy (TWh - equivalent)"] = ( - primary_energy["Fossil fuels (TWh)"] + primary_energy["Low-carbon energy (TWh - equivalent)"] - ) - # Check that the primary energy constructed using the substitution method coincides with the - # input-equivalent primary energy. - _check_that_substitution_method_is_well_calculated(primary_energy) - - return primary_energy - - -def calculate_share_of_primary_energy(primary_energy: pd.DataFrame) -> pd.DataFrame: - """Calculate the share (percentage) of (direct or direct and input-equivalent) primary energy for each energy - source. - - Parameters - ---------- - primary_energy : pd.DataFrame - BP data. 
- - Returns - ------- - primary_energy : pd.DataFrame - BP data after adding columns for the share of primary energy. - - """ - primary_energy = primary_energy.copy() - # Check that all sources are included in the data. - expected_sources = sorted( - set( - [ - source.split("(")[0].strip() - for source in primary_energy.columns - if not source.startswith(("Country", "Year", "Primary")) - ] - ) - ) - assert expected_sources == ALL_SOURCES, "Sources may have changed names." - - for source in ONLY_DIRECT_ENERGY: - # Calculate each source as share of direct primary energy. - primary_energy[f"{source} (% direct primary energy)"] = ( - primary_energy[f"{source} (TWh)"] / primary_energy["Primary energy (TWh - direct)"] * 100 - ) - # Calculate each source as share of input-equivalent primary energy (i.e. substitution method). - primary_energy[f"{source} (% equivalent primary energy)"] = ( - primary_energy[f"{source} (EJ)"] / primary_energy["Primary energy (EJ - equivalent)"] * 100 - ) - - for source in DIRECT_AND_EQUIVALENT_ENERGY: - # Calculate each source as share of direct primary energy. - primary_energy[f"{source} (% direct primary energy)"] = ( - primary_energy[f"{source} (TWh - direct)"] / primary_energy["Primary energy (TWh - direct)"] * 100 - ) - # Calculate each source as share of input-equivalent primary energy (i.e. substitution method). - primary_energy[f"{source} (% equivalent primary energy)"] = ( - primary_energy[f"{source} (EJ - equivalent)"] / primary_energy["Primary energy (EJ - equivalent)"] * 100 - ) - - return primary_energy - - -def calculate_primary_energy_annual_change( - primary_energy: pd.DataFrame, -) -> pd.DataFrame: - """Calculate annual change of (direct or direct and input-equivalent) primary energy for each energy source. - - Parameters - ---------- - primary_energy : pd.DataFrame - BP data. - - Returns - ------- - primary_energy : pd.DataFrame - BP data after adding annual changes. - - """ - primary_energy = primary_energy.copy() - - # Calculate annual change in each source. - primary_energy = primary_energy.sort_values(["Country", "Year"]).reset_index(drop=True) - for source in ONLY_DIRECT_ENERGY: - # Create column for source percentage growth as a function of direct primary energy. - primary_energy[f"{source} (% growth)"] = primary_energy.groupby("Country")[f"{source} (TWh)"].pct_change() * 100 - # Create column for source absolute growth as a function of direct primary energy. - primary_energy[f"{source} (TWh growth)"] = primary_energy.groupby("Country")[f"{source} (TWh)"].diff() - - for source in DIRECT_AND_EQUIVALENT_ENERGY: - # Create column for source percentage growth as a function of primary energy - # (as a percentage, it is irrelevant whether it is direct or equivalent). - primary_energy[f"{source} (% growth)"] = ( - primary_energy.groupby("Country")[f"{source} (TWh - direct)"].pct_change() * 100 - ) - # Create column for source absolute growth as a function of direct primary energy. - primary_energy[f"{source} (TWh growth - direct)"] = primary_energy.groupby("Country")[ - f"{source} (TWh - direct)" - ].diff() - # Create column for source absolute growth as a function of input-equivalent primary energy. - primary_energy[f"{source} (TWh growth - equivalent)"] = primary_energy.groupby("Country")[ - f"{source} (TWh - equivalent)" - ].diff() - - return primary_energy - - -def add_per_capita_variables(primary_energy: pd.DataFrame, df_population: pd.DataFrame) -> pd.DataFrame: - """Add per-capita variables. 
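For example, per-capita values convert the TWh figure to kWh and divide by population
(hypothetical numbers; TWH_TO_KWH = 1e9, as defined at the top of this module):

    twh, population = 100.0, 50_000_000
    kwh_per_capita = twh / population * 1e9  # 2000.0 kWh per person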
- - Parameters - ---------- - primary_energy : pd.DataFrame - BP data. - df_population : pd.DataFrame - Population data. - - Returns - ------- - primary_energy : pd.DataFrame - BP data after adding per-capita variables. - - """ - primary_energy = primary_energy.copy() - - primary_energy = add_population( - df=primary_energy, - population=df_population, - country_col="Country", - year_col="Year", - population_col="Population", - warn_on_missing_countries=False, - ) - for source in ONLY_DIRECT_ENERGY: - primary_energy[f"{source} per capita (kWh)"] = ( - primary_energy[f"{source} (TWh)"] / primary_energy["Population"] * TWH_TO_KWH - ) - for source in DIRECT_AND_EQUIVALENT_ENERGY: - primary_energy[f"{source} per capita (kWh - direct)"] = ( - primary_energy[f"{source} (TWh - direct)"] / primary_energy["Population"] * TWH_TO_KWH - ) - primary_energy[f"{source} per capita (kWh - equivalent)"] = ( - primary_energy[f"{source} (TWh - equivalent)"] / primary_energy["Population"] * TWH_TO_KWH - ) - - # Drop unnecessary column. - primary_energy = primary_energy.drop(columns=["Population"]) - - return primary_energy - - -def prepare_output_table(primary_energy: pd.DataFrame) -> Table: - """Create a table with the processed data, ready to be in a garden dataset and to be uploaded to grapher (although - additional metadata may need to be added to the table). - - Parameters - ---------- - primary_energy : pd.DataFrame - Processed BP data. - - Returns - ------- - table : catalog.Table - Table, ready to be added to a new garden dataset. - - """ - # Keep only columns in TWh (and not EJ or PJ). - table = Table(primary_energy, short_name="energy_mix").drop( - errors="raise", - columns=[column for column in primary_energy.columns if (("(EJ" in column) or ("(PJ" in column))], - ) - - # Replace spurious inf values by nan. - table = table.replace([np.inf, -np.inf], np.nan) - - # Sort conveniently and add an index. - table = ( - table.sort_values(["Country", "Year"]) - .reset_index(drop=True) - .set_index(["Country", "Year"], verify_integrity=True) - .astype({"Country code": "category"}) - ) - - # Add metadata (e.g. unit) to each column. - # Define unit names (these are the long and short unit names that will be shown in grapher). - # The keys of the dictionary should correspond to units expected to be found in each of the variable names in table. - short_unit_to_unit = { - "TWh": "terawatt-hours", - "kWh": "kilowatt-hours", - "%": "%", - } - # Define number of decimal places to show (only relevant for grapher, not for the data). - short_unit_to_num_decimals = { - "TWh": 0, - "kWh": 0, - } - for column in table.columns: - table[column].metadata.title = column - for short_unit in ["TWh", "kWh", "%"]: - if short_unit in column: - table[column].metadata.short_unit = short_unit - table[column].metadata.unit = short_unit_to_unit[short_unit] - table[column].metadata.display = {} - if short_unit in short_unit_to_num_decimals: - table[column].metadata.display["numDecimalPlaces"] = short_unit_to_num_decimals[short_unit] - # Add the variable name without unit (only relevant for grapher). - table[column].metadata.display["name"] = column.split(" (")[0] - - return table - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load the latest BP statistical review. - ds_bp: Dataset = paths.load_dependency("statistical_review") - tb_bp = ds_bp["statistical_review"] - - # Load the population dataset. - ds_population: Dataset = paths.load_dependency("population") - # Get table from dataset. 
- tb_population = ds_population["population"] - # Make a dataframe out of the data in the table, with the required columns. - df_population = pd.DataFrame(tb_population) - - # - # Process data. - # - # Get a dataframe out of the BP table. - primary_energy = get_bp_data(bp_table=tb_bp) - - # Calculate direct primary energy, and primary energy using the substitution method. - primary_energy = calculate_direct_primary_energy(primary_energy=primary_energy) - primary_energy = calculate_equivalent_primary_energy(primary_energy=primary_energy) - - # Calculate each source's share of (direct and substitution-method) primary energy. - primary_energy = calculate_share_of_primary_energy(primary_energy=primary_energy) - - # Calculate annual change of primary energy. - primary_energy = calculate_primary_energy_annual_change(primary_energy) - - # Add per-capita variables. - primary_energy = add_per_capita_variables(primary_energy=primary_energy, df_population=df_population) - - # Prepare output data in a convenient way. - table = prepare_output_table(primary_energy) - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = create_dataset(dest_dir=dest_dir, tables=[table], default_metadata=ds_bp.metadata) - ds_garden.save() diff --git a/etl/steps/archive/garden/bp/2023-02-20/shared.py b/etl/steps/archive/garden/bp/2023-02-20/shared.py deleted file mode 100644 index e3035e32a3d..00000000000 --- a/etl/steps/archive/garden/bp/2023-02-20/shared.py +++ /dev/null @@ -1,106 +0,0 @@ -from pathlib import Path -from typing import List - -import pandas as pd -from structlog import get_logger - -from etl.data_helpers import geo - -log = get_logger() - -CURRENT_DIR = Path(__file__).parent -VERSION = CURRENT_DIR.name - - -def add_population( - df: pd.DataFrame, - population: pd.DataFrame, - country_col: str = "country", - year_col: str = "year", - population_col: str = "population", - interpolate_missing_population: bool = False, - warn_on_missing_countries: bool = True, - show_full_warning: bool = True, - expected_countries_without_population: List[str] = [], -) -> pd.DataFrame: - """Add a column of OWID population to the countries in the data, including population of historical regions. - - Parameters - ---------- - df : pd.DataFrame - Data without a column for population (after harmonizing elements, items and country names). - population : pd.DataFrame - Population data. - country_col : str - Name of country column in data. - year_col : str - Name of year column in data. - population_col : str - Name for new population column in data. - interpolate_missing_population : bool - True to linearly interpolate population on years that are present in df, but for which we do not have - population data; otherwise False to keep missing population data as nans. - For example, if interpolate_missing_population is True and df has data for all years between 1900 and 1910, - but population is only given for 1900 and 1910, population will be linearly interpolated between those years. - warn_on_missing_countries : bool - True to warn if population is not found for any of the countries in the data. - show_full_warning : bool - True to show affected countries if the previous warning is raised. - expected_countries_without_population : list - Countries that are expected to have no population data (ignored if warnings are activated).
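For instance, a sketch of the optional interpolation step (hypothetical numbers; mirrors the
function body below): with population known only for 1900 and 1910, intermediate years present
in df are filled linearly:

    population = pd.DataFrame({"country": ["France", "France"], "year": [1900, 1910], "population": [40e6, 41e6]})
    full_index = pd.MultiIndex.from_product([["France"], range(1900, 1911)], names=["country", "year"])
    population = (
        population.set_index(["country", "year"])
        .reindex(full_index)
        .groupby("country")
        .transform(lambda x: x.interpolate(method="linear", limit_direction="both"))
    )
    # population.loc[("France", 1905), "population"] is now 40.5e6.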
- - Returns - ------- - df_with_population : pd.DataFrame - Data after adding a column for population for all countries in the data. - - """ - - # Prepare population dataset. - population = population.reset_index().rename( - columns={ - "country": country_col, - "year": year_col, - "population": population_col, - } - )[[country_col, year_col, population_col]] - - # Check if there is any missing country. - missing_countries = set(df[country_col]) - set(population[country_col]) - if len(missing_countries) > 0: - if warn_on_missing_countries: - geo.warn_on_list_of_entities( - list_of_entities=missing_countries, - warning_message=( - f"{len(missing_countries)} countries not found in population" - " dataset. They will remain in the dataset, but have nan" - " population." - ), - show_list=show_full_warning, - ) - - if interpolate_missing_population: - # For some countries we have population data only on certain years, e.g. 1900, 1910, etc. - # Optionally fill missing years linearly. - countries_in_data = df[country_col].unique() - years_in_data = df[year_col].unique() - - population = population.set_index([country_col, year_col]).reindex( - pd.MultiIndex.from_product([countries_in_data, years_in_data], names=[country_col, year_col]) - ) - - population = population.groupby(country_col).transform( - lambda x: x.interpolate(method="linear", limit_direction="both") - ) - - error = "Countries without population data differs from list of expected countries without population data." - assert set(population[population[population_col].isnull()].reset_index()[country_col]) == set( - expected_countries_without_population - ), error - - # Add population to original dataframe. - df_with_population = pd.merge(df, population, on=[country_col, year_col], how="left") - - return df_with_population diff --git a/etl/steps/archive/garden/cait/2022-08-10/ghg_emissions_by_sector.countries.json b/etl/steps/archive/garden/cait/2022-08-10/ghg_emissions_by_sector.countries.json deleted file mode 100644 index b8805f2e1fb..00000000000 --- a/etl/steps/archive/garden/cait/2022-08-10/ghg_emissions_by_sector.countries.json +++ /dev/null @@ -1,197 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "Andorra": "Andorra", - "Angola": "Angola", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Brunei": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cape Verde": "Cape Verde", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Cook Islands": "Cook Islands", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cyprus": "Cyprus", - "Czech Republic": "Czechia", - "C\u00f4te d'Ivoire": "Cote d'Ivoire", - "Democratic Republic of the Congo": "Democratic Republic of Congo", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": 
"Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "European Union (27)": "European Union (27)", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "Gabon": "Gabon", - "Gambia": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Greece": "Greece", - "Grenada": "Grenada", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Laos": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Liechtenstein": "Liechtenstein", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macedonia": "North Macedonia", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mexico": "Mexico", - "Micronesia": "Micronesia (country)", - "Moldova": "Moldova", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "North Korea": "North Korea", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Qatar": "Qatar", - "Republic of Congo": "Congo", - "Romania": "Romania", - "Russia": "Russia", - "Rwanda": "Rwanda", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Korea": "South Korea", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syria": "Syria", - "Tajikistan": "Tajikistan", - "Tanzania": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Tuvalu": "Tuvalu", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - 
"United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United States": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela": "Venezuela", - "Vietnam": "Vietnam", - "World": "World", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe" -} diff --git a/etl/steps/archive/garden/cait/2022-08-10/ghg_emissions_by_sector.meta.yml b/etl/steps/archive/garden/cait/2022-08-10/ghg_emissions_by_sector.meta.yml deleted file mode 100644 index 1d497d5aaa5..00000000000 --- a/etl/steps/archive/garden/cait/2022-08-10/ghg_emissions_by_sector.meta.yml +++ /dev/null @@ -1,260 +0,0 @@ -dataset: - namespace: cait - version: 2022-08-10 - title: Greenhouse gas emissions by sector (CAIT, 2022) - short_name: ghg_emissions_by_sector - description: | - Emissions are measured in tonnes of carbon dioxide equivalents (CO₂e), based on 100-year global warming potential factors for non-CO₂ gases. - - Emissions are broken down by sector. Further information on sector definitions is available [here](https://ourworldindata.org/ghg-emissions-by-sector). - sources: - - - name: Our World in Data based on Climate Analysis Indicators Tool (CAIT). - published_by: CAIT Climate Data Explorer via Climate Watch - publication_year: 2022 - date_accessed: 2022-08-10 - url: https://www.climatewatchdata.org/data-explorer/historical-emissions -tables: - greenhouse_gas_emissions_by_sector: - variables: - agriculture: - description: "Greenhouse gas emissions from agriculture, measured in million tonnes of carbon dioxide-equivalents." - agriculture__per_capita: - description: "Greenhouse gas emissions from agriculture, measured in tonnes of carbon dioxide-equivalents per capita." - aviation_and_shipping: - description: "Greenhouse gas emissions from bunker fuels, measured in million tonnes of carbon dioxide-equivalents." - aviation_and_shipping__per_capita: - description: "Greenhouse gas emissions from bunker fuels, measured in tonnes of carbon dioxide-equivalents per capita." - buildings: - description: "Greenhouse gas emissions from buildings, measured in million tonnes of carbon dioxide-equivalents." - buildings__per_capita: - description: "Greenhouse gas emissions from buildings, measured in tonnes of carbon dioxide-equivalents per capita." - electricity_and_heat: - description: "Greenhouse gas emissions from electricity and heat, measured in million tonnes of carbon dioxide-equivalents." - electricity_and_heat__per_capita: - description: "Greenhouse gas emissions from electricity and heat, measured in tonnes of carbon dioxide-equivalents per capita." - energy: - description: "Greenhouse gas emissions from energy, measured in million tonnes of carbon dioxide-equivalents." - energy__per_capita: - description: "Greenhouse gas emissions from energy, measured in tonnes of carbon dioxide-equivalents per capita." - fugitive_emissions: - description: "Fugitive emissions of greenhouse gases from energy production, measured in million tonnes of carbon dioxide-equivalents." - fugitive_emissions__per_capita: - description: "Fugitive emissions of greenhouse gases from energy production, measured in tonnes of carbon dioxide-equivalents per capita." - industry: - description: "Greenhouse gas emissions from industry, measured in million tonnes of carbon dioxide-equivalents." - industry__per_capita: - description: "Greenhouse gas emissions from industry, measured in tonnes of carbon dioxide-equivalents per capita." 
- land_use_change_and_forestry: - description: "Greenhouse gas emissions from land use change and forestry, measured in million tonnes of carbon dioxide-equivalents." - land_use_change_and_forestry__per_capita: - description: "Greenhouse gas emissions from land use change and forestry, measured in tonnes of carbon dioxide-equivalents per capita." - manufacturing_and_construction: - description: "Greenhouse gas emissions from manufacturing and construction, measured in million tonnes of carbon dioxide-equivalents." - manufacturing_and_construction__per_capita: - description: "Greenhouse gas emissions from manufacturing and construction, measured in tonnes of carbon dioxide-equivalents per capita." - other_fuel_combustion: - description: "Greenhouse gas emissions from other fuel combustion, measured in million tonnes of carbon dioxide-equivalents." - other_fuel_combustion__per_capita: - description: "Greenhouse gas emissions from other fuel combustion, measured in tonnes of carbon dioxide-equivalents per capita." - total_excluding_lucf: - description: "Total greenhouse gas emissions excluding land-use change and forestry, measured in million tonnes of carbon dioxide-equivalents." - total_excluding_lucf__per_capita: - description: "Total greenhouse gas emissions excluding land-use change and forestry, measured in tonnes of carbon dioxide-equivalents per capita." - total_including_lucf: - description: "Total greenhouse gas emissions including land-use change and forestry, measured in million tonnes of carbon dioxide-equivalents." - total_including_lucf__per_capita: - description: "Total greenhouse gas emissions including land-use change and forestry, measured in tonnes of carbon dioxide-equivalents per capita." - transport: - description: "Greenhouse gas emissions from transport, measured in million tonnes of carbon dioxide-equivalents." - transport__per_capita: - description: "Greenhouse gas emissions from transport, measured in tonnes of carbon dioxide-equivalents per capita." - waste: - description: "Greenhouse gas emissions from waste, measured in million tonnes of carbon dioxide-equivalents." - waste__per_capita: - description: "Greenhouse gas emissions from waste, measured in tonnes of carbon dioxide-equivalents per capita." - # population: - carbon_dioxide_emissions_by_sector: - variables: - aviation_and_shipping: - description: "Carbon dioxide emissions from bunker fuels, measured in million tonnes." - aviation_and_shipping__per_capita: - description: "Carbon dioxide emissions from bunker fuels, measured in tonnes per capita." - buildings: - description: "Carbon dioxide emissions from buildings, measured in million tonnes." - buildings__per_capita: - description: "Carbon dioxide emissions from buildings, measured in tonnes per capita." - electricity_and_heat: - description: "Carbon dioxide emissions from electricity and heat, measured in million tonnes." - electricity_and_heat__per_capita: - description: "Carbon dioxide emissions from electricity and heat, measured in tonnes per capita." - energy: - description: "Carbon dioxide emissions from energy, measured in million tonnes." - energy__per_capita: - description: "Carbon dioxide emissions from energy, measured in tonnes per capita." - fugitive_emissions: - description: "Fugitive emissions of carbon dioxide from energy production, measured in million tonnes." - fugitive_emissions__per_capita: - description: "Fugitive emissions of carbon dioxide from energy production, measured in tonnes per capita." 
- industry: - description: "Carbon dioxide emissions from industry, measured in million tonnes." - industry__per_capita: - description: "Carbon dioxide emissions from industry, measured in tonnes per capita." - land_use_change_and_forestry: - description: "Carbon dioxide emissions from land use change and forestry, measured in million tonnes." - land_use_change_and_forestry__per_capita: - description: "Carbon dioxide emissions from land use change and forestry, measured in tonnes per capita." - manufacturing_and_construction: - description: "Carbon dioxide emissions from manufacturing and construction, measured in million tonnes." - manufacturing_and_construction__per_capita: - description: "Carbon dioxide emissions from manufacturing and construction, measured in tonnes per capita." - other_fuel_combustion: - description: "Carbon dioxide emissions from other fuel combustion, measured in million tonnes." - other_fuel_combustion__per_capita: - description: "Carbon dioxide emissions from other fuel combustion, measured in tonnes per capita." - total_excluding_lucf: - description: "Total carbon dioxide emissions excluding land-use change and forestry, measured in million tonnes." - total_excluding_lucf__per_capita: - description: "Total carbon dioxide emissions excluding land-use change and forestry, measured in tonnes per capita." - total_including_lucf: - description: "Total carbon dioxide emissions including land-use change and forestry, measured in million tonnes." - total_including_lucf__per_capita: - description: "Total carbon dioxide emissions including land-use change and forestry, measured in tonnes per capita." - transport: - description: "Carbon dioxide emissions from transport, measured in million tonnes." - transport__per_capita: - description: "Carbon dioxide emissions from transport, measured in tonnes per capita." - # population: - methane_emissions_by_sector: - variables: - agriculture: - description: "Methane emissions from agriculture, measured in million tonnes of carbon dioxide-equivalents." - agriculture__per_capita: - description: "Methane emissions from agriculture, measured in tonnes of carbon dioxide-equivalents per capita." - aviation_and_shipping: - description: "Methane emissions from bunker fuels, measured in million tonnes of carbon dioxide-equivalents." - aviation_and_shipping__per_capita: - description: "Methane emissions from bunker fuels, measured in tonnes of carbon dioxide-equivalents per capita." - buildings: - description: "Methane emissions from buildings, measured in million tonnes of carbon dioxide-equivalents." - buildings__per_capita: - description: "Methane emissions from buildings, measured in tonnes of carbon dioxide-equivalents per capita." - electricity_and_heat: - description: "Methane emissions from electricity and heat, measured in million tonnes of carbon dioxide-equivalents." - electricity_and_heat__per_capita: - description: "Methane emissions from electricity and heat, measured in tonnes of carbon dioxide-equivalents per capita." - energy: - description: "Methane emissions from energy, measured in million tonnes of carbon dioxide-equivalents." - energy__per_capita: - description: "Methane emissions from energy, measured in tonnes of carbon dioxide-equivalents per capita." - fugitive_emissions: - description: "Fugitive emissions of methane from energy production, measured in million tonnes of carbon dioxide-equivalents." 
- fugitive_emissions__per_capita: - description: "Fugitive emissions of methane from energy production, measured in tonnes of carbon dioxide-equivalents per capita." - industry: - description: "Methane emissions from industry, measured in million tonnes of carbon dioxide-equivalents." - industry__per_capita: - description: "Methane emissions from industry, measured in tonnes of carbon dioxide-equivalents per capita." - land_use_change_and_forestry: - description: "Methane emissions from land use change and forestry, measured in million tonnes of carbon dioxide-equivalents." - land_use_change_and_forestry__per_capita: - description: "Methane emissions from land use change and forestry, measured in tonnes of carbon dioxide-equivalents per capita." - manufacturing_and_construction: - description: "Methane emissions from manufacturing and construction, measured in million tonnes of carbon dioxide-equivalents." - manufacturing_and_construction__per_capita: - description: "Methane emissions from manufacturing and construction, measured in tonnes of carbon dioxide-equivalents per capita." - other_fuel_combustion: - description: "Methane emissions from other fuel combustion, measured in million tonnes of carbon dioxide-equivalents." - other_fuel_combustion__per_capita: - description: "Methane emissions from other fuel combustion, measured in tonnes of carbon dioxide-equivalents per capita." - total_excluding_lucf: - description: "Total methane emissions excluding land-use change and forestry, measured in million tonnes of carbon dioxide-equivalents." - total_excluding_lucf__per_capita: - description: "Total methane emissions excluding land-use change and forestry, measured in tonnes of carbon dioxide-equivalents per capita." - total_including_lucf: - description: "Total methane emissions including land-use change and forestry, measured in million tonnes of carbon dioxide-equivalents." - total_including_lucf__per_capita: - description: "Total methane emissions including land-use change and forestry, measured in tonnes of carbon dioxide-equivalents per capita." - transport: - description: "Methane emissions from transport, measured in million tonnes of carbon dioxide-equivalents." - transport__per_capita: - description: "Methane emissions from transport, measured in tonnes of carbon dioxide-equivalents per capita." - waste: - description: "Methane emissions from waste, measured in million tonnes of carbon dioxide-equivalents." - waste__per_capita: - description: "Methane emissions from waste, measured in tonnes of carbon dioxide-equivalents per capita." - # population: - nitrous_oxide_emissions_by_sector: - variables: - agriculture: - description: "Nitrous oxide emissions from agriculture, measured in million tonnes of carbon dioxide-equivalents." - agriculture__per_capita: - description: "Nitrous oxide emissions from agriculture, measured in tonnes of carbon dioxide-equivalents per capita." - aviation_and_shipping: - description: "Nitrous oxide emissions from bunker fuels, measured in million tonnes of carbon dioxide-equivalents." - aviation_and_shipping__per_capita: - description: "Nitrous oxide emissions from bunker fuels, measured in tonnes of carbon dioxide-equivalents per capita." - buildings: - description: "Nitrous oxide emissions from buildings, measured in million tonnes of carbon dioxide-equivalents." - buildings__per_capita: - description: "Nitrous oxide emissions from buildings, measured in tonnes of carbon dioxide-equivalents per capita." 
- electricity_and_heat: - description: "Nitrous oxide emissions from electricity and heat, measured in million tonnes of carbon dioxide-equivalents." - electricity_and_heat__per_capita: - description: "Nitrous oxide emissions from electricity and heat, measured in tonnes of carbon dioxide-equivalents per capita." - energy: - description: "Nitrous oxide emissions from energy, measured in million tonnes of carbon dioxide-equivalents." - energy__per_capita: - description: "Nitrous oxide emissions from energy, measured in tonnes of carbon dioxide-equivalents per capita." - fugitive_emissions: - description: "Fugitive emissions of nitrous oxide from energy production, measured in million tonnes of carbon dioxide-equivalents." - fugitive_emissions__per_capita: - description: "Fugitive emissions of nitrous oxide from energy production, measured in tonnes of carbon dioxide-equivalents per capita." - industry: - description: "Nitrous oxide emissions from industry, measured in million tonnes of carbon dioxide-equivalents." - industry__per_capita: - description: "Nitrous oxide emissions from industry, measured in tonnes of carbon dioxide-equivalents per capita." - land_use_change_and_forestry: - description: "Nitrous oxide emissions from land use change and forestry, measured in million tonnes of carbon dioxide-equivalents." - land_use_change_and_forestry__per_capita: - description: "Nitrous oxide emissions from land use change and forestry, measured in tonnes of carbon dioxide-equivalents per capita." - manufacturing_and_construction: - description: "Nitrous oxide emissions from manufacturing and construction, measured in million tonnes of carbon dioxide-equivalents." - manufacturing_and_construction__per_capita: - description: "Nitrous oxide emissions from manufacturing and construction, measured in tonnes of carbon dioxide-equivalents per capita." - other_fuel_combustion: - description: "Nitrous oxide emissions from other fuel combustion, measured in million tonnes of carbon dioxide-equivalents." - other_fuel_combustion__per_capita: - description: "Nitrous oxide emissions from other fuel combustion, measured in tonnes of carbon dioxide-equivalents per capita." - total_excluding_lucf: - description: "Total nitrous oxide emissions excluding land-use change and forestry, measured in million tonnes of carbon dioxide-equivalents." - total_excluding_lucf__per_capita: - description: "Total nitrous oxide emissions excluding land-use change and forestry, measured in tonnes of carbon dioxide-equivalents per capita." - total_including_lucf: - description: "Total nitrous oxide emissions including land-use change and forestry, measured in million tonnes of carbon dioxide-equivalents." - total_including_lucf__per_capita: - description: "Total nitrous oxide emissions including land-use change and forestry, measured in tonnes of carbon dioxide-equivalents per capita." - transport: - description: "Nitrous oxide emissions from transport, measured in million tonnes of carbon dioxide-equivalents." - transport__per_capita: - description: "Nitrous oxide emissions from transport, measured in tonnes of carbon dioxide-equivalents per capita." - waste: - description: "Nitrous oxide emissions from waste, measured in million tonnes of carbon dioxide-equivalents." - waste__per_capita: - description: "Nitrous oxide emissions from waste, measured in tonnes of carbon dioxide-equivalents per capita." 
- # population: - fluorinated_gas_emissions_by_sector: - variables: - industry: - description: "Fluorinated gas emissions from industry, measured in million tonnes of carbon dioxide-equivalents." - industry__per_capita: - description: "Fluorinated gas emissions from industry, measured in tonnes of carbon dioxide-equivalents per capita." - total_excluding_lucf: - description: "Total fluorinated gas emissions excluding land-use change and forestry, measured in million tonnes of carbon dioxide-equivalents." - total_excluding_lucf__per_capita: - description: "Total fluorinated gas emissions excluding land-use change and forestry, measured in tonnes of carbon dioxide-equivalents per capita." - total_including_lucf: - description: "Total fluorinated gas emissions including land-use change and forestry, measured in million tonnes of carbon dioxide-equivalents." - total_including_lucf__per_capita: - description: "Total fluorinated gas emissions including land-use change and forestry, measured in tonnes of carbon dioxide-equivalents per capita." - # population: diff --git a/etl/steps/archive/garden/cait/2022-08-10/ghg_emissions_by_sector.py b/etl/steps/archive/garden/cait/2022-08-10/ghg_emissions_by_sector.py deleted file mode 100644 index 9cc2a8587d1..00000000000 --- a/etl/steps/archive/garden/cait/2022-08-10/ghg_emissions_by_sector.py +++ /dev/null @@ -1,218 +0,0 @@ -from typing import Dict, List - -import pandas as pd -from owid import catalog -from owid.datautils import dataframes -from shared import CURRENT_DIR - -from etl.data_helpers import geo -from etl.paths import DATA_DIR - -# Details for dataset to export. -DATASET_SHORT_NAME = "ghg_emissions_by_sector" -COUNTRY_MAPPING_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.countries.json" -METADATA_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.meta.yml" -# Details for dataset to import. -MEADOW_DATASET_PATH = DATA_DIR / f"meadow/cait/2022-08-10/{DATASET_SHORT_NAME}" - -# All sectors expected in the data, and how to rename them. -SECTORS = { - "Agriculture": "Agriculture", - "Building": "Buildings", - "Bunker Fuels": "Aviation and shipping", - "Electricity/Heat": "Electricity and heat", - "Energy": "Energy", - "Fugitive Emissions": "Fugitive emissions", - "Industrial Processes": "Industry", - "Land-Use Change and Forestry": "Land-use change and forestry", - "Manufacturing/Construction": "Manufacturing and construction", - "Other Fuel Combustion": "Other fuel combustion", - "Total excluding LUCF": "Total excluding LUCF", - "Total including LUCF": "Total including LUCF", - "Transportation": "Transport", - "Waste": "Waste", -} - -# Suffix to add to the name of per capita variables. -PER_CAPITA_SUFFIX = " (per capita)" - -# Mapping of gas name (as given in CAIT data) to the name of the corresponding output table. -TABLE_NAMES = { - "All GHG": "Greenhouse gas emissions by sector", - "CH4": "Methane emissions by sector", - "CO2": "Carbon dioxide emissions by sector", - "F-Gas": "Fluorinated gas emissions by sector", - "N2O": "Nitrous oxide emissions by sector", -} - -# Aggregate regions to add, following OWID definitions. -REGIONS_TO_ADD = [ - # Continents. - "Africa", - "Asia", - "Europe", - # The EU27 is already included in the original data, and after inspection the data coincides with our aggregate. - # So we simply keep the original data for EU27 given in the data. - "North America", - "Oceania", - "South America", - # Income groups. 
- "Low-income countries", - "Upper-middle-income countries", - "Lower-middle-income countries", - "High-income countries", -] - -# Convert million tonnes to tonnes. -MT_TO_T = 1e6 - - -def create_table_for_gas(df: pd.DataFrame, gas: str, countries_in_regions: Dict[str, List[str]]) -> catalog.Table: - """Extract data for a particular gas and create a table with variables' metadata. - - Parameters - ---------- - df : pd.DataFrame - gas : str - Name of gas to consider (as called in "gas" column of the original data). - countries_in_regions : dict - Countries in regions (a dictionary where each key is the name of the region, and the value is a list of country - names in that region). This is used to avoid loading the list of countries in a region for each gas. - - Returns - ------- - table_gas : catalog.Table - Table with data for considered gas, and metadata for each variable. - - """ - # Select data for current gas. - df_gas = df[df["gas"] == gas].drop(columns="gas").reset_index(drop=True) - - # Pivot table to have a column for each sector. - df_gas = df_gas.pivot(index=["country", "year"], columns="sector", values="value").reset_index() - - # Create region aggregates. - for region in REGIONS_TO_ADD: - df_gas = geo.add_region_aggregates( - df=df_gas, - region=region, - countries_in_region=countries_in_regions[region], - countries_that_must_have_data=[], - num_allowed_nans_per_year=None, - frac_allowed_nans_per_year=0.2, - aggregations=None, - keep_original_region_with_suffix=" (CAIT)", - ) - - # Add population to data. - df_gas = geo.add_population_to_dataframe(df=df_gas) - - # Add per capita variables. - variables = [column for column in df_gas.columns if column not in ["country", "year", "population"]] - for variable in variables: - new_column = variable + PER_CAPITA_SUFFIX - df_gas[new_column] = MT_TO_T * df_gas[variable] / df_gas["population"] - - # Remove columns that only have nans. - df_gas = df_gas.drop(columns=df_gas.columns[df_gas.isnull().all()]) - # Remove rows that only have nans. - df_gas = df_gas.dropna( - subset=[column for column in df_gas.columns if column not in ["country", "year"]], - how="all", - ).reset_index(drop=True) - - # Set index and sort rows and columns conveniently. - df_gas = df_gas.set_index(["country", "year"], verify_integrity=True).sort_index() - df_gas = df_gas[sorted(df_gas.columns)] - - # Create table with this data but no metadata. - table_gas = catalog.Table(df_gas) - # Create variable metadata. - for variable in table_gas.columns: - if PER_CAPITA_SUFFIX in variable: - table_gas[variable].metadata.unit = "tonnes per capita" - table_gas[variable].metadata.short_unit = "t" - table_gas[variable].metadata.title = variable - table_gas[variable].metadata.display = {"name": variable.replace(PER_CAPITA_SUFFIX, "")} - else: - table_gas[variable].metadata.unit = "million tonnes" - table_gas[variable].metadata.short_unit = "million t" - table_gas[variable].metadata.title = variable - table_gas[variable].metadata.display = { - "name": variable, - "numDecimalPlaces": 0, - } - - return table_gas - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read dataset from meadow. - ds_meadow = catalog.Dataset(MEADOW_DATASET_PATH) - # Get table from meadow dataset. - tb_meadow = ds_meadow[ds_meadow.table_names[0]] - # Get dataframe from table. - df = pd.DataFrame(tb_meadow).reset_index() - - # List all countries inside each region. 
- countries_in_regions = { - region: sorted(set(geo.list_countries_in_region(region)) & set(df["country"])) for region in REGIONS_TO_ADD - } - - # - # Process data. - # - # Select only one data source (CAIT). - df = df[df["data_source"] == "CAIT"].reset_index(drop=True) - - # Check that there is only one unit in dataset. - assert set(df["unit"]) == {"MtCO₂e"}, "Unknown units in dataset" - # Remove unnecessary columns. - df = df.drop(columns=["unit", "id", "data_source", "iso_code3"]) - - # Rename sectors. - df["sector"] = dataframes.map_series( - series=df["sector"], - mapping=SECTORS, - warn_on_missing_mappings=True, - warn_on_unused_mappings=True, - ) - - # Harmonize country names. - df = geo.harmonize_countries( - df=df, - countries_file=COUNTRY_MAPPING_PATH, - warn_on_missing_countries=True, - warn_on_unused_countries=True, - ) - - # Create one table for each gas, and one for all gases combined. - tables = { - gas: create_table_for_gas(df=df, gas=gas, countries_in_regions=countries_in_regions) - for gas in df["gas"].unique() - } - - # - # Save outputs. - # - ds_garden = catalog.Dataset.create_empty(dest_dir) - # Import metadata from meadow dataset and update attributes using the metadata yaml file. - ds_garden.metadata.update_from_yaml(METADATA_PATH) - # Create dataset. - ds_garden.save() - - # Add all tables to dataset. - for table_name in list(tables): - table_title = TABLE_NAMES[table_name] - table_short_name = catalog.utils.underscore(table_title) - table = tables[table_name] - # Make column names snake lower case. - table = catalog.utils.underscore_table(table) - table.metadata.title = table_title - table.metadata.short_name = table_short_name - table.update_metadata_from_yaml(METADATA_PATH, table_short_name) - # Add table to dataset. - ds_garden.add(table) diff --git a/etl/steps/archive/garden/cait/2022-08-10/shared.py b/etl/steps/archive/garden/cait/2022-08-10/shared.py deleted file mode 100644 index 177853ccb74..00000000000 --- a/etl/steps/archive/garden/cait/2022-08-10/shared.py +++ /dev/null @@ -1,5 +0,0 @@ -from pathlib import Path - -NAMESPACE = "cait" -CURRENT_DIR = Path(__file__).parent -VERSION = CURRENT_DIR.name diff --git a/etl/steps/archive/garden/demography/2022-11-30/life_expectancy.historical_events.yml b/etl/steps/archive/garden/demography/2022-11-30/life_expectancy.historical_events.yml deleted file mode 100644 index 17beebc3f94..00000000000 --- a/etl/steps/archive/garden/demography/2022-11-30/life_expectancy.historical_events.yml +++ /dev/null @@ -1,274 +0,0 @@ -Afghanistan: - - name: Soviet-Afghan War (1979 - 1989) - link: https://en.wikipedia.org/wiki/Soviet%E2%80%93Afghan_War -American Samoa: - - name: Samoa earthquake and tsunami (2009) - link: https://en.wikipedia.org/wiki/2009_Samoa_earthquake_and_tsunami -Armenia: - - name: First Nagorno-Karabakh War (1988 - 1994) - link: https://en.wikipedia.org/wiki/First_Nagorno-Karabakh_War -Bangladesh: - - name: Bhola cyclone (1970) - link: https://en.wikipedia.org/wiki/1970_Bhola_cyclone - - name: Bangladesh Liberation War (1971) - link: https://en.wikipedia.org/wiki/Bangladesh_Liberation_War -Belize: - - name: Hurricane Hattie (1961) - link: https://en.wikipedia.org/wiki/Hurricane_Hattie -Bosnia and Herzegovina: - - name: Bosnian War (1992 - 1995) - link: https://en.wikipedia.org/wiki/Bosnian_War -Burundi: - - name: Ikiza (1972) - link: https://en.wikipedia.org/wiki/Ikiza - - name: Hutu Massacres of 1988 - link: https://en.wikipedia.org/wiki/List_of_massacres_in_Burundi - - name: 1993 ethnic violence in Burundi - 
link: https://en.wikipedia.org/wiki/1993_ethnic_violence_in_Burundi -Cambodia: - - name: Cambodian Civil War (1967 - 1975) - link: https://en.wikipedia.org/wiki/Cambodian_Civil_War - - name: Cambodian genocide (1975 - 1979) - link: https://en.wikipedia.org/wiki/Cambodian_genocide -China: - - name: Great Chinese Famine (1959 – 1961) - link: https://en.wikipedia.org/wiki/Great_Chinese_Famine -Congo: - - name: Second Republic of the Congo Civil War (1997 - 1999) - link: https://en.wikipedia.org/wiki/Republic_of_the_Congo_Civil_War_(1997%E2%80%931999) -Cyprus: - - name: Crisis of 1963 - 1964 - link: https://en.wikipedia.org/wiki/Cypriot_intercommunal_violence#Crisis_of_1963%E2%80%931964 - - name: Turkish invasion of Cyprus (1974) - link: https://en.wikipedia.org/wiki/Turkish_invasion_of_Cyprus -Equatorial Guinea: - - name: Independence under Macías (1968 - 1979) - link: https://en.wikipedia.org/wiki/Equatorial_Guinea#Independence_under_Mac%C3%ADas_(1968%E2%80%931979) -Eritrea: - - name: Eritrean–Ethiopian War (1999) - link: https://en.wikipedia.org/wiki/Eritrean%E2%80%93Ethiopian_War -Ethiopia: - - name: 1983 – 1985 famine in Ethiopia - link: https://en.wikipedia.org/wiki/1983%E2%80%931985_famine_in_Ethiopia -Finland: - - name: Finnish Civil War (1918) - link: https://en.wikipedia.org/wiki/Finnish_Civil_War - - name: Spanish Flu (1918 - 1920) - link: https://en.wikipedia.org/wiki/Spanish_flu - - name: Winter War (1939 - 1940) - link: https://en.wikipedia.org/wiki/Winter_War - - name: Continuation War (1941 - 1944) - link: https://en.wikipedia.org/wiki/Continuation_War -France: - - name: Franco-Prussian War (1870 - 1871) - link: https://en.wikipedia.org/wiki/Franco-Prussian_War - - name: World War I (1914 - 1918) - link: https://en.wikipedia.org/wiki/World_War_I - - name: Spanish Flu (1918 - 1920) - link: https://en.wikipedia.org/wiki/Spanish_flu - - name: Liberation of France (1944 - 1945) - link: https://en.wikipedia.org/wiki/Liberation_of_France - - name: Paris Commune (1871) - link: https://en.wikipedia.org/wiki/Paris_Commune -Georgia: - - name: Georgian Civil War (1991 - 1993) - link: https://en.wikipedia.org/wiki/Georgian_Civil_War -Grenada: - - name: United States invasion of Grenada (1983) - link: https://en.wikipedia.org/wiki/United_States_invasion_of_Grenada -Guatemala: - - name: 1976 Guatemala earthquake - link: https://en.wikipedia.org/wiki/1976_Guatemala_earthquake - - name: Guatemalan genocide (1981 - 1983) - link: https://en.wikipedia.org/wiki/Guatemalan_genocide -Guinea-Bissau: - - name: Guinea-Bissau War of Independence (1974) - link: https://en.wikipedia.org/wiki/Guinea-Bissau_War_of_Independence - - name: Guinea-Bissau Civil War (1998 - 1999) - link: https://en.wikipedia.org/wiki/Guinea-Bissau_Civil_War -Haiti: - - name: 2010 Haiti earthquake - link: https://en.wikipedia.org/wiki/2010_Haiti_earthquake -Honduras: - - name: Hurricane Fifi–Orlene (1974) - link: https://en.wikipedia.org/wiki/Hurricane_Fifi%E2%80%93Orlene - - name: Hurricane Mitch (1998) - link: https://en.wikipedia.org/wiki/Hurricane_Mitch -India: - - name: Indo-Pakistani War of 1965 - link: https://en.wikipedia.org/wiki/Indo-Pakistani_War_of_1965 -Indonesia: - - name: Japanese occupation of the Dutch East Indies (1942 - 1945) - link: https://en.wikipedia.org/wiki/Japanese_occupation_of_the_Dutch_East_Indies - - name: Indonesian National Revolution (1945 - 1949) - link: https://en.wikipedia.org/wiki/Indonesian_National_Revolution - - name: Indonesian mass killings of 1965 - 1966 - link: 
https://en.wikipedia.org/wiki/Indonesian_mass_killings_of_1965%E2%80%9366 - - name: 2004 Indian Ocean earthquake and tsunami - link: https://en.wikipedia.org/wiki/2004_Indian_Ocean_earthquake_and_tsunami -Iran: - - name: Buin Zahra earthquake (1962) - link: https://en.wikipedia.org/wiki/1962_Buin_Zahra_earthquake - - name: Manjil–Rudbar earthquake (1990) - link: https://en.wikipedia.org/wiki/1990_Manjil%E2%80%93Rudbar_earthquake - - name: Iranian Revolution (1978 - 1979) - link: https://en.wikipedia.org/wiki/Casualties_of_the_Iranian_Revolution - - name: Bam earthquake (2003) - link: https://en.wikipedia.org/wiki/2003_Bam_earthquake -Iraq: - - name: Iran–Iraq War (1980 - 1988) - link: https://en.wikipedia.org/wiki/Iran%E2%80%93Iraq_War - - name: Iraq War (2003 - 2011) - link: https://en.wikipedia.org/wiki/Iraq_War -Italy: - - name: Spanish Flu (1918 - 1920) - link: https://en.wikipedia.org/wiki/Spanish_flu - - name: Italian campaign (World War II) (1943 - 1945) - link: https://en.wikipedia.org/wiki/Italian_campaign_(World_War_II) -Japan: - - name: Atomic bombings of Hiroshima and Nagasaki (1945) - link: https://en.wikipedia.org/wiki/Atomic_bombings_of_Hiroshima_and_Nagasaki -Kosovo: - - name: Kosovo War (1998 - 1999) - link: https://en.wikipedia.org/wiki/Kosovo_War -Lebanon: - - name: Lebanese Civil War (1975 - 1990) - link: https://en.wikipedia.org/wiki/Lebanese_Civil_War -Liberia: - - name: First Liberian Civil War (1989 - 1997) - link: https://en.wikipedia.org/wiki/First_Liberian_Civil_War#ECOWAS_intervention_force_(August_1990) -Luxembourg: - - name: Spanish Flu (1918 - 1920) - link: https://en.wikipedia.org/wiki/Spanish_flu - - name: German occupation of Luxembourg during World War I (1918) - link: https://en.wikipedia.org/wiki/German_occupation_of_Luxembourg_during_World_War_I - - name: German occupation of Luxembourg during World War II (1940 - 1944) - link: https://en.wikipedia.org/wiki/German_occupation_of_Luxembourg_during_World_War_II -Montserrat: - - name: Hurricane Hugo (1989) - link: https://en.wikipedia.org/wiki/Hurricane_Hugo#Montserrat - - name: Eruption of Soufriere Hills Volcano (1995) - link: https://earthobservatory.nasa.gov/images/16081/eruption-of-soufriere-hills-volcano#:~:text=Located%20on%20the%20Caribbean%20Island,was%20destroyed%20in%20the%20eruption. 
-Mozambique: - - name: Mozambican Civil War (1977 - 1992) - link: https://en.wikipedia.org/wiki/Mozambican_Civil_War -Myanmar: - - name: Cyclone Nargis (2008) - link: https://en.wikipedia.org/wiki/Cyclone_Nargis -Namibia: - - name: South African Border War (1966 - 1990) - link: https://en.wikipedia.org/wiki/South_African_Border_War -Netherlands: - - name: Spanish Flu (1918 - 1920) - link: https://en.wikipedia.org/wiki/Spanish_flu - - name: World War II (1939 - 1945) - link: https://en.wikipedia.org/wiki/Netherlands_in_World_War_II -Nicaragua: - - name: 1972 Nicaragua earthquake - link: https://en.wikipedia.org/wiki/1972_Nicaragua_earthquake -Nigeria: - - name: Nigerian Civil War (1967 - 1970) - link: https://en.wikipedia.org/wiki/Nigerian_Civil_War -North Korea: - - name: Korean War (1950 - 1953 de facto) - link: https://en.wikipedia.org/wiki/Korean_War - - name: North Korean famine (1994 - 2002) - link: https://en.wikipedia.org/wiki/North_Korean_famine -Norway: - - name: Spanish Flu (1918 - 1920) - link: https://en.wikipedia.org/wiki/Spanish_flu -Pakistan: - - name: Indo-Pakistani War of 1971 - link: https://en.wikipedia.org/wiki/Indo-Pakistani_War_of_1971 -Palestine: - - name: Yom Kippur War (1973) - link: https://en.wikipedia.org/wiki/Yom_Kippur_War -Panama: - - name: United States invasion of Panama (1989) - link: https://en.wikipedia.org/wiki/United_States_invasion_of_Panama -Peru: - - name: Áncash earthquake (1970) - link: https://es.wikipedia.org/wiki/Terremoto_de_%C3%81ncash_de_1970 -Russia: - - name: Soviet famine (1930 – 1933) - link: https://en.wikipedia.org/wiki/Soviet_famine_of_1930%E2%80%931933 - - name: Soviet famine (1946 – 1947) - link: https://en.wikipedia.org/wiki/Soviet_famine_of_1946%E2%80%931947 -Rwanda: - - name: Rwandan genocide (1994) - link: https://en.wikipedia.org/wiki/Rwandan_genocide -Samoa: - - name: Samoa earthquake and tsunami (2009) - link: https://en.wikipedia.org/wiki/2009_Samoa_earthquake_and_tsunami -Sierra Leone: - - name: Sierra Leone Civil War (1991 - 2002) - link: https://en.wikipedia.org/wiki/Sierra_Leone_Civil_War -Somalia: - - name: Isaaq genocide (1987 - 1989) - link: https://en.wikipedia.org/wiki/Isaaq_genocide - - name: Somali Civil War (1991 - present) - link: https://en.wikipedia.org/wiki/Somali_Civil_War -South Korea: - - name: Korean War (1950 - 1953 de facto) - link: https://en.wikipedia.org/wiki/Korean_War -South Sudan: - - name: Second Sudanese Civil War (1983 - 2005) - link: https://en.wikipedia.org/wiki/Second_Sudanese_Civil_War -Spain: - - name: Spanish Flu (1918 - 1920) - link: https://en.wikipedia.org/wiki/Spanish_flu - - name: Spanish Civil War (1936 - 1939) - link: https://en.wikipedia.org/wiki/Spanish_Civil_War -Sri Lanka: - - name: 2004 Indian Ocean earthquake and tsunami - link: https://en.wikipedia.org/wiki/2004_Indian_Ocean_earthquake_and_tsunami -Sudan: - - name: Second Sudanese Civil War (1983 - 2005) - link: https://en.wikipedia.org/wiki/Second_Sudanese_Civil_War -Sweden: - - name: Famine caused by bad harvests in the previous years (1773) - link: https://lup.lub.lu.se/search/ws/files/5987932/8032075.pdf - - name: Coup of 1809 - link: https://en.wikipedia.org/wiki/Coup_of_1809 - - name: Spanish Flu (1918 - 1920) - link: https://en.wikipedia.org/wiki/Spanish_flu -Switzerland: - - name: Spanish Flu (1918 - 1920) - link: https://en.wikipedia.org/wiki/Spanish_flu -Syria: - - name: Yom Kippur War (1973) - link: https://en.wikipedia.org/wiki/Yom_Kippur_War - - name: Hama massacre (1982) - link: 
https://en.wikipedia.org/wiki/1982_Hama_massacre - - name: Syrian civil war (2011 - present) - link: https://en.wikipedia.org/wiki/Syrian_civil_war -Tajikistan: - - name: Tajikistani Civil War (1992 - 1997) - link: https://en.wikipedia.org/wiki/Tajikistani_Civil_War -East Timor: - - name: Indonesian occupation of East Timor (1975 - 1999) - link: https://en.wikipedia.org/wiki/Indonesian_occupation_of_East_Timor -Ukraine: - - name: Holocaust in Ukraine (1941 - 1944) - link: https://en.wikipedia.org/wiki/The_Holocaust_in_Ukraine - - name: Battle of Kiev (1943) - link: https://en.wikipedia.org/wiki/Battle_of_Kiev_(1943) - - name: Battle of the Dnieper (1943) - link: https://en.wikipedia.org/wiki/Battle_of_the_Dnieper -United Kingdom: - - name: 1557 influenza pandemic - link: https://en.wikipedia.org/wiki/1557_influenza_pandemic - - name: Spanish Flu (1918 - 1920) - link: https://en.wikipedia.org/wiki/Spanish_flu -United States: - - name: Spanish Flu (1918 - 1920) - link: https://en.wikipedia.org/wiki/Spanish_flu -Vietnam: - - name: Vietnam War (1955 - 1975) - link: https://en.wikipedia.org/wiki/Vietnam_War -Western Sahara: - - name: Western Sahara War (1975 - 1991) - link: https://en.wikipedia.org/wiki/Western_Sahara_War -Yemen: - - name: South Yemen Civil War (1986) - link: https://en.wikipedia.org/wiki/South_Yemen_Civil_War diff --git a/etl/steps/archive/garden/demography/2022-11-30/life_expectancy.meta.yml b/etl/steps/archive/garden/demography/2022-11-30/life_expectancy.meta.yml deleted file mode 100644 index 571856518f6..00000000000 --- a/etl/steps/archive/garden/demography/2022-11-30/life_expectancy.meta.yml +++ /dev/null @@ -1,251 +0,0 @@ -dataset: - namespace: demography - short_name: life_expectancy - title: Life Expectancy (various sources) - description: >- - This dataset has been created using multiple sources. We use UN WPP for data since 1950 (estimates and medium variant) and a combination of other sources before this year. - - - For continents, we use UN's definitions for values after 1950 and Riley (2005) definitions for values prior to 1950. Note that Riley reports "Americas", while the UN reports "Northern America" and "Latin America and the Caribbean" separately. - - - **SOURCES** - - - **World Population Prospects - UN (2022)** - - World Population Prospects 2022 is the 27th edition of the official estimates and projections of the global population that have been published by the United Nations since 1951. - The estimates are based on all available sources of data on population size and levels of fertility, mortality and international migration for 237 countries or areas. - More details at https://population.un.org/wpp/Publications/. - - - **Life Tables - Human Mortality Database (2022-11-04)** - - To facilitate rapid downloads, the database has been organized into zipped data files. Two series of files are intended for different purposes and for different users. - For users who want to obtain all available data for an individual country or for all countries, the zipped data files labeled "By country" are recommended. The file - organization follows internal practices and is not particularly user-friendly, but all publicly-available HMD data are included in this set. For users who only want - information of a given kind for all countries, the files "By statistic" are recommended. In this case the file organization is simpler, but only certain parts of - the database (i.e., items labeled "Complete Data Series" on country pages) are available in this format. 
- - - More details can be found at https://www.mortality.org/Data/ExplanatoryNotes. - - - **Life Expectancy at Birth (Total) - Zijdeman et al. (2015)** - - This dataset provides Period Life Expectancy at birth per country and year. The overall aim of the dataset is to cover the entire world for the period 1500-2000. - The current version (version 2) was built as part of the OECD "How was life" project. The dataset has nearly global coverage for the post-1950 period, while pre-1950 - the coverage decreases the further back in time one goes. Depending on the source, the data are annual, 5-yearly, or decadal estimates. - - - The sources used are: - - - - [UN World Population Project](http://esa.un.org/wpp/). - - - [Human Mortality Database](http://www.mortality.org). - - - [Gapminder](http://www.gapminder.org). - - - [OECD](http://stats.oecd.org). - - - [Montevideo-Oxford Latin America Economic History Database](http://www.lac.ox.ac.uk/moxlad-database). - - - [ONS](http://www.ons.gov.uk/ons/datasets-and-tables/index.html). - - - [Australian Bureau of Statistics](http://www.abs.gov.au/ausstats/abs@.nsf/web+pages/statistics?opendocument#from-banner=LN). - - - Kannisto, V., Nieminen, M. & Turpeinen, O. (1999). Finnish Life Tables since 1751, Demographic Research, 1(1), DOI: 10.4054/DemRes.1999.1.1 - - - For specifics concerning (selections of) the sources, see the R-file below, with which the dataset was created. - - - Link to paper can be found at https://clio-infra.eu/docs/Total_life_expectancy.docx. - licenses: - - name: CC BY 3.0 IGO - url: http://creativecommons.org/licenses/by/3.0/igo/ - - name: CC BY 4.0 - url: https://www.mortality.org/Data/UserAgreement - - name: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication License - url: https://datasets.iisg.amsterdam/dataset.xhtml?persistentId=hdl:10622/LKYT53 - - name: JSTOR - url: https://about.jstor.org/terms/ - version: '2022-11-30' - sources: - - name: United Nations, Department of Economic and Social Affairs, Population Division (2022) - url: https://population.un.org/wpp/Download/ - owid_data_url: https://walden.nyc3.digitaloceanspaces.com/un/2022-07-11/un_wpp.zip - date_accessed: '2022-09-09' - publication_date: '2022-07-11' - publication_year: 2022 - - name: Human Mortality Database - url: https://www.mortality.org/Data/ZippedDataFiles - owid_data_url: https://walden.nyc3.digitaloceanspaces.com/hmd/2022-11-04/life_tables.zip - date_accessed: '2022-11-04' - publication_year: 2022 - - name: Zijdeman et al. 
(2015) (via clio-infra.eu) - url: https://clio-infra.eu/Indicators/LifeExpectancyatBirthTotal.html - source_data_url: https://clio-infra.eu/data/LifeExpectancyatBirth(Total)_Broad.xlsx - owid_data_url: https://walden.nyc3.digitaloceanspaces.com/papers/2022-11-01/zijdeman_et_al_2015.xlsx - date_accessed: '2022-11-01' - publication_year: 2015 - - name: Riley (2005) - url: https://doi.org/10.1111/j.1728-4457.2005.00083.x - source_data_url: https://u.demog.berkeley.edu/~jrw/Biblio/Eprints/%20P-S/riley.2005_estimates.global.e0.pdf - owid_data_url: https://walden.nyc3.digitaloceanspaces.com/papers/2022-11-01/riley_2005.pdf - date_accessed: '2022-11-01' - publication_date: '2005-10-21' - publication_year: 2005 -tables: - historical: - title: Life Expectancy (various sources) - Historical - variables: - life_expectancy_0_hist: - title: Life expectancy at birth (historical) - description: "The average number of years that a newborn could expect to live, if he or she were to pass through life - exposed to the sex- and age-specific death rates prevailing at the time of his or her birth, for a specific year, - in a given country, territory, or geographic area.\n\nDefinition from the WHO.\n" - unit: years - short_unit: years - sources: - - name: UN WPP (2022) - url: https://population.un.org/wpp/Download/ - - name: Zijdeman et al. (2015) - url: https://clio-infra.eu/Indicators/LifeExpectancyatBirthTotal.html - - name: Riley (2005) - url: https://doi.org/10.1111/j.1728-4457.2005.00083.x - life_expectancy_15_hist: - title: Life expectancy at 15 (historical) - description: "The average number of remaining years of life expected by a hypothetical cohort of individuals who already - reached age 15 and would be subject during the remainder of their lives to the mortality rates of a given period. - It is expressed as years.\n\nDefinition from the UN.\n" - unit: years - short_unit: years - sources: - - name: UN WPP (2022) - url: https://population.un.org/wpp/Download/ - - name: Human Mortality Database - url: https://www.mortality.org/Data/ZippedDataFiles - life_expectancy_65_hist: - title: Life expectancy at 65 (historical) - description: "The average number of remaining years of life expected by a hypothetical cohort of individuals who already - reached age 65 and would be subject during the remainder of their lives to the mortality rates of a given period. - It is expressed as years.\n\nDefinition from the UN.\n" - unit: years - short_unit: years - sources: - - name: UN WPP (2022) - url: https://population.un.org/wpp/Download/ - - name: Human Mortality Database - url: https://www.mortality.org/Data/ZippedDataFiles - life_expectancy_80_hist: - title: Life expectancy at 80 (historical) - description: "The average number of remaining years of life expected by a hypothetical cohort of individuals who already - reached age 80 and would be subject during the remainder of their lives to the mortality rates of a given period. 
- It is expressed as years.\n\nDefinition from the UN.\n" - unit: years - short_unit: years - sources: - - name: UN WPP (2022) - url: https://population.un.org/wpp/Download/ - - name: Human Mortality Database - url: https://www.mortality.org/Data/ZippedDataFiles - life_expectancy: - title: Life Expectancy (various sources) - variables: - life_expectancy_0: - title: Life expectancy at birth - description: "The average number of years that a newborn could expect to live, if he or she were to pass through life - exposed to the sex- and age-specific death rates prevailing at the time of his or her birth, for a specific year, - in a given country, territory, or geographic area.\n\nDefinition from the WHO.\n" - unit: years - short_unit: years - sources: - - name: UN WPP (2022) - url: https://population.un.org/wpp/Download/ - - name: Zijdeman et al. (2015) - url: https://clio-infra.eu/Indicators/LifeExpectancyatBirthTotal.html - - name: Riley (2005) - url: https://doi.org/10.1111/j.1728-4457.2005.00083.x - life_expectancy_15: - title: Life expectancy at 15 - description: "The average number of remaining years of life expected by a hypothetical cohort of individuals who already - reached age 15 and would be subject during the remainder of their lives to the mortality rates of a given period. - It is expressed as years.\n\nDefinition from the UN.\n" - unit: years - short_unit: years - sources: - - name: UN WPP (2022) - url: https://population.un.org/wpp/Download/ - - name: Human Mortality Database - url: https://www.mortality.org/Data/ZippedDataFiles - life_expectancy_65: - title: Life expectancy at 65 - description: "The average number of remaining years of life expected by a hypothetical cohort of individuals who already - reached age 65 and would be subject during the remainder of their lives to the mortality rates of a given period. - It is expressed as years.\n\nDefinition from the UN.\n" - unit: years - short_unit: years - sources: - - name: UN WPP (2022) - url: https://population.un.org/wpp/Download/ - - name: Human Mortality Database - url: https://www.mortality.org/Data/ZippedDataFiles - life_expectancy_80: - title: Life expectancy at 80 - description: "The average number of remaining years of life expected by a hypothetical cohort of individuals who already - reached age 80 and would be subject during the remainder of their lives to the mortality rates of a given period. 
- It is expressed as years.\n\nDefinition from the UN.\n" - unit: years - short_unit: years - sources: - - name: UN WPP (2022) - url: https://population.un.org/wpp/Download/ - - name: Human Mortality Database - url: https://www.mortality.org/Data/ZippedDataFiles - projection: - title: Life Expectancy (various sources) - Projection - variables: - life_expectancy_0_proj: - title: Life expectancy at birth (projection) - description: "The average number of years that a newborn could expect to live, if he or she were to pass through life - exposed to the sex- and age-specific death rates prevailing at the time of his or her birth, for a specific year, - in a given country, territory, or geographic area.\n\nDefinition from the WHO.\n" - unit: years - short_unit: years - sources: - - name: UN WPP (2022) - url: https://population.un.org/wpp/Download/ - life_expectancy_15_proj: - title: Life expectancy at 15 (projection) - description: "The average number of remaining years of life expected by a hypothetical cohort of individuals who already - reached age 15 and would be subject during the remainder of their lives to the mortality rates of a given period. - It is expressed as years.\n\nDefinition from the UN.\n" - unit: years - short_unit: years - sources: - - name: UN WPP (2022) - url: https://population.un.org/wpp/Download/ - life_expectancy_65_proj: - title: Life expectancy at 65 (projection) - description: "The average number of remaining years of life expected by a hypothetical cohort of individuals who already - reached age 65 and would be subject during the remainder of their lives to the mortality rates of a given period. - It is expressed as years.\n\nDefinition from the UN.\n" - unit: years - short_unit: years - sources: - - name: UN WPP (2022) - url: https://population.un.org/wpp/Download/ - - name: Human Mortality Database - url: https://www.mortality.org/Data/ZippedDataFiles - life_expectancy_80_proj: - title: Life expectancy at 80 (projection) - description: "The average number of remaining years of life expected by a hypothetical cohort of individuals who already - reached age 80 and would be subject during the remainder of their lives to the mortality rates of a given period. - It is expressed as years.\n\nDefinition from the UN.\n" - unit: years - short_unit: years - sources: - - name: UN WPP (2022) - url: https://population.un.org/wpp/Download/ diff --git a/etl/steps/archive/garden/demography/2022-11-30/life_expectancy.py b/etl/steps/archive/garden/demography/2022-11-30/life_expectancy.py deleted file mode 100644 index 923f75f331b..00000000000 --- a/etl/steps/archive/garden/demography/2022-11-30/life_expectancy.py +++ /dev/null @@ -1,496 +0,0 @@ -"""This script generates a dataset with the OMM on Life Expectancy. - -To do this, we use different sources, depending on the year and metric: - -> Life expectancy at birth: - - UN WPP for data since 1950. - - Zijdeman et al. (2015) for data prior to 1950. - - Riley (2005) for data prior to 1950 for region aggregates. - -> Life expectancy at age X: - - UN WPP for data since 1950. - - HMD for data prior to 1950. 
-""" -from typing import List - -import pandas as pd -import yaml -from owid.catalog import Dataset, DatasetMeta, Table, TableMeta -from owid.catalog.utils import underscore_table -from structlog import get_logger - -from etl.data_helpers import geo -from etl.helpers import PathFinder -from etl.paths import DATA_DIR - -log = get_logger() - -# naming conventions -N = PathFinder(__file__) - -# short name of new dataset -SHORT_NAME = N.short_name -# dataset paths -GARDEN_WPP_DATASET = DATA_DIR / "garden" / "un" / "2022-07-11" / "un_wpp" -GARDEN_HMD_DATASET = DATA_DIR / "garden" / "hmd" / "2022-11-04" / "life_tables" -GARDEN_ZIJDEMAN_DATASET = DATA_DIR / "garden" / "papers" / "2022-11-03" / "zijdeman_et_al_2015" -GARDEN_RILEY_DATASET = DATA_DIR / "garden" / "papers" / "2022-11-04" / "riley_2005" -# auxiliary datasets -GARDEN_POPULATION_WPP = DATA_DIR / "garden" / "un" / "2022-07-11" / "un_wpp" -# index column names: this is used when setting indices in dataframes -COLUMNS_IDX = ["country", "year"] -# age groups considered besides at 0 (at birth) -AGES_EXTRA = ["15", "65", "80"] -# year when UN WPP data starts -YEAR_WPP_START = 1950 -# year separating historical and projection data in UN WPP dataset. -YEAR_HIST_PROJ = 2021 -# Versioning -VERSION = "2022-11-30" -# Cold start -# The first time this is executed, no metadata file is available. It is created on the fly, during execution time. -# Once this is done, we create the metadata YAML file using etl-metadata-export command. -# From then, we use the metadata in that YAML file, which might have some manual edits. -COLD_START = False - -# Region mapping -# We will be using continent names without (Entity) suffix. This way charts show continuity between lines from different datasets (e.g. riley and UN) -REGION_MAPPING = { - "Africa (Riley 2005)": "Africa", - "Americas (Riley 2005)": "Americas", - "Asia (Riley 2005)": "Asia", - "Europe (Riley 2005)": "Europe", - "Oceania (Riley 2005)": "Oceania", - "Africa (UN)": "Africa", - "Northern America (UN)": "Northern America", - "Latin America and the Caribbean (UN)": "Latin America and the Caribbean", - "Asia (UN)": "Asia", - "Europe (UN)": "Europe", - "Oceania (UN)": "Oceania", -} -# Path to historical events file -# this file contains a list of historical events that likely caused data anomalies in the dataset. -# note that proving that these anomalies are caused by those events would require some complicated causal inference. 
-PATH_HIST_EVENTS = N.directory / "life_expectancy.historical_events.yml" - - -def run(dest_dir: str) -> None: - log.info("life_expectancy.start") - - # read datasets from garden - ds_wpp = Dataset(GARDEN_WPP_DATASET) - ds_hmd = Dataset(GARDEN_HMD_DATASET) - ds_zij = Dataset(GARDEN_ZIJDEMAN_DATASET) - ds_ril = Dataset(GARDEN_RILEY_DATASET) - # group datasets into single list - all_ds = [ds_wpp, ds_hmd, ds_zij, ds_ril] - # load dataframes - df_wpp = load_wpp(ds_wpp) - df_hmd = load_hmd(ds_hmd) - df_zij = load_zijdeman(ds_zij) - df_ril = load_riley(ds_ril) - - # create tables (all-years, historical and projections) - log.info("life_expectancy: create table with all-years data") - tb = make_table(df_wpp, df_hmd, df_zij, df_ril) - log.info("life_expectancy: create table with historical data") - tb_historical = make_table(df_wpp, df_hmd, df_zij, df_ril, only_historical=True) - log.info("life_expectancy: create table with projection data") - tb_projection = make_table(df_wpp, df_hmd, df_zij, df_ril, only_projections=True) - - # create dataset - ds_garden = Dataset.create_empty(dest_dir) - if COLD_START: - ds_garden.metadata = make_metadata(all_ds) - else: - ds_garden.metadata.update_from_yaml(N.metadata_path) - - # add tables to dataset - log.info("life_expectancy: add tables to dataset") - ds_garden.add(tb) - ds_garden.save() - ds_garden.add(tb_historical) - ds_garden.save() - ds_garden.add(tb_projection) - ds_garden.save() - - # add historical events table - ds_garden.add(make_hist_events_table()) - ds_garden.save() - - log.info("life_expectancy.end") - - -def make_table( - df_wpp: pd.DataFrame, - df_hmd: pd.DataFrame, - df_zij: pd.DataFrame, - df_ril: pd.DataFrame, - only_historical: bool = False, - only_projections: bool = False, -) -> Table: - """Create table. - - Joins all different sources into a single dataframe. - - By default, it creates a table with all years. Use `only_historical` and `only_projections` to create tables with - only historical data or only projections, respectively. - """ - log.info("life_expectancy.make_table") - df = merge_dfs(df_wpp, df_hmd, df_zij, df_ril) - - # Filter - assert not (only_historical and only_projections), "Both only_historical and only_projections can't be True!" - if only_historical: - df = df[df.index.get_level_values("year") <= YEAR_HIST_PROJ] - if only_projections: - df = df[df.index.get_level_values("year") > YEAR_HIST_PROJ] - - # Build table - log.info("life_expectancy.make_table.build_table") - tb = Table(df) - tb = underscore_table(tb) - - # metadata - tb = add_metadata_to_table(tb, only_historical, only_projections) - return tb - - -def add_metadata_to_table(tb: Table, only_historical: bool, only_projections: bool) -> Table: - """Add metadata to table. - - This is done from scratch or by reading the YAML file. Note that only one table is actually defined. - The other two (historical and projections) are equivalent with minor changes in title and variable titles/names. 
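The only difference between the three tables is the year filter around YEAR_HIST_PROJ. A minimal sketch of that MultiIndex filter, using toy data for illustration only:

import pandas as pd

YEAR_HIST_PROJ = 2021
df = pd.DataFrame(
    {"life_expectancy_0": [82.0, 82.1, 82.3]},
    index=pd.MultiIndex.from_tuples(
        [("France", 2020), ("France", 2021), ("France", 2022)], names=["country", "year"]
    ),
)
historical = df[df.index.get_level_values("year") <= YEAR_HIST_PROJ]  # keeps 2020 and 2021
projection = df[df.index.get_level_values("year") > YEAR_HIST_PROJ]  # keeps 2022
assert len(historical) == 2 and len(projection) == 1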
- """ - - def _get_metadata_cold_start(short_name): - return TableMeta(short_name=short_name, title=f"Life Expectancy (various sources) - {short_name.capitalize()}") - - def _get_metadata(tb, short_name): - tb.columns = [f"{col}_{short_name[:4]}" for col in tb.columns] - tb.update_metadata_from_yaml(N.metadata_path, short_name) - return tb - - if COLD_START: - if only_projections: - tb.metadata = _get_metadata_cold_start("projection") - elif only_historical: - tb.metadata = _get_metadata_cold_start("historical") - else: - tb.metadata = TableMeta(short_name=SHORT_NAME, title="Life Expectancy (various sources)") - else: - if only_projections: - tb = _get_metadata(tb, "projection") - elif only_historical: - tb = _get_metadata(tb, "historical") - else: - tb.update_metadata_from_yaml(N.metadata_path, SHORT_NAME) - return tb - - -def make_metadata(all_ds: List[Dataset]) -> DatasetMeta: - """Create metadata for the dataset.""" - log.info("life_expectancy: creating metadata") - # description - description = "------\n\n" - for ds in all_ds: - description += f"{ds.metadata.title}:\n\n{ds.metadata.description}\n\n------\n\n" - description = ( - "This dataset has been created using multiple sources. We use UN WPP for data since 1950 (estimates and medium" - " variant). Prior to that, other sources are combined.\n\n" + description - ) - - # sources - sources = [source for ds in all_ds for source in ds.sources] - # licenses - licenses_ = [license_ for ds in all_ds for license_ in ds.licenses] - - # metadata object - metadata = DatasetMeta( - namespace="demography", - short_name="life_expectancy", - title="Life Expectancy (various sources)", - description=description, - sources=sources, - licenses=licenses_, - version="2022-11-30", - ) - return metadata - - -def load_wpp(ds: Dataset) -> pd.DataFrame: - """Load data from WPP 2022 dataset. - - It loads medium variant for future projections. - - Output has the following columns: country, year, life_expectancy_0, life_expectancy_15, life_expectancy_65, life_expectancy_80. - """ - log.info("life_expectancy: loading wpp data") - # Load table - df = ds["un_wpp"] - df = df.reset_index() - # Filter relevant rows - df = df.loc[ - (df["metric"] == "life_expectancy") - & (df["age"].isin(["at birth"] + AGES_EXTRA)) - & (df["variant"].isin(["medium", "estimates"])) - & (df["sex"] == "all") - ] - # Change age group from 'at birth' to '0' - df = df.assign(age=df["age"].replace({"at birth": "0"})) - # Pivot and set column names - df = df.rename(columns={"location": "country"}) - df = df.pivot(index=COLUMNS_IDX, columns=["age"], values="value") - df.columns = [f"life_expectancy_{col}" for col in df.columns] - df = df.reset_index() - return df - - -def load_hmd(ds: Dataset) -> pd.DataFrame: - """Load data from HMD dataset. - - Output has the following columns: country, year, life_expectancy_15, life_expectancy_65, life_expectancy_80. 
- """ - log.info("life_expectancy: loading hmd data") - df = ds["period_1x1"] - df = df.reset_index() - # Filter - df = df.loc[ - (df["age"].isin(AGES_EXTRA)) & (df["year"].astype(int) < YEAR_WPP_START), - COLUMNS_IDX + ["age", "life_expectancy"], - ] - # Pivot and set column names - df = df.pivot(index=COLUMNS_IDX, columns=["age"], values="life_expectancy") - df.columns = [f"life_expectancy_{col}" for col in df.columns] - df = df.reset_index() - # Correct values: expected years ahead => expected total years lived - df = df.assign( - life_expectancy_15=df["life_expectancy_15"] + 15, - life_expectancy_65=df["life_expectancy_65"] + 65, - life_expectancy_80=df["life_expectancy_80"] + 80, - ) - return df - - -def load_zijdeman(ds: Dataset) -> pd.DataFrame: - """Load data from Zijdeman et al. (2015) dataset. - - Output has the following columns: country, year, life_expectancy_0. - """ - log.info("life_expectancy: loading zijdeman 2015 data") - df = ds["zijdeman_et_al_2015"].reset_index() - # Filter - df = df[df["year"] < YEAR_WPP_START] - # Rename columns, drop columns - columns_rename = { - "country": "country", - "year": "year", - "life_expectancy": "life_expectancy_0", - } - df = df[columns_rename.keys()].rename(columns=columns_rename) - return df - - -def load_riley(ds: Dataset) -> pd.DataFrame: - """Load data from Zijdeman et al. (2015) dataset. - - Output has the following columns: country, year, life_expectancy_0. - """ - log.info("life_expectancy: loading riley 2005 data") - df = ds["riley_2005"].reset_index() - # Filter - df = df[df["year"] < YEAR_WPP_START] - # Rename columns, drop columns - columns_rename = { - "entity": "country", - "year": "year", - "life_expectancy": "life_expectancy_0", - } - df = df[columns_rename.keys()].rename(columns=columns_rename) - return df - - -def merge_dfs(df_wpp: pd.DataFrame, df_hmd: pd.DataFrame, df_zij: pd.DataFrame, df_ril: pd.DataFrame) -> pd.DataFrame: - """Merge all involved dataframes into a single one. - - - Life expectancy at birth is taken from UN WPP, Zijdeman et al. (2015) and Riley (2005) datasets. - - Life expectancy at X is taken from UN WPP and HMD datasets. - """ - log.info("life_expectancy: merging dataframes") - # Merge with HMD - df = pd.concat([df_wpp, df_hmd], ignore_index=True) - # # Merge with Zijdeman et al. 
(2015) - column_og = "life_expectancy_0" - suffix = "_zij" - column_extra = f"{column_og}{suffix}" - df = df.merge(df_zij, how="outer", on=COLUMNS_IDX, suffixes=("", suffix)) - df = df.assign(life_expectancy_0=df[column_og].fillna(df[column_extra])).drop(columns=[column_extra]) - - # Merge with Riley (2005) - assert not set(df.loc[df["year"] <= df_ril["year"].max(), "country"]).intersection( - set(df_ril["country"]) - ), "There is some overlap between the dataset and Riley (2005) dataset" - df = pd.concat([df, df_ril], ignore_index=True) - - # add region aggregates - # df = add_region_aggregates(df) - - # Rename regions - df["country"] = df["country"].replace(REGION_MAPPING) - - # add americas for >1950 using UN WPP data - df = add_americas(df) - - # Dtypes, row sorting - df = df.astype({"year": int}) - df = df.set_index(COLUMNS_IDX, verify_integrity=True).sort_index() - df = df.dropna(how="all", axis=0) - - # Rounding resolution - # We round to 2 decimals - rounding = 1e2 - df = ((df * rounding).round().fillna(-1).astype(int) / rounding).astype("float") - for col in df.columns: - if col not in COLUMNS_IDX: - df.loc[df[col] < 0, col] = pd.NA - return df - - -def add_americas(frame: pd.DataFrame) -> pd.DataFrame: - """Estimate value for the Americas using North America and LATAM/Caribbean.""" - # filter only member countries of the region - region_members = ["Northern America", "Latin America and the Caribbean"] - df = frame.loc[frame["country"].isin(region_members)].copy() - # add population for LATAM and Northern America (from WPP, hence since 1950) - assert df["year"].min() == YEAR_WPP_START - df = add_population_americas_from_wpp(df) - # sanity check: ensure there are NO missing values. This way, we can safely do the groupby - assert (df.isna().sum() == 0).all() - # estimate values for regions - # y(country) = weight(country) * metric(country) - df["life_expectancy_0"] *= df["population"] - df["life_expectancy_15"] *= df["population"] - df["life_expectancy_65"] *= df["population"] - df["life_expectancy_80"] *= df["population"] - # z(region) = sum{ y(country) } for country in region - df = df.groupby("year", as_index=False).sum(numeric_only=True) - # z(region) / sum{ population(country) } for country in region - df["life_expectancy_0"] /= df["population"] - df["life_expectancy_15"] /= df["population"] - df["life_expectancy_65"] /= df["population"] - df["life_expectancy_80"] /= df["population"] - - # assign region name - df = df.assign(country="Americas") - # concatenate - df = pd.concat([frame, df]).sort_values(["country", "year"], ignore_index=True).drop(columns="population") - return df - - -def add_region_aggregates(frame: pd.DataFrame) -> pd.DataFrame: - """Add life expectancy for continents. - - This function is currently not in use, but might be useful in the future. 
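The aggregation used in add_americas (and in the currently unused add_region_aggregates below) is a population-weighted mean: multiply each member's life expectancy by its population, sum within the region, then divide by the summed population. A minimal sketch with rough toy numbers; the real step takes member populations from UN WPP:

import pandas as pd

df = pd.DataFrame(
    {
        "country": ["Northern America", "Latin America and the Caribbean"],
        "year": [1950, 1950],
        "life_expectancy_0": [68.0, 51.0],
        "population": [172e6, 168e6],  # toy values, not real estimates
    }
)
# y(country) = population * metric, then z(region) = sum(y) / sum(population).
df["weighted"] = df["life_expectancy_0"] * df["population"]
region = df.groupby("year").sum(numeric_only=True)
region["life_expectancy_0"] = region["weighted"] / region["population"]
print(region["life_expectancy_0"])  # ~59.6 years for "Americas" in 1950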
- """ - log.info("life_expectancy: adding region aggregates") - # new regions - regions_new = [ - "Europe", - "Oceania", - "Asia", - "Africa", - "North America", - "South America", - ] - # remove regions - regions_ignore = [ - "Africa", - # "Africa (UN)", - "Asia", - # "Asia (UN)", - "Europe", - # "Europe (UN)", - # "Latin America and the Caribbean (UN)", - "North America", - # "Northern America (UN)", - "Oceania", - # "Oceania (UN)", - "South America", - ] - frame = frame.loc[-frame["country"].isin(regions_ignore)] - # add population - df = geo.add_population_to_dataframe(frame.copy()) - - # estimate values for regions - # y(country) = weight(country) * metric(country) - df["life_expectancy_0"] *= df["population"] - df["life_expectancy_15"] *= df["population"] - df["life_expectancy_65"] *= df["population"] - df["life_expectancy_80"] *= df["population"] - # z(region) = sum{ y(country) } for country in region - for region in regions_new: - df = geo.add_region_aggregates(df, region=region) - df = df[df["country"].isin(regions_new)] - # z(region) / sum{ population(country) } for country in region - df["life_expectancy_0"] /= df["population"] - df["life_expectancy_15"] /= df["population"] - df["life_expectancy_65"] /= df["population"] - df["life_expectancy_80"] /= df["population"] - - # concatenate - df = pd.concat([frame, df]).sort_values(["country", "year"], ignore_index=True).drop(columns="population") - return df - - -def add_population_americas_from_wpp(df: pd.DataFrame): - """Add population values for LATAM and Northern America. - - Data is sourced from UN WPP, hence only available since 1950. - """ - pop = load_america_population_from_unwpp() - df = df.merge(pop, on=["country", "year"]) - return df - - -def load_america_population_from_unwpp(): - """Load population data from UN WPP for Northern America and Latin America and the Caribbean. - - We use this dataset instead of the long-run because we want the entities as defined by the UN. - """ - # load population from WPP - locations = ["Latin America and the Caribbean (UN)", "Northern America (UN)"] - ds = Dataset(GARDEN_POPULATION_WPP) - df = ds["population"].reset_index() - df = df.loc[ - (df["location"].isin(locations)) - & (df["metric"] == "population") - & (df["sex"] == "all") - & (df["age"] == "all") - & (df["variant"].isin(["estimates", "medium"])), - ["location", "year", "value"], - ] - assert len(set(df["location"])) == 2, f"Check that all of {locations} are in df" - df["location"] = df["location"].replace(REGION_MAPPING) - - # rename columns - df = df.rename(columns={"location": "country", "value": "population"}) - return df - - -def make_hist_events_table() -> Table: - log.info("life_expectancy: making 'historical events' table") - # Load historical events yaml file - with open(PATH_HIST_EVENTS) as f: - hist_events = yaml.safe_load(f) - # store all yaml's content as a string in a cell in the table - df = pd.DataFrame({"hist_events": [str(hist_events)]}) - tb = Table(df) - # add metadata - tb.metadata = TableMeta( - short_name="_hist_events", - description=( - "this table contains a list of historical events that likely caused data anomalies for the life expectancy" - " data in YAML format." 
- ), - ) - return tb diff --git a/etl/steps/archive/garden/demography/2023-04-14/population_density.meta.yml b/etl/steps/archive/garden/demography/2023-04-14/population_density.meta.yml deleted file mode 100644 index 3e8475fa9e5..00000000000 --- a/etl/steps/archive/garden/demography/2023-04-14/population_density.meta.yml +++ /dev/null @@ -1,2 +0,0 @@ -dataset: {} -tables: {} diff --git a/etl/steps/archive/garden/demography/2023-04-14/population_density.py b/etl/steps/archive/garden/demography/2023-04-14/population_density.py deleted file mode 100644 index d140597c3c3..00000000000 --- a/etl/steps/archive/garden/demography/2023-04-14/population_density.py +++ /dev/null @@ -1,136 +0,0 @@ -"""Build population density OMM dataset. - -This dataset is built using our population OMM dataset and the land area given by FAOSTAT (RL): - - `population_density = population / land_area` -""" - -import pandas as pd -from owid.catalog import Dataset, DatasetMeta, Table, VariableMeta -from structlog import get_logger - -from etl.helpers import PathFinder, create_dataset - -log = get_logger() - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - log.info("population_density: start") - - # - # Load inputs. - # - # Load dependency datasets. - ds_population: Dataset = paths.load_dependency("population") - ds_land_area: Dataset = paths.load_dependency("faostat_rl") - - # Read relevant tables - tb_population = ds_population["population"] - tb_land_area = ds_land_area["faostat_rl_flat"] - - # - # Process data. - # - tb = make_table(tb_population, tb_land_area) - - # - # Save outputs. - # - # Create a new garden dataset with the same metadata as the meadow dataset. - ds_garden = create_dataset( - dest_dir, - tables=[tb], - default_metadata=build_metadata(ds_population, ds_land_area), - ) - # Save changes in the new garden dataset. 
- ds_garden.save() - - log.info("population_density: end") - - -def make_table(tb_population: Table, tb_land_area: Table) -> Table: - """Create a table with population density data.""" - # Dataframe population - df_population = pd.DataFrame(tb_population).reset_index() - # Dataframe land area - # We use land area of countries as they are defined today (latest reported value) - log.info("population_density: process land area dataframe") - column_area = "land_area__00006601__area__005110__hectares" - df_land_area = ( - pd.DataFrame(tb_land_area)[[column_area]] - .reset_index() - .rename(columns={column_area: "area"}) - .sort_values(["country", "year"]) - .drop_duplicates(subset=["country"], keep="last") - .drop(columns=["year"]) - ) - - # Merge dataframes - log.info("population_density: merge dataframes") - df = df_population.merge(df_land_area, on="country", how="inner") - # Drop NaN (no data for area) - df = df.dropna(subset=["area"]) - # Estimate population density as population / land_area (in km2) - df["population_density"] = df["population"] / (0.01 * df["area"])  # 0.01 to convert from hectares to km2 - # Rename column source -> source_population - df = df.rename(columns={"source": "source_population"}) - # Select relevant columns, order them, set index - df = df[["country", "year", "population_density", "source_population"]].set_index(["country", "year"]).sort_index() - - # Build table - log.info("population_density: build table") - tb = Table(df, short_name=paths.short_name) - - # Define variable metadata - log.info("population_density: define variable metadata") - tb.population_density.metadata = VariableMeta( - title="Population density", - description=( - "Population density estimated by Our World in Data using population estimates from multiple sources " - "and land area estimates by the Food and Agriculture Organization of the United Nations. We obtain it " - "by dividing the population estimates by the land area estimates.\n\n" - + tb_population.population.metadata.description - ), - unit="people per km²", - ) - tb.source_population.metadata = VariableMeta( - title="Source (population)", - description=( - "Name of the source of the population estimate for a specific data point (country-year). The name includes a short name of the source and a link." - ), - unit="", - ) - return tb - - -def build_metadata(ds_population: Dataset, ds_land_area: Dataset) -> DatasetMeta: - """Generate metadata for the dataset based on the metadata from `ds_population` and `ds_land_area`. - - Parameters - ---------- - ds_population : Dataset - Dataset with population estimates. - ds_land_area : Dataset - Dataset with land area estimates. - - Returns - ------- - DatasetMeta - Dataset metadata.
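The unit handling in make_table is worth spelling out: FAOSTAT reports land area in hectares, and 1 hectare is 0.01 km², so dividing population by (0.01 * area) yields people per km². A tiny worked example with made-up numbers:

population = 47_000_000  # people (toy value)
area_hectares = 50_000_000  # hectares, i.e. 500,000 km² (toy value)
population_density = population / (0.01 * area_hectares)
assert population_density == 94.0  # people per km²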
- """ - log.info("population_density: add metadata") - return DatasetMeta( - channel=paths.channel, - namespace=paths.namespace, - short_name=paths.short_name, - title="Population density (various sources, 2023.1)", - description=( - "Population density is obtained by dividing population by land area.\n\n" - + ds_population.metadata.description - ), - sources=ds_population.metadata.sources + ds_land_area.metadata.sources, - licenses=ds_population.metadata.licenses + ds_land_area.metadata.licenses, - ) diff --git a/etl/steps/archive/garden/eia/2022-07-27/energy_consumption.countries.json b/etl/steps/archive/garden/eia/2022-07-27/energy_consumption.countries.json deleted file mode 100644 index 0df2c12f811..00000000000 --- a/etl/steps/archive/garden/eia/2022-07-27/energy_consumption.countries.json +++ /dev/null @@ -1,260 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Africa": "Africa", - "Albania": "Albania", - "Algeria": "Algeria", - "American Samoa": "American Samoa", - "Angola": "Angola", - "Antarctica": "Antarctica", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "British Virgin Islands": "British Virgin Islands", - "Brunei": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burma": "Myanmar", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cayman Islands": "Cayman Islands", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo-Brazzaville": "Congo", - "Congo-Kinshasa": "Democratic Republic of Congo", - "Cook Islands": "Cook Islands", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cyprus": "Cyprus", - "Czech Republic": "Czechia", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Europe": "Europe", - "Falkland Islands": "Falkland Islands", - "Faroe Islands": "Faroe Islands", - "Fiji": "Fiji", - "Finland": "Finland", - "Former Czechoslovakia": "Czechoslovakia", - "Former Serbia and Montenegro": "Serbia and Montenegro", - "Former U.S.S.R.": "USSR", - "Former Yugoslavia": "Yugoslavia", - "France": "France", - "French Guiana": "French Guiana", - "French Polynesia": "French Polynesia", - "Gabon": "Gabon", - "Gambia, The": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Germany, East": "East Germany (EIA)", - "Germany, West": "West Germany (EIA)", - "Ghana": "Ghana", - "Gibraltar": "Gibraltar", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guam": "Guam", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": 
"Honduras", - "Hong Kong": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Kosovo": "Kosovo", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Laos": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macau": "Macao", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mexico": "Mexico", - "Micronesia": "Micronesia (country)", - "Moldova": "Moldova", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "Netherlands Antilles": "Netherlands Antilles", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "North America": "North America", - "North Korea": "North Korea", - "North Macedonia": "North Macedonia", - "Northern Mariana Islands": "Northern Mariana Islands", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palestinian Territories": "Palestine", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Reunion": "Reunion", - "Romania": "Romania", - "Russia": "Russia", - "Rwanda": "Rwanda", - "Saint Helena": "Saint Helena", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", - "Saint Vincent/Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Korea": "South Korea", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syria": "Syria", - "Taiwan": "Taiwan", - "Tajikistan": "Tajikistan", - "Tanzania": "Tanzania", - "Thailand": "Thailand", - "The Bahamas": "Bahamas", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Turks and Caicos Islands": "Turks and Caicos Islands", - "Tuvalu": "Tuvalu", - "U.S. 
Virgin Islands": "United States Virgin Islands", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United States": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela": "Venezuela", - "Vietnam": "Vietnam", - "Western Sahara": "Western Sahara", - "World": "World", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "Asia & Oceania": "Asia & Oceania (EIA)", - "Australia and New Zealand": "Australia and New Zealand (EIA)", - "Central & South America": "Central & South America (EIA)", - "Cote d?Ivoire": "Cote d'Ivoire", - "Eurasia": "Eurasia (EIA)", - "European Union": "European Union (EIA)", - "Hawaiian Trade Zone": "Hawaiian Trade Zone (EIA)", - "IEO - Africa": "IEO - Africa (EIA)", - "IEO - Middle East": "IEO - Middle East (EIA)", - "IEO OECD - Europe": "IEO OECD - Europe (EIA)", - "Mexico, Chile, and other OECD Americas": "Mexico, Chile, and other OECD Americas (EIA)", - "Middle East": "Middle East (EIA)", - "Non-OECD": "Non-OECD (EIA)", - "Non-OPEC": "Non-OPEC (EIA)", - "OECD": "OECD (EIA)", - "OECD - Asia And Oceania": "OECD - Asia And Oceania (EIA)", - "OECD - Europe": "OECD - Europe (EIA)", - "OECD - North America": "OECD - North America (EIA)", - "OPEC": "OPEC (EIA)", - "OPEC - Africa": "OPEC - Africa (EIA)", - "OPEC - South America": "OPEC - South America (EIA)", - "Other Non-OECD - America": "Other Non-OECD - America (EIA)", - "Other Non-OECD - Asia": "Other Non-OECD - Asia (EIA)", - "Other Non-OECD - Europe and Eurasia": "Other Non-OECD - Europe and Eurasia (EIA)", - "Persian Gulf": "Persian Gulf (EIA)", - "South Korea and other OECD Asia": "South Korea and other OECD Asia (EIA)", - "U.S. Pacific Islands": "U.S. Pacific Islands (EIA)", - "U.S. Territories": "U.S. Territories (EIA)", - "Wake Island": "Wake Island (EIA)" -} diff --git a/etl/steps/archive/garden/eia/2022-07-27/energy_consumption.meta.yml b/etl/steps/archive/garden/eia/2022-07-27/energy_consumption.meta.yml deleted file mode 100644 index ea8551ecde8..00000000000 --- a/etl/steps/archive/garden/eia/2022-07-27/energy_consumption.meta.yml +++ /dev/null @@ -1,24 +0,0 @@ -dataset: - namespace: eia - version: 2022-07-27 - title: Total energy consumption (EIA, 2022) - short_name: energy_consumption - description: | - Total energy consumption, extracted from EIA's international energy data from the EIA, downloaded using their [Bulk Download Facility](https://www.eia.gov/opendata/bulkfiles.php). - - EIA's region definitions sometimes differ from Our World in Data's definitions. For example, in EIA's data, Russia is not included in Europe, whereas Our World in Data includes Russia in Europe (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like "Europe (EIA)" to refer to EIA's original data using their definition of the region, as well as "Europe", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the contributions from the countries in the region. - sources: - - name: Our World in Data based on EIA's total energy consumption (2022) - published_by: U.S. 
Energy Information Administration (EIA) - publication_year: 2022 - date_accessed: 2022-07-27 - url: https://www.eia.gov/opendata/bulkfiles.php -tables: - energy_consumption: - variables: - energy_consumption: - title: Total energy consumption (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total energy consumption diff --git a/etl/steps/archive/garden/eia/2022-07-27/energy_consumption.py b/etl/steps/archive/garden/eia/2022-07-27/energy_consumption.py deleted file mode 100644 index 2b69b9b29e4..00000000000 --- a/etl/steps/archive/garden/eia/2022-07-27/energy_consumption.py +++ /dev/null @@ -1,82 +0,0 @@ -"""Garden step for EIA total energy consumption. - -""" - -import pandas as pd -from owid import catalog -from owid.catalog.utils import underscore_table -from shared import ( - CURRENT_DIR, - NAMESPACE, - OVERLAPPING_DATA_TO_REMOVE_IN_AGGREGATES, - REGIONS_TO_ADD, - VERSION, - add_region_aggregates, - log, -) - -from etl.data_helpers import geo -from etl.paths import DATA_DIR - -DATASET_SHORT_NAME = "energy_consumption" -# Path to country mapping file. -COUNTRY_MAPPING_PATH = CURRENT_DIR / "energy_consumption.countries.json" -# Path to metadata file. -METADATA_PATH = CURRENT_DIR / "energy_consumption.meta.yml" - -# Conversion factor from terajoules to terawatt-hours. -TJ_TO_TWH = 1 / 3600 - - -def run(dest_dir: str) -> None: - log.info(f"{DATASET_SHORT_NAME}.start") - # - # Load data. - # - # Load meadow dataset and get the only table inside (with the same name). - ds_meadow = catalog.Dataset(DATA_DIR / f"meadow/{NAMESPACE}/{VERSION}/{DATASET_SHORT_NAME}") - tb_meadow = ds_meadow[DATASET_SHORT_NAME] - - # Convert table into a dataframe. - df = pd.DataFrame(tb_meadow).reset_index() - - # - # Process data. - # - # Harmonize country names. - log.info(f"{DATASET_SHORT_NAME}.harmonize_countries") - df = geo.harmonize_countries(df=df, countries_file=str(COUNTRY_MAPPING_PATH)) - - # Convert terajoules to terawatt-hours. - df["energy_consumption"] = df["values"] * TJ_TO_TWH - df = df.drop(columns=["values", "members"]) - - # Create aggregate regions. - log.info(f"{DATASET_SHORT_NAME}.add_region_aggregates") - df = add_region_aggregates( - data=df, - regions=list(REGIONS_TO_ADD), - index_columns=["country", "year"], - known_overlaps=OVERLAPPING_DATA_TO_REMOVE_IN_AGGREGATES, # type: ignore - ) - - # Prepare output data. - df = df.set_index(["country", "year"], verify_integrity=True).sort_index() - - # - # Save outputs. - # - # Create a new garden dataset (with the same metadata as the meadow version). - ds_garden = catalog.Dataset.create_empty(dest_dir) - # ds_garden.metadata = ds_meadow.metadata - ds_garden.metadata.update_from_yaml(METADATA_PATH) - ds_garden.save() - - # Create a new table. - tb_garden = underscore_table(catalog.Table(df)) - tb_garden.metadata = tb_meadow.metadata - tb_garden.update_metadata_from_yaml(METADATA_PATH, DATASET_SHORT_NAME) - # Add table to dataset. 
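The conversion factor in this step follows from 1 TWh = 3600 TJ (a watt-hour is 3600 joules, and the tera- prefix cancels on both sides), hence TJ_TO_TWH = 1 / 3600. A quick check:

TJ_TO_TWH = 1 / 3600
assert 3600 * TJ_TO_TWH == 1.0  # 3600 TJ is exactly 1 TWh
print(f"{7_200_000 * TJ_TO_TWH:,.0f} TWh")  # 7.2 million TJ -> 2,000 TWh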
- ds_garden.add(tb_garden) - - log.info(f"{DATASET_SHORT_NAME}.end") diff --git a/etl/steps/archive/garden/eia/2022-07-27/shared.py b/etl/steps/archive/garden/eia/2022-07-27/shared.py deleted file mode 100644 index 279f4017536..00000000000 --- a/etl/steps/archive/garden/eia/2022-07-27/shared.py +++ /dev/null @@ -1,593 +0,0 @@ -from pathlib import Path -from typing import Dict, List, Optional, Union, cast - -import numpy as np -import pandas as pd -from owid import catalog -from structlog import get_logger - -from etl.data_helpers import geo -from etl.paths import DATA_DIR - -log = get_logger() - -NAMESPACE = "eia" -CURRENT_DIR = Path(__file__).parent -VERSION = CURRENT_DIR.name - - -# Aggregate regions to add, following OWID definitions. -REGIONS_TO_ADD = { - "North America": { - "country_code": "OWID_NAM", - }, - "South America": { - "country_code": "OWID_SAM", - }, - "Europe": { - "country_code": "OWID_EUR", - }, - "European Union (27)": { - "country_code": "OWID_EU27", - }, - "Africa": { - "country_code": "OWID_AFR", - }, - "Asia": { - "country_code": "OWID_ASI", - }, - "Oceania": { - "country_code": "OWID_OCE", - }, - "Low-income countries": { - "country_code": "OWID_LIC", - }, - "Upper-middle-income countries": { - "country_code": "OWID_UMC", - }, - "Lower-middle-income countries": { - "country_code": "OWID_LMC", - }, - "High-income countries": { - "country_code": "OWID_HIC", - }, -} - -# Additional countries to include in region aggregates. -ADDITIONAL_COUNTRIES_IN_REGIONS: Dict[str, List[str]] = {} -# ADDITIONAL_COUNTRIES_IN_REGIONS = { -# "Africa": [ -# "Other Africa (BP)", -# ], -# } - -# When creating region aggregates, decide how to distribute historical regions. -# The following decisions are based on the current location of the countries that succeeded the region, and their income -# group. The continent and income group assigned correspond to those of the majority of the population -# in the member countries. -HISTORIC_TO_CURRENT_REGION: Dict[str, Dict[str, Union[str, List[str]]]] = { - "Czechoslovakia": { - "continent": "Europe", - "income_group": "High-income countries", - "members": [ - # Europe - High-income countries. - "Czechia", - "Slovakia", - ], - }, - "Netherlands Antilles": { - "continent": "North America", - "income_group": "High-income countries", - "members": [ - # North America - High-income countries. - "Aruba", - "Curacao", - "Sint Maarten (Dutch part)", - ], - }, - "Serbia and Montenegro": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "members": [ - # Europe - Upper-middle-income countries. - "Serbia", - "Montenegro", - ], - }, - "USSR": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "members": [ - # Europe - High-income countries. - "Lithuania", - "Estonia", - "Latvia", - # Europe - Upper-middle-income countries. - "Moldova", - "Belarus", - "Russia", - # Europe - Lower-middle-income countries. - "Ukraine", - # Asia - Upper-middle-income countries. - "Georgia", - "Armenia", - "Azerbaijan", - "Turkmenistan", - "Kazakhstan", - # Asia - Lower-middle-income countries. - "Kyrgyzstan", - "Uzbekistan", - "Tajikistan", - ], - }, - "Yugoslavia": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "members": [ - # Europe - High-income countries. - "Croatia", - "Slovenia", - # Europe - Upper-middle-income countries.
- "North Macedonia", - "Bosnia and Herzegovina", - "Serbia", - "Montenegro", - ], - }, -} - -# List of known overlaps between regions and member countries (or successor countries). -OVERLAPPING_DATA_TO_REMOVE_IN_AGGREGATES = [ - { - "region": "Netherlands Antilles", - "member": "Aruba", - "entity_to_make_nan": "region", - "years": [ - 1986, - 1987, - 1988, - 1989, - 1990, - 1991, - 1992, - 1993, - 1994, - 1995, - 1996, - 1997, - 1998, - 1999, - 2000, - 2001, - 2002, - 2003, - 2004, - 2005, - 2006, - 2007, - 2008, - 2009, - 2010, - 2011, - 2012, - 2013, - 2014, - 2015, - 2016, - 2017, - 2018, - 2019, - ], - "variable": "energy_consumption", - } -] - - -def load_population() -> pd.DataFrame: - """Load OWID population dataset, and add historical regions to it. - - Returns - ------- - population : pd.DataFrame - Population dataset. - - """ - # Load population dataset. - population = catalog.Dataset(DATA_DIR / "garden/owid/latest/key_indicators/")["population"].reset_index()[ - ["country", "year", "population"] - ] - - # Add data for historical regions (if not in population) by adding the population of its current successors. - countries_with_population = population["country"].unique() - missing_countries = [country for country in HISTORIC_TO_CURRENT_REGION if country not in countries_with_population] - for country in missing_countries: - members = HISTORIC_TO_CURRENT_REGION[country]["members"] - _population = ( - population[population["country"].isin(members)] - .groupby("year") - .agg({"population": "sum", "country": "nunique"}) - .reset_index() - ) - # Select only years for which we have data for all member countries. - _population = _population[_population["country"] == len(members)].reset_index(drop=True) - _population["country"] = country - population = pd.concat([population, _population], ignore_index=True).reset_index(drop=True) - - error = "Duplicate country-years found in population. Check if historical regions changed." - assert population[population.duplicated(subset=["country", "year"])].empty, error - - return cast(pd.DataFrame, population) - - -def load_income_groups() -> pd.DataFrame: - """Load dataset of income groups and add historical regions to it. - - Returns - ------- - income_groups : pd.DataFrame - Income groups data. - - """ - # Load the WorldBank dataset for income grups. - income_groups = catalog.Dataset(DATA_DIR / "garden/wb/2021-07-01/wb_income")["wb_income_group"].reset_index() - - # Add historical regions to income groups. - for historic_region in HISTORIC_TO_CURRENT_REGION: - historic_region_income_group = HISTORIC_TO_CURRENT_REGION[historic_region]["income_group"] - if historic_region not in income_groups["country"]: - historic_region_df = pd.DataFrame( - { - "country": [historic_region], - "income_group": [historic_region_income_group], - } - ) - income_groups = pd.concat([income_groups, historic_region_df], ignore_index=True) - - return cast(pd.DataFrame, income_groups) - - -def add_population( - df: pd.DataFrame, - country_col: str = "country", - year_col: str = "year", - population_col: str = "population", - warn_on_missing_countries: bool = True, - show_full_warning: bool = True, -) -> pd.DataFrame: - """Add a column of OWID population to the countries in the data, including population of historical regions. - - This function has been adapted from datautils.geo, because population currently does not include historic regions. - We include them in this function. 
- - Parameters - ---------- - df : pd.DataFrame - Data without a column for population (after harmonizing elements, items and country names). - country_col : str - Name of country column in data. - year_col : str - Name of year column in data. - population_col : str - Name for new population column in data. - warn_on_missing_countries : bool - True to warn if population is not found for any of the countries in the data. - show_full_warning : bool - True to show affected countries if the previous warning is raised. - - Returns - ------- - df_with_population : pd.DataFrame - Data after adding a column for population for all countries in the data. - - """ - - # Load population dataset. - population = load_population().rename( - columns={ - "country": country_col, - "year": year_col, - "population": population_col, - } - )[[country_col, year_col, population_col]] - - # Check if there is any missing country. - missing_countries = set(df[country_col]) - set(population[country_col]) - if len(missing_countries) > 0: - if warn_on_missing_countries: - geo.warn_on_list_of_entities( - list_of_entities=missing_countries, - warning_message=( - f"{len(missing_countries)} countries not found in population" - " dataset. They will remain in the dataset, but have nan" - " population." - ), - show_list=show_full_warning, - ) - - # Add population to original dataframe. - df_with_population = pd.merge(df, population, on=[country_col, year_col], how="left") - - return df_with_population - - -def detect_overlapping_data_for_regions_and_members( - df: pd.DataFrame, - index_columns: List[str], - regions_and_members: Dict[str, Dict[str, Union[str, List[str]]]], - known_overlaps: Optional[List[Dict[str, Union[str, List[int]]]]], - ignore_zeros: bool = True, -) -> None: - """Raise a warning if there is data for a particular region and for a country that is a member of that region. - - For example, if there is data for USSR and Russia on the same years, a warning will be raised. - - Parameters - ---------- - df : pd.DataFrame - Data. - index_columns : list - Names of columns that should be index of the data. - regions_and_members : dict - Regions and members (where each key corresponds to a region, and each region is a dictionary of various keys, - one of which is 'members', which is a list of member countries). - known_overlaps : list or None - Instances of known overlaps in the data. If this function raises a warning, new instances should be added to the - list. - ignore_zeros : bool - True to consider zeros in the data as missing values. Doing this, if a region has overlapping data with a member - country, but one of their data points is zero, it will not be considered an overlap. - - """ - if known_overlaps is not None: - df = df.copy() - - if ignore_zeros: - # Replace zeros by nans, so that zeros are ignored when looking for overlapping data. - overlapping_values_to_ignore = [0] - else: - overlapping_values_to_ignore = [] - - regions = list(regions_and_members) - for region in regions: - # Create a dataframe with only data for the region, and remove columns that only have nans. - # Optionally, replace zeros by nans, to also remove columns that only have zeros or nans. - region_df = ( - df[df["country"] == region].replace(overlapping_values_to_ignore, np.nan).dropna(axis=1, how="all") - ) - members = regions_and_members[region]["members"] - for member in members: - # Create a dataframe for this particular member country. 
- member_df = ( - df[df["country"] == member].replace(overlapping_values_to_ignore, np.nan).dropna(axis=1, how="all") - ) - # Find common columns with (non-nan) data between region and member country. - variables = [ - column - for column in (set(region_df.columns) & set(member_df.columns)) - if column not in index_columns - ] - for variable in variables: - # Concatenate region and member country's data for this variable. - combined = ( - pd.concat( - [ - region_df[["year", variable]], - member_df[["year", variable]], - ], - ignore_index=True, - ) - .dropna() - .reset_index(drop=True) - ) - # Find years where region and member country overlap. - overlapping = combined[combined.duplicated(subset="year")] - if not overlapping.empty: - overlapping_years = sorted(set(overlapping["year"])) - new_overlap = { - "region": region, - "member": member, - "years": overlapping_years, - "variable": variable, - } - # Check if the overlap found is already in the list of known overlaps. - # If this overlap is not known, raise a warning. - # Omit the field "entity_to_make_nan" when checking if this overlap is known. - _known_overlaps = [ - {key: overlap[key] for key in overlap if key != "entity_to_make_nan"} for overlap in known_overlaps - ] - if new_overlap not in _known_overlaps: - log.warning( - f"Data for '{region}' overlaps with '{member}' on '{variable}' " - f"and years: {overlapping_years}" - ) - - -def remove_overlapping_data_for_regions_and_members( - df: pd.DataFrame, - known_overlaps: Optional[List[Dict[str, Union[str, List[int]]]]], - country_col: str = "country", - year_col: str = "year", - ignore_zeros: bool = True, -) -> pd.DataFrame: - """Check that the known overlaps between a region (e.g. a historical region like the USSR) and a member country (or - a successor country, like Russia) do occur in the data, and remove them. - - Parameters - ---------- - df : pd.DataFrame - Data. - known_overlaps : list or None - List of known overlaps between region and member country. - country_col : str - Name of country column. - year_col : str - Name of year column. - ignore_zeros : bool - True to ignore columns of zeros when checking if known overlaps are indeed overlaps. - - Returns - ------- - df : pd.DataFrame - Data after removing known overlapping rows between a region and a member country. - - """ - if known_overlaps is not None: - df = df.copy() - - if ignore_zeros: - overlapping_values_to_ignore = [0] - else: - overlapping_values_to_ignore = [] - - for i, overlap in enumerate(known_overlaps): - if set([overlap["region"], overlap["member"]]) <= set(df["country"]): - # Check that the known overlap is indeed found in the data. - duplicated_rows = ( - df[(df[country_col].isin([overlap["region"], overlap["member"]]))][ - [country_col, year_col, overlap["variable"]] - ] - .replace(overlapping_values_to_ignore, np.nan) - .dropna(subset=overlap["variable"]) - ) - duplicated_rows = duplicated_rows[duplicated_rows.duplicated(subset="year", keep=False)] - overlapping_years = sorted(set(duplicated_rows["year"])) - if overlapping_years != overlap["years"]: - log.warning(f"Given overlap number {i} is not found in the data; redefine this list.") - # Make nan data points for either the region or the member (which is specified by "entity to make nan").
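The core of the detection logic above is a duplicated-year check: concatenate the region's and the member's series for one variable, drop missing values, and any year that appears twice is an overlap. A minimal sketch with toy data; in the real function, zeros can optionally be treated as missing first:

import pandas as pd

region = pd.DataFrame({"year": [1984, 1985, 1986], "energy_consumption": [1.0, 1.1, 1.2]})
member = pd.DataFrame({"year": [1986, 1987], "energy_consumption": [0.4, 0.5]})

combined = pd.concat([region, member], ignore_index=True).dropna().reset_index(drop=True)
overlapping_years = sorted(set(combined[combined.duplicated(subset="year")]["year"]))
assert overlapping_years == [1986]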
- indexes_to_make_nan = duplicated_rows[ - duplicated_rows["country"] == overlap[overlap["entity_to_make_nan"]] # type: ignore - ].index.tolist() - df.loc[indexes_to_make_nan, overlap["variable"]] = np.nan - - return df - - -def load_countries_in_regions() -> Dict[str, List[str]]: - """Create a dictionary of regions (continents and income groups) and their member countries. - - Regions to include are defined above, in REGIONS_TO_ADD. - Additional countries are added to regions following the definitions in ADDITIONAL_COUNTRIES_IN_REGIONS. - - Returns - ------- - countries_in_regions : dict - Dictionary of regions, where the value is a list of member countries in the region. - - """ - # Load income groups. - income_groups = load_income_groups() - - countries_in_regions = {} - for region in list(REGIONS_TO_ADD): - # Add default OWID list of countries in region (which includes historical regions). - countries_in_regions[region] = geo.list_countries_in_region(region=region, income_groups=income_groups) - - # Include additional countries in the region (if any given). - for region in ADDITIONAL_COUNTRIES_IN_REGIONS: - countries_in_regions[region] = countries_in_regions[region] + ADDITIONAL_COUNTRIES_IN_REGIONS[region] - - return countries_in_regions - - -def add_region_aggregates( - data: pd.DataFrame, - regions: List[str], - index_columns: List[str], - country_column: str = "country", - year_column: str = "year", - aggregates: Optional[Dict[str, str]] = None, - known_overlaps: Optional[List[Dict[str, Union[str, List[int]]]]] = None, - region_codes: Optional[List[str]] = None, - country_code_column: str = "country_code", - keep_original_region_with_suffix: Optional[str] = None, -) -> pd.DataFrame: - """Add region aggregates for all regions (which may include continents and income groups). - - Parameters - ---------- - data : pd.DataFrame - Data. - regions : list - Regions to include. - index_columns : list - Name of index columns. - country_column : str - Name of country column. - year_column : str - Name of year column. - aggregates : dict or None - Dictionary of type of aggregation to use for each variable. If None, variables will be aggregated by summing. - known_overlaps : list or None - List of known overlaps between regions and their member countries. - region_codes : list or None - List of country codes for each new region. It must have the same number of elements, and in the same order, as - the 'regions' argument. - country_code_column : str - Name of country codes column (only relevant if region_codes is not None). - keep_original_region_with_suffix : str or None - If None, original data for region will be replaced by aggregate data constructed by this function. If not None, - original data for region will be kept, with the same name, but having suffix keep_original_region_with_suffix - added to its name. - - Returns - ------- - data : pd.DataFrame - Data after adding aggregate regions. - - """ - data = data.copy() - - if aggregates is None: - # If aggregations are not specified, assume all variables are to be aggregated, by summing. - aggregates = {column: "sum" for column in data.columns if column not in index_columns} - # Get the list of regions to create, and their member countries. - countries_in_regions = load_countries_in_regions() - for region in regions: - # List of countries in region. - countries_in_region = countries_in_regions[region] - # Select rows of data for member countries.
- data_region = data[data[country_column].isin(countries_in_region)] - # Remove any known overlaps between regions (e.g. USSR, which is a historical region) in current region (e.g. - # Europe) and their member countries (or successor countries, like Russia). - # If any overlap in known_overlaps is not found, a warning will be raised. - data_region = remove_overlapping_data_for_regions_and_members(df=data_region, known_overlaps=known_overlaps) - - # Check that there are no other overlaps in the data (after having removed the known ones). - detect_overlapping_data_for_regions_and_members( - df=data_region, - regions_and_members=HISTORIC_TO_CURRENT_REGION, - index_columns=index_columns, - known_overlaps=known_overlaps, - ) - - # Add region aggregates. - data_region = geo.add_region_aggregates( - df=data_region, - region=region, - country_col=country_column, - year_col=year_column, - aggregations=aggregates, - countries_in_region=countries_in_region, - countries_that_must_have_data=[], - # Here we allow aggregating even when there are few countries informed. - # However, if absolutely all countries have nan, we want the aggregate to be nan, not zero. - frac_allowed_nans_per_year=0.999, - num_allowed_nans_per_year=None, - keep_original_region_with_suffix=keep_original_region_with_suffix, - ) - data = pd.concat( - [ - data[data[country_column] != region], - data_region[data_region[country_column] == region], - ], - ignore_index=True, - ).reset_index(drop=True) - - if region_codes is not None: - # Add region codes to regions. - if data[country_code_column].dtype == "category": - data[country_code_column] = data[country_code_column].cat.add_categories(region_codes) - for i, region in enumerate(regions): - data.loc[data[country_column] == region, country_code_column] = region_codes[i] - - return data diff --git a/etl/steps/archive/garden/ember/2022-12-13/combined_electricity.meta.yml b/etl/steps/archive/garden/ember/2022-12-13/combined_electricity.meta.yml deleted file mode 100644 index 8c25ac4c50d..00000000000 --- a/etl/steps/archive/garden/ember/2022-12-13/combined_electricity.meta.yml +++ /dev/null @@ -1,1094 +0,0 @@ -dataset: - namespace: ember - version: 2022-12-13 - title: Combined Electricity Data (Ember, 2022) - short_name: combined_electricity - sources: - - - name: Our World in Data based on Ember's Yearly Electricity Data (2022). - published_by: Ember - publication_year: 2022 - date_accessed: 2022-12-13 - url: https://ember-climate.org/data-catalogue/yearly-electricity-data/ - - - name: Our World in Data based on Ember's European Electricity Review (2022). 
- published_by: Ember - publication_year: 2022 - date_accessed: 2022-08-01 - url: https://ember-climate.org/insights/research/european-electricity-review-2022/ - -tables: - european_electricity_review: - title: European electricity review - variables: - demand__total_demand__twh: - title: Demand - Total demand (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total demand - demand__total_demand_per_capita__kwh: - title: Demand - Total demand per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Total per capita demand - emissions__co2_intensity__gco2_kwh: - title: Emissions - CO2 intensity (gCO2/kWh) - short_unit: gCO2/kWh - unit: grams of CO2 equivalent per kilowatt-hour - display: - name: CO2 intensity - emissions__total_emissions__mtco2: - title: Emissions - Total emissions (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Total emissions - generation__bioenergy__pct: - title: Generation - Bioenergy (%) - short_unit: "%" - unit: "%" - display: - name: Bioenergy generation - generation__bioenergy__twh: - title: Generation - Bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Bioenergy generation - generation__clean__pct: - title: Generation - Clean (%) - short_unit: "%" - unit: "%" - display: - name: Clean generation - generation__clean__twh: - title: Generation - Clean (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Clean generation - generation__coal__pct: - title: Generation - Coal (%) - short_unit: "%" - unit: "%" - display: - name: Coal generation - generation__coal__twh: - title: Generation - Coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal generation - generation__fossil__pct: - title: Generation - Fossil (%) - short_unit: "%" - unit: "%" - display: - name: Fossil generation - generation__fossil__twh: - title: Generation - Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Fossil generation - generation__gas__pct: - title: Generation - Gas (%) - short_unit: "%" - unit: "%" - display: - name: Gas generation - generation__gas__twh: - title: Generation - Gas (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas generation - generation__gas_and_other_fossil__pct: - title: Generation - Gas and other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Gas and other fossil generation - generation__gas_and_other_fossil__twh: - title: Generation - Gas and other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas and other fossil generation - generation__hard_coal__pct: - title: Generation - Hard coal (%) - short_unit: "%" - unit: "%" - display: - name: Hard coal generation - generation__hard_coal__twh: - title: Generation - Hard coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hard coal generation - generation__hydro__bioenergy_and_other_renewables__pct: - title: Generation - Hydro bioenergy and other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__bioenergy_and_other_renewables__twh: - title: Generation - Hydro bioenergy and other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__pct: - title: Generation - Hydro (%) - short_unit: "%" - unit: "%" - display: - name: Hydro generation - generation__hydro__twh: - title: Generation - Hydro (TWh) - short_unit: TWh - unit: terawatt-hours 
- display: - name: Hydro generation - generation__lignite__pct: - title: Generation - Lignite (%) - short_unit: "%" - unit: "%" - display: - name: Lignite generation - generation__lignite__twh: - title: Generation - Lignite (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Lignite generation - generation__nuclear__pct: - title: Generation - Nuclear (%) - short_unit: "%" - unit: "%" - display: - name: Nuclear generation - generation__nuclear__twh: - title: Generation - Nuclear (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear generation - generation__other_fossil__pct: - title: Generation - Other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Other fossil generation - generation__other_fossil__twh: - title: Generation - Other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other fossil generation - generation__other_renewables__pct: - title: Generation - Other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Other renewables generation - generation__other_renewables__twh: - title: Generation - Other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables generation - generation__renewables__pct: - title: Generation - Renewables (%) - short_unit: "%" - unit: "%" - display: - name: Renewables generation - generation__renewables__twh: - title: Generation - Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Renewables generation - generation__solar__pct: - title: Generation - Solar (%) - short_unit: "%" - unit: "%" - display: - name: Solar generation - generation__solar__twh: - title: Generation - Solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar generation - generation__total_generation__twh: - title: Generation - Total - short_unit: TWh - unit: terawatt-hours - display: - name: Total generation - generation__wind__pct: - title: Generation - Wind (%) - short_unit: "%" - unit: "%" - display: - name: Wind generation - generation__wind__twh: - title: Generation - Wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind generation - generation__wind_and_solar__pct: - title: Generation - Wind and solar (%) - short_unit: "%" - unit: "%" - display: - name: Wind and solar generation - generation__wind_and_solar__twh: - title: Generation - Wind and solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind and solar generation - imports__total_net_imports__twh: - title: Imports - Total net imports (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total net imports - yearly_electricity: - title: Yearly Electricity Data - variables: - capacity__bioenergy__gw: - title: Capacity - Bioenergy (GW) - short_unit: GW - unit: gigawatts - display: - name: Bioenergy capacity - capacity__clean__gw: - title: Capacity - Clean (GW) - short_unit: GW - unit: gigawatts - display: - name: Clean capacity - capacity__coal__gw: - title: Capacity - Coal (GW) - short_unit: GW - unit: gigawatts - display: - name: Coal capacity - capacity__fossil__gw: - title: Capacity - Fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Fossil capacity - capacity__gas__gw: - title: Capacity - Gas (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas capacity - capacity__gas_and_other_fossil__gw: - title: Capacity - Gas and other fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas and other fossil capacity - capacity__hydro__bioenergy_and_other_renewables__gw: - title: 
Capacity - Hydro bioenergy and other renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro bioenergy and other renewables capacity - capacity__hydro__gw: - title: Capacity - Hydro (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro capacity - capacity__nuclear__gw: - title: Capacity - Nuclear (GW) - short_unit: GW - unit: gigawatts - display: - name: Nuclear capacity - capacity__other_fossil__gw: - title: Capacity - Other fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Other fossil capacity - capacity__other_renewables__gw: - title: Capacity - Other renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Other renewables capacity - capacity__renewables__gw: - title: Capacity - Renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Renewables capacity - capacity__solar__gw: - title: Capacity - Solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Solar capacity - capacity__wind__gw: - title: Capacity - Wind (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind capacity - capacity__wind_and_solar__gw: - title: Capacity - Wind and solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind and solar capacity - demand__total_demand__twh: - title: Demand - Total demand (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total demand - demand__total_demand_per_capita__kwh: - title: Demand - Total demand per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Total per capita demand - emissions__bioenergy__mtco2: - title: Emissions - Bioenergy (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Bioenergy emissions - emissions__clean__mtco2: - title: Emissions - Clean (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Clean emissions - emissions__co2_intensity__gco2_kwh: - title: Emissions - CO2 intensity (gCO2/kWh) - short_unit: gCO2/kWh - unit: grams of CO2 equivalent per kilowatt-hour - display: - name: CO2 intensity - emissions__coal__mtco2: - title: Emissions - Coal (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Coal emissions - emissions__fossil__mtco2: - title: Emissions - Fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Fossil emissions - emissions__gas__mtco2: - title: Emissions - Gas (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Gas emissions - emissions__gas_and_other_fossil__mtco2: - title: Emissions - Gas and other fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Gas and other fossil emissions - emissions__hydro__bioenergy_and_other_renewables__mtco2: - title: Emissions - Hydro bioenergy and other renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro bioenergy and other renewables emissions - emissions__hydro__mtco2: - title: Emissions - Hydro (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro emissions - emissions__nuclear__mtco2: - title: Emissions - Nuclear (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Nuclear emissions - emissions__other_fossil__mtco2: - title: Emissions - Other fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other fossil emissions - emissions__other_renewables__mtco2: - title: Emissions - Other renewables (mtCO2) - short_unit: mtCO2 - 
unit: megatonnes of CO2 equivalent - display: - name: Other renewables emissions - emissions__renewables__mtco2: - title: Emissions - Renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Renewables emissions - emissions__solar__mtco2: - title: Emissions - Solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Solar emissions - emissions__total_emissions__mtco2: - title: Emissions - Total - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Total emissions - emissions__wind__mtco2: - title: Emissions - Wind (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind emissions - emissions__wind_and_solar__mtco2: - title: Emissions - Wind and solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind and solar emissions - generation__bioenergy__pct: - title: Generation - Bioenergy (%) - short_unit: "%" - unit: "%" - display: - name: Bioenergy generation - generation__bioenergy__twh: - title: Generation - Bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Bioenergy generation - generation__clean__pct: - title: Generation - Clean (%) - short_unit: "%" - unit: "%" - display: - name: Clean generation - generation__clean__twh: - title: Generation - Clean (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Clean generation - generation__coal__pct: - title: Generation - Coal (%) - short_unit: "%" - unit: "%" - display: - name: Coal generation - generation__coal__twh: - title: Generation - Coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal generation - generation__fossil__pct: - title: Generation - Fossil (%) - short_unit: "%" - unit: "%" - display: - name: Fossil generation - generation__fossil__twh: - title: Generation - Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Fossil generation - generation__gas__pct: - title: Generation - Gas (%) - short_unit: "%" - unit: "%" - display: - name: Gas generation - generation__gas__twh: - title: Generation - Gas (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas generation - generation__gas_and_other_fossil__pct: - title: Generation - Gas and other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Gas and other fossil generation - generation__gas_and_other_fossil__twh: - title: Generation - Gas and other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas and other fossil generation - generation__hydro__bioenergy_and_other_renewables__pct: - title: Generation - Hydro bioenergy and other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__bioenergy_and_other_renewables__twh: - title: Generation - Hydro bioenergy and other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__pct: - title: Generation - Hydro (%) - short_unit: "%" - unit: "%" - display: - name: Hydro generation - generation__hydro__twh: - title: Generation - Hydro (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro generation - generation__nuclear__pct: - title: Generation - Nuclear (%) - short_unit: "%" - unit: "%" - display: - name: Nuclear generation - generation__nuclear__twh: - title: Generation - Nuclear (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear generation - 
generation__other_fossil__pct: - title: Generation - Other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Other fossil generation - generation__other_fossil__twh: - title: Generation - Other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other fossil generation - generation__other_renewables__pct: - title: Generation - Other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Other renewables generation - generation__other_renewables__twh: - title: Generation - Other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables generation - generation__renewables__pct: - title: Generation - Renewables (%) - short_unit: "%" - unit: "%" - display: - name: Renewables generation - generation__renewables__twh: - title: Generation - Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Renewables generation - generation__solar__pct: - title: Generation - Solar (%) - short_unit: "%" - unit: "%" - display: - name: Solar generation - generation__solar__twh: - title: Generation - Solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar generation - generation__total_generation__twh: - title: Generation - Total - short_unit: TWh - unit: terawatt-hours - display: - name: Total generation - generation__wind__pct: - title: Generation - Wind (%) - short_unit: "%" - unit: "%" - display: - name: Wind generation - generation__wind__twh: - title: Generation - Wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind generation - generation__wind_and_solar__pct: - title: Generation - Wind and solar (%) - short_unit: "%" - unit: "%" - display: - name: Wind and solar generation - generation__wind_and_solar__twh: - title: Generation - Wind and solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind and solar generation - imports__total_net_imports__twh: - title: Imports - Total net imports (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total net imports - combined_electricity: - title: Electricity - variables: - capacity__bioenergy__gw: - title: Capacity - Bioenergy (GW) - short_unit: GW - unit: gigawatts - display: - name: Bioenergy capacity - capacity__clean__gw: - title: Capacity - Clean (GW) - short_unit: GW - unit: gigawatts - display: - name: Clean capacity - capacity__coal__gw: - title: Capacity - Coal (GW) - short_unit: GW - unit: gigawatts - display: - name: Coal capacity - capacity__fossil__gw: - title: Capacity - Fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Fossil capacity - capacity__gas__gw: - title: Capacity - Gas (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas capacity - capacity__gas_and_other_fossil__gw: - title: Capacity - Gas and other fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas and other fossil capacity - capacity__hydro__bioenergy_and_other_renewables__gw: - title: Capacity - Hydro bioenergy and other renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro bioenergy and other renewables capacity - capacity__hydro__gw: - title: Capacity - Hydro (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro capacity - capacity__nuclear__gw: - title: Capacity - Nuclear (GW) - short_unit: GW - unit: gigawatts - display: - name: Nuclear capacity - capacity__other_fossil__gw: - title: Capacity - Other fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Other fossil capacity - capacity__other_renewables__gw: - title: 
Capacity - Other renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Other renewables capacity - capacity__renewables__gw: - title: Capacity - Renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Renewables capacity - capacity__solar__gw: - title: Capacity - Solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Solar capacity - capacity__wind__gw: - title: Capacity - Wind (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind capacity - capacity__wind_and_solar__gw: - title: Capacity - Wind and solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind and solar capacity - demand__total_demand__twh: - title: Demand - Total demand (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total demand - demand__total_demand_per_capita__kwh: - title: Demand - Total demand per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Total per capita demand - emissions__bioenergy__mtco2: - title: Emissions - Bioenergy (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Bioenergy emissions - emissions__clean__mtco2: - title: Emissions - Clean (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Clean emissions - emissions__co2_intensity__gco2_kwh: - title: Emissions - CO2 intensity (gCO2/kWh) - short_unit: gCO2/kWh - unit: grams of CO2 equivalent per kilowatt-hour - display: - name: CO2 intensity - emissions__coal__mtco2: - title: Emissions - Coal (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Coal emissions - emissions__fossil__mtco2: - title: Emissions - Fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Fossil emissions - emissions__gas__mtco2: - title: Emissions - Gas (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Gas emissions - emissions__gas_and_other_fossil__mtco2: - title: Emissions - Gas and other fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Gas and other fossil emissions - emissions__hydro__bioenergy_and_other_renewables__mtco2: - title: Emissions - Hydro bioenergy and other renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro bioenergy and other renewables emissions - emissions__hydro__mtco2: - title: Emissions - Hydro (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro emissions - emissions__nuclear__mtco2: - title: Emissions - Nuclear (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Nuclear emissions - emissions__other_fossil__mtco2: - title: Emissions - Other fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other fossil emissions - emissions__other_renewables__mtco2: - title: Emissions - Other renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other renewables emissions - emissions__renewables__mtco2: - title: Emissions - Renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Renewables emissions - emissions__solar__mtco2: - title: Emissions - Solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Solar emissions - emissions__total_emissions__mtco2: - title: Emissions - Total - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Total emissions - emissions__wind__mtco2: - 
title: Emissions - Wind (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind emissions - emissions__wind_and_solar__mtco2: - title: Emissions - Wind and solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind and solar emissions - generation__bioenergy__pct: - title: Generation - Bioenergy (%) - short_unit: "%" - unit: "%" - display: - name: Bioenergy generation - generation__bioenergy__twh: - title: Generation - Bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Bioenergy generation - generation__clean__pct: - title: Generation - Clean (%) - short_unit: "%" - unit: "%" - display: - name: Clean generation - generation__clean__twh: - title: Generation - Clean (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Clean generation - generation__coal__pct: - title: Generation - Coal (%) - short_unit: "%" - unit: "%" - display: - name: Coal generation - generation__coal__twh: - title: Generation - Coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal generation - generation__fossil__pct: - title: Generation - Fossil (%) - short_unit: "%" - unit: "%" - display: - name: Fossil generation - generation__fossil__twh: - title: Generation - Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Fossil generation - generation__gas__pct: - title: Generation - Gas (%) - short_unit: "%" - unit: "%" - display: - name: Gas generation - generation__gas__twh: - title: Generation - Gas (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas generation - generation__gas_and_other_fossil__pct: - title: Generation - Gas and other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Gas and other fossil generation - generation__gas_and_other_fossil__twh: - title: Generation - Gas and other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas and other fossil generation - generation__hard_coal__pct: - title: Generation - Hard coal (%) - short_unit: "%" - unit: "%" - display: - name: Hard coal generation - generation__hard_coal__twh: - title: Generation - Hard coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hard coal generation - generation__hydro__bioenergy_and_other_renewables__pct: - title: Generation - Hydro bioenergy and other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__bioenergy_and_other_renewables__twh: - title: Generation - Hydro bioenergy and other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__pct: - title: Generation - Hydro (%) - short_unit: "%" - unit: "%" - display: - name: Hydro generation - generation__hydro__twh: - title: Generation - Hydro (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro generation - generation__lignite__pct: - title: Generation - Lignite (%) - short_unit: "%" - unit: "%" - display: - name: Lignite generation - generation__lignite__twh: - title: Generation - Lignite (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Lignite generation - generation__nuclear__pct: - title: Generation - Nuclear (%) - short_unit: "%" - unit: "%" - display: - name: Nuclear generation - generation__nuclear__twh: - title: Generation - Nuclear (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear generation - generation__other_fossil__pct: - title: 
Generation - Other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Other fossil generation - generation__other_fossil__twh: - title: Generation - Other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other fossil generation - generation__other_renewables__pct: - title: Generation - Other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Other renewables generation - generation__other_renewables__twh: - title: Generation - Other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables generation - generation__renewables__pct: - title: Generation - Renewables (%) - short_unit: "%" - unit: "%" - display: - name: Renewables generation - generation__renewables__twh: - title: Generation - Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Renewables generation - generation__solar__pct: - title: Generation - Solar (%) - short_unit: "%" - unit: "%" - display: - name: Solar generation - generation__solar__twh: - title: Generation - Solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar generation - generation__total_generation__twh: - title: Generation - Total - short_unit: TWh - unit: terawatt-hours - display: - name: Total generation - generation__wind__pct: - title: Generation - Wind (%) - short_unit: "%" - unit: "%" - display: - name: Wind generation - generation__wind__twh: - title: Generation - Wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind generation - generation__wind_and_solar__pct: - title: Generation - Wind and solar (%) - short_unit: "%" - unit: "%" - display: - name: Wind and solar generation - generation__wind_and_solar__twh: - title: Generation - Wind and solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind and solar generation - imports__total_net_imports__twh: - title: Imports - Total net imports (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total net imports - net_flows: - title: Net flows - variables: - net_flow__twh: - title: Net flow (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Net flow diff --git a/etl/steps/archive/garden/ember/2022-12-13/combined_electricity.py b/etl/steps/archive/garden/ember/2022-12-13/combined_electricity.py deleted file mode 100644 index 8a1d133328d..00000000000 --- a/etl/steps/archive/garden/ember/2022-12-13/combined_electricity.py +++ /dev/null @@ -1,362 +0,0 @@ -"""Garden step that combines Ember's European Electricity Review (EER) and Ember's Yearly Electricity Data (YED). - -The YED dataset contains data for all countries in EER. However, YED starts in 2000, while EER starts in 1990. - -Therefore, to gather as much data as possible, we combine both datasets. - -NOTE: This step used to combine Ember's Global Electricity Review and the EER, but now we have replaced the former by -the YED. However, there may be instances in the code where "global" refers to the YED. - -""" - -import pandas as pd -from owid import catalog -from owid.datautils import dataframes -from shared import CURRENT_DIR - -from etl.paths import DATA_DIR - -# Details for dataset to export. -DATASET_SHORT_NAME = "combined_electricity" -METADATA_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.meta.yml" -# Details for datasets to import. 
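An aside on the metadata file deleted above: every variable entry repeats the same four fields (title, short_unit, unit and display.name) in a fixed pattern. Below is a hedged sketch of how entries like these could be generated rather than hand-written; the helper and its names are hypothetical, not part of the ETL codebase.

import yaml

# Map the unit key used in column slugs to (short_unit, unit), mirroring the file above.
UNITS = {"twh": ("TWh", "terawatt-hours"), "gw": ("GW", "gigawatts"), "pct": ("%", "%")}

def variable_entry(category: str, technology: str, unit_key: str) -> dict:
    # Hypothetical helper: builds one variable entry in the shape used by the deleted file.
    short_unit, unit = UNITS[unit_key]
    slug = f"{category.lower()}__{technology.lower().replace(' ', '_')}__{unit_key}"
    return {
        slug: {
            "title": f"{category} - {technology} ({short_unit})",
            "short_unit": short_unit,
            "unit": unit,
            "display": {"name": f"{technology} {category.lower()}"},
        }
    }

# Prints a block-style YAML entry for generation__wind_and_solar__twh.
print(yaml.safe_dump(variable_entry("Generation", "Wind and solar", "twh")))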
-GLOBAL_DATASET_PATH = DATA_DIR / "garden/ember/2022-12-13/yearly_electricity"
-EUROPEAN_DATASET_PATH = DATA_DIR / "garden/ember/2022-08-01/european_electricity_review"
-
-# Define aggregates, following Ember's Electricity Data Methodology document:
-# https://ember-climate.org/app/uploads/2022/03/GER22-Methodology.pdf
-# The European review also has its own methodology document:
-# https://ember-climate.org/app/uploads/2022/02/EER-Methodology.pdf
-# but it does not explicitly define aggregates. We assume they are consistent with each other.
-# This will also be checked, along with other sanity checks, in a separate analysis.
-AGGREGATES = {
-    "coal__twh": [
-        "hard_coal__twh",
-        "lignite__twh",
-    ],
-    "wind_and_solar__twh": ["wind__twh", "solar__twh"],
-    "hydro__bioenergy_and_other_renewables__twh": [
-        "hydro__twh",
-        "bioenergy__twh",
-        "other_renewables__twh",
-    ],
-    "renewables__twh": [
-        "wind_and_solar__twh",
-        "hydro__bioenergy_and_other_renewables__twh",
-    ],
-    "clean__twh": [
-        "renewables__twh",
-        "nuclear__twh",
-    ],
-    "gas_and_other_fossil__twh": [
-        "gas__twh",
-        "other_fossil__twh",
-    ],
-    "fossil__twh": ["gas_and_other_fossil__twh", "coal__twh"],
-    "total_generation__twh": [
-        "clean__twh",
-        "fossil__twh",
-    ],
-}
-
-
-def combine_yearly_electricity_data(ds_global: catalog.Dataset) -> catalog.Table:
-    """Combine all tables in Ember's Yearly Electricity Data into one table.
-
-    Parameters
-    ----------
-    ds_global : catalog.Dataset
-        Yearly Electricity dataset (containing tables for capacity, electricity demand, generation, imports and
-        emissions).
-
-    Returns
-    -------
-    combined_global : catalog.Table
-        Combined table containing all data in the Yearly Electricity dataset.
-
-    """
-    category_renaming = {
-        "capacity": "Capacity - ",
-        "electricity_demand": "",
-        "electricity_generation": "Generation - ",
-        "electricity_imports": "",
-        "power_sector_emissions": "Emissions - ",
-    }
-    error = "Tables in yearly electricity dataset have changed"
-    assert set(category_renaming) == set(ds_global.table_names), error
-    index_columns = ["country", "year"]
-    tables = []
-    for category in category_renaming:
-        table = ds_global[category].copy()
-        table = table.rename(
-            columns={
-                column: catalog.utils.underscore(category_renaming[category] + column)
-                for column in table.columns
-                if column not in index_columns
-            }
-        )
-        table = table.reset_index()
-        tables.append(table)
-
-    # Merge all tables into one.
-    combined_global = dataframes.multi_merge(dfs=tables, on=index_columns, how="outer")
-
-    # Rename certain columns for consistency.
-    combined_global = combined_global.rename(
-        columns={
-            "net_imports__twh": "imports__total_net_imports__twh",
-            "demand__twh": "demand__total_demand__twh",
-            "demand_per_capita__kwh": "demand__total_demand_per_capita__kwh",
-        },
-        errors="raise",
-    )
-
-    # Sanity check.
-    error = "Total generation columns in emissions and generation tables are not identical."
-    assert all(
-        combined_global["emissions__total_generation__twh"] == combined_global["generation__total_generation__twh"]
-    ), error
-
-    # Remove unnecessary columns and any possible rows with no data.
-    combined_global = combined_global.drop(columns=["population", "emissions__total_generation__twh"]).dropna(how="all")
-
-    # Set a convenient index and sort.
-    combined_global = combined_global.set_index(["country", "year"], verify_integrity=True).sort_index()
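A minimal stand-alone sketch of the aggregation rule encoded in AGGREGATES above: each aggregate column is the row-wise sum of its component columns, and percentage shares are then derived from a total, as the European-review code further below does. Toy numbers, not Ember's data.

import pandas as pd

df = pd.DataFrame({"hard_coal__twh": [10.0], "lignite__twh": [5.0], "gas__twh": [20.0]})
aggregates = {"coal__twh": ["hard_coal__twh", "lignite__twh"]}

# Each aggregate is the row-wise sum of its components.
for aggregate, components in aggregates.items():
    df[aggregate] = df[components].sum(axis=1)

# Percentage shares follow the same pattern used for total generation.
total = df[["coal__twh", "gas__twh"]].sum(axis=1)
df["coal__pct"] = df["coal__twh"] / total * 100
print(df.round(1))  # coal__twh = 15.0, coal__pct = 42.9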
-    # Sort columns conveniently.
-    combined_global = combined_global[sorted(combined_global.columns)]
-
-    # Ensure all column names are snake lower case.
-    combined_global = catalog.utils.underscore_table(combined_global)
-
-    # Import metadata from metadata yaml file.
-    combined_global.update_metadata_from_yaml(METADATA_PATH, "yearly_electricity")
-
-    return combined_global
-
-
-def combine_european_electricity_review_data(
-    ds_european: catalog.Dataset,
-) -> catalog.Table:
-    """Combine tables in Ember's European Electricity Review dataset into one table.
-
-    The tables to be combined are 'country_overview', 'generation', and 'emissions'. The remaining table on net flows
-    has a different structure and cannot be combined with the others, so it will remain as a separate table.
-
-    Parameters
-    ----------
-    ds_european : catalog.Dataset
-        European Electricity Review dataset.
-
-    Returns
-    -------
-    combined_european : catalog.Table
-        Combined table containing all data in the European Electricity Review dataset (except net flows).
-
-    """
-    index_columns = ["country", "year"]
-    # Extract the necessary tables from the dataset.
-    country_overview = ds_european["country_overview"].copy()
-    generation = ds_european["generation"].copy()
-    emissions = ds_european["emissions"].copy()
-
-    # Create aggregates (defined in AGGREGATES) that are in yearly electricity but not in the european review.
-    for aggregate in AGGREGATES:
-        generation[aggregate] = pd.DataFrame(generation)[AGGREGATES[aggregate]].sum(axis=1)
-
-    # Create a column for each of those new aggregates, giving percentage share of total generation.
-    for aggregate in AGGREGATES:
-        column = aggregate.replace("__twh", "__pct")
-        generation[column] = pd.DataFrame(generation)[aggregate] / generation["total_generation__twh"] * 100
-
-    # Check that total generation adds up to 100%.
-    error = "Total generation does not add up to 100%."
-    assert set(generation["total_generation__pct"]) == {100}, error
-
-    # Check that the constructed "total generation" column agrees with the one given in table "country_overview".
-    columns = ["country", "year", "total_generation__twh"]
-    check = pd.merge(
-        ds_european["country_overview"].reset_index()[columns],
-        generation.reset_index()[columns],
-        on=index_columns,
-    )
-    # Assert that the relative difference is smaller than 1%.
-    error = "Total generation does not agree with the one in country_overview."
-    assert all(
-        (abs(check["total_generation__twh_x"] - check["total_generation__twh_y"]) / check["total_generation__twh_x"])
-        < 0.01
-    ), error
-
-    # Remove unnecessary columns.
-    generation = generation.drop(columns=["total_generation__pct", "total_generation__twh"])
-
-    # Rename all column names to start with the category, before combining all categories.
-    generation = generation.rename(columns={column: "generation__" + column for column in generation.columns})
-    emissions = emissions.rename(columns={column: "emissions__" + column for column in emissions.columns})
-    country_overview = country_overview.rename(
-        columns={
-            "total_generation__twh": "generation__total_generation__twh",
-            "demand__twh": "demand__total_demand__twh",
-            "demand_per_capita__kwh": "demand__total_demand_per_capita__kwh",
-            "net_imports__twh": "imports__total_net_imports__twh",
-        },
-        errors="raise",
-    )
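The function above ends by merging the renamed per-category tables on (country, year). As a hedged illustration of that merge pattern (dataframes.multi_merge is OWID's helper; functools.reduce over pd.merge is assumed here to be an equivalent stand-in), with toy numbers:

from functools import reduce

import pandas as pd

index_columns = ["country", "year"]
capacity = pd.DataFrame({"country": ["Spain"], "year": [2020], "capacity__wind__gw": [27.0]})
generation = pd.DataFrame({"country": ["Spain"], "year": [2020], "generation__wind__twh": [56.0]})
emissions = pd.DataFrame({"country": ["France"], "year": [2020], "emissions__total__mtco2": [50.0]})

# Outer-merge all tables on the shared index columns, one pair at a time.
combined = reduce(
    lambda left, right: pd.merge(left, right, on=index_columns, how="outer"),
    [capacity, generation, emissions],
)
print(combined)  # one row per country-year; categories missing for a row become NaN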
-    # Combine tables into one dataframe.
-    combined_european = dataframes.multi_merge(
-        [
-            country_overview.reset_index(),
-            emissions.reset_index(),
-            generation.reset_index(),
-        ],
-        on=index_columns,
-        how="outer",
-    )
-
-    # If any column was repeated in the merge, it will have a "_x" at the end of the name.
-    # Check that no other columns were repeated.
-    error = "There are repeated columns in combined dataframe."
-    assert len([column for column in combined_european.columns if column.endswith("_x")]) == 0, error
-
-    # Remove any possible rows with no data.
-    combined_european = combined_european.dropna(how="all")
-
-    # Ensure that the index is well constructed.
-    combined_european = combined_european.set_index(index_columns, verify_integrity=True).sort_index()
-
-    # Sort columns conveniently.
-    combined_european = combined_european[sorted(combined_european.columns)]
-
-    # Ensure all column names are snake lower case.
-    combined_european = catalog.utils.underscore_table(combined_european)
-
-    # Import metadata from metadata yaml file.
-    combined_european.update_metadata_from_yaml(METADATA_PATH, "european_electricity_review")
-
-    return combined_european
-
-
-def combine_yearly_electricity_data_and_european_electricity_review(
-    combined_global: catalog.Table, combined_european: catalog.Table
-) -> catalog.Table:
-    """Combine the combined table of the Yearly Electricity Data with the combined table of the European Electricity
-    Review.
-
-    Parameters
-    ----------
-    combined_global : catalog.Table
-        Table that combines all tables of the Yearly Electricity Data.
-    combined_european : catalog.Table
-        Table that combines all tables of the European Electricity Review (except net flows).
-
-    Returns
-    -------
-    combined : catalog.Table
-        Combined data.
-
-    """
-    # Concatenate variables one by one, so that, if one of the two sources does not have data, we can take the
-    # source that is complete.
-    # When both sources are complete (for european countries), prioritise the european review (since it has more data,
-    # and is possibly more up-to-date than the yearly electricity data).
-    combined_global = combined_global.reset_index()
-    combined_european = combined_european.reset_index()
-
-    index_columns = ["country", "year"]
-    data_columns = sorted(
-        [col for col in (set(combined_global.columns) | set(combined_european.columns)) if col not in index_columns]
-    )
-    # We should not concatenate the global and european data directly, since there are nans in different places.
-    # Instead, go column by column, concatenate, remove nans, and then keep the european version on duplicated rows.
-    combined = pd.DataFrame({column: [] for column in index_columns})
-    for variable in data_columns:
-        _global_data = pd.DataFrame()
-        _european_data = pd.DataFrame()
-        if variable in combined_global.columns:
-            _global_data = combined_global[index_columns + [variable]].dropna(subset=variable)
-        if variable in combined_european.columns:
-            _european_data = combined_european[index_columns + [variable]].dropna(subset=variable)
-        _combined = pd.concat([_global_data, _european_data], ignore_index=True)
-        # On rows where both datasets overlap, give priority to european review data.
-        _combined = _combined.drop_duplicates(subset=index_columns, keep="last")
-        # Combine data for different variables.
-        combined = pd.merge(combined, _combined, on=index_columns, how="outer")
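A stand-alone sketch of the priority rule in the loop above: for each variable, rows from both sources are concatenated (global first, european second), and on country-years present in both, drop_duplicates(keep="last") keeps the european version. Toy numbers for a single variable:

import pandas as pd

index_columns = ["country", "year"]
_global_data = pd.DataFrame(
    {"country": ["France", "France"], "year": [2000, 2001], "demand__total_demand__twh": [450.0, 455.0]}
)
_european_data = pd.DataFrame({"country": ["France"], "year": [2001], "demand__total_demand__twh": [460.0]})

# European rows are appended last, so keep="last" gives them priority on overlaps.
_combined = pd.concat([_global_data, _european_data], ignore_index=True)
_combined = _combined.drop_duplicates(subset=index_columns, keep="last")
print(_combined)  # 2000 -> 450.0 (global only); 2001 -> 460.0 (european wins)

The assert that follows checks that this variable-by-variable merge introduced no duplicated columns.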
-    error = "There are repeated columns when combining yearly electricity data and european electricity review tables."
-    assert len([column for column in combined.columns if column.endswith("_x")]) == 0, error
-
-    # Create a table (with no metadata) and sort data appropriately.
-    combined = catalog.Table(combined).set_index(index_columns).sort_index()
-
-    # Ensure all column names are snake lower case.
-    combined = catalog.utils.underscore_table(combined)
-
-    # Import metadata from metadata yaml file.
-    combined.update_metadata_from_yaml(METADATA_PATH, "combined_electricity")
-
-    return combined
-
-
-def create_net_flows_table(ds_european: catalog.Dataset) -> catalog.Table:
-    """Create a table for the net flows of the European Electricity Review, since it has a different structure and
-    cannot be combined with the rest of the tables in the Yearly Electricity Data or European Electricity Review.
-
-    Parameters
-    ----------
-    ds_european : catalog.Dataset
-        European Electricity Review.
-
-    Returns
-    -------
-    net_flows : catalog.Table
-        Table of net flows.
-
-    """
-    # Keep the net flows table as in the original european review (this table was not present in yearly electricity).
-    net_flows = ds_european["net_flows"].copy()
-    # Import metadata from metadata yaml file.
-    net_flows.update_metadata_from_yaml(METADATA_PATH, "net_flows")
-
-    return net_flows
-
-
-def run(dest_dir: str) -> None:
-    #
-    # Load data.
-    #
-    # Read yearly electricity data and european electricity review datasets from garden.
-    ds_global = catalog.Dataset(GLOBAL_DATASET_PATH)
-    ds_european = catalog.Dataset(EUROPEAN_DATASET_PATH)
-
-    #
-    # Process data.
-    #
-    # Combine all tables of the yearly electricity data into one.
-    combined_global = combine_yearly_electricity_data(ds_global=ds_global)
-
-    # Combine all tables of the european electricity review into one.
-    combined_european = combine_european_electricity_review_data(ds_european=ds_european)
-
-    # Combine yearly electricity and european reviews.
-    combined = combine_yearly_electricity_data_and_european_electricity_review(
-        combined_global=combined_global, combined_european=combined_european
-    )
-
-    # Create an additional table with the electricity net flows (only available in european review).
-    net_flows = create_net_flows_table(ds_european=ds_european)
-
-    #
-    # Save outputs.
-    #
-    ds_garden = catalog.Dataset.create_empty(dest_dir)
-    # Import metadata from the metadata yaml file.
-    ds_garden.metadata.update_from_yaml(METADATA_PATH, if_source_exists="replace")
-
-    # Add all tables to the new dataset.
-    ds_garden.add(combined_global)
-    ds_garden.add(combined_european)
-    ds_garden.add(combined)
-    ds_garden.add(net_flows)
-
-    # Create dataset.
-    ds_garden.save()
diff --git a/etl/steps/archive/garden/ember/2022-12-13/shared.py b/etl/steps/archive/garden/ember/2022-12-13/shared.py
deleted file mode 100644
index d039bbfcde7..00000000000
--- a/etl/steps/archive/garden/ember/2022-12-13/shared.py
+++ /dev/null
@@ -1,567 +0,0 @@
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Union, cast
-
-import numpy as np
-import pandas as pd
-from owid import catalog
-from structlog import get_logger
-
-from etl.data_helpers import geo
-from etl.paths import DATA_DIR
-
-log = get_logger()
-
-CURRENT_DIR = Path(__file__).parent
-VERSION = CURRENT_DIR.name
-
-# Aggregate regions to add, following OWID definitions.
-# Regions and income groups to create by aggregating contributions from member countries.
-# In the following dictionary, if nothing is stated, the region is supposed to be a default continent/income group.
-# Otherwise, the dictionary can have "regions_included", "regions_excluded", "countries_included", and -# "countries_excluded". The aggregates will be calculated on the resulting countries. -# REGIONS = { -# # Default continents. -# "Africa": {}, -# "Asia": {}, -# "Europe": {}, -# "European Union (27)": {}, -# "North America": {}, -# "Oceania": {}, -# "South America": {}, -# "World": {}, -# # Income groups. -# "Low-income countries": {}, -# "Upper-middle-income countries": {}, -# "Lower-middle-income countries": {}, -# "High-income countries": {}, -# } - -# When creating region aggregates, decide how to distribute historical regions. -# The following decisions are based on the current location of the countries that succeeded the region, and their income -# group. Continent and income group assigned corresponds to the continent and income group of the majority of the -# population in the member countries. -HISTORIC_TO_CURRENT_REGION: Dict[str, Dict[str, Union[str, List[str]]]] = { - "Czechoslovakia": { - "continent": "Europe", - "income_group": "High-income countries", - "regions_included": [ - # Europe - High-income countries. - "Czechia", - "Slovakia", - ], - }, - "East Germany": { - "continent": "Europe", - "income_group": "", - "regions_included": [ - # Europe - High-income countries. - "Germany", - ], - }, - "West Germany": { - "continent": "Europe", - "income_group": "", - "regions_included": [ - # Europe - High-income countries. - "Germany", - ], - }, - "Netherlands Antilles": { - "continent": "North America", - "income_group": "High-income countries", - "regions_included": [ - # North America - High-income countries. - "Aruba", - "Curacao", - "Sint Maarten (Dutch part)", - "Bonaire Sint Eustatius and Saba", - ], - }, - "Serbia and Montenegro": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "regions_included": [ - # Europe - Upper-middle-income countries. - "Serbia", - "Montenegro", - ], - }, - "North Yemen": { - "continent": "Asia", - "income_group": "Low-income countries", - "regions_included": [ - # Asia - Low-income countries. - "Yemen", - ], - }, - "South Yemen": { - "continent": "Asia", - "income_group": "Low-income countries", - "regions_included": [ - # Asia - Low-income countries. - "Yemen", - ], - }, - "USSR": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "regions_included": [ - # Europe - High-income countries. - "Lithuania", - "Estonia", - "Latvia", - # Europe - Upper-middle-income countries. - "Moldova", - "Belarus", - "Russia", - # Europe - Lower-middle-income countries. - "Ukraine", - # Asia - Upper-middle-income countries. - "Georgia", - "Armenia", - "Azerbaijan", - "Turkmenistan", - "Kazakhstan", - # Asia - Lower-middle-income countries. - "Kyrgyzstan", - "Uzbekistan", - "Tajikistan", - ], - }, - "Yugoslavia": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "regions_included": [ - # Europe - High-income countries. - "Croatia", - "Slovenia", - # Europe - Upper-middle-income countries. - "North Macedonia", - "Bosnia and Herzegovina", - "Serbia", - "Montenegro", - ], - }, -} - -# Historical countries whose population can be built by adding up the population of their successor countries. -# Those historical countries not listed here will have no population data. -BUILD_POPULATION_FOR_HISTORICAL_COUNTRIES = [ - # The following regions split into smaller ones, and can be estimated by the population of the successors. 
- "Czechoslovakia", - "Netherlands Antilles", - "Serbia and Montenegro", - "USSR", - "Yugoslavia", - # The following countries cannot be replaced by the successor countries. - # 'East Germany', - # 'West Germany', - # 'North Yemen', - # 'South Yemen', -] - - -# Historical countries for which we don't have population, and can't be built from successor countries. -EXPECTED_COUNTRIES_WITHOUT_POPULATION = list( - set(HISTORIC_TO_CURRENT_REGION) - set(BUILD_POPULATION_FOR_HISTORICAL_COUNTRIES) -) - -# Overlaps found between historical regions and successor countries, that we accept in the data. -# We accept them either because they happened close to the transition, or to avoid needing to introduce new -# countries for which we do not have data (like the Russian Empire). -ACCEPTED_OVERLAPS = { - # 1991: {"Georgia", "USSR"}, -} - - -def gather_sources_from_tables( - tables: List[catalog.Table], -) -> List[catalog.meta.Source]: - """Gather unique sources from the metadata.dataset of each table in a list of tables. - - Note: To check if a source is already listed, only the name of the source is considered (not the description or any - other field in the source). - - Parameters - ---------- - tables : list - List of tables with metadata. - - Returns - ------- - known_sources : list - List of unique sources from all tables. - - """ - # Initialise list that will gather all unique metadata sources from the tables. - known_sources: List[catalog.meta.Source] = [] - for table in tables: - # Get list of sources of the dataset of current table. - table_sources = table.metadata.dataset.sources - # Go source by source of current table, and check if its name is not already in the list of known_sources. - for source in table_sources: - # Check if this source's name is different to all known_sources. - if all([source.name != known_source.name for known_source in known_sources]): - # Add the new source to the list. - known_sources.append(source) - - return known_sources - - -def get_countries_in_region( - region: str, region_modifications: Optional[Dict[str, Dict[str, List[str]]]] = None -) -> List[str]: - """Get countries in a region, both for known regions (e.g. "Africa") and custom ones (e.g. "Europe (excl. EU-27)"). - - Parameters - ---------- - region : str - Region name (e.g. "Africa", or "Europe (excl. EU-27)"). - region_modifications : dict or None - If None (or an empty dictionary), the region should be in OWID's countries-regions dataset. - If not None, it should be a dictionary with any (or all) of the following keys: - - "regions_included": List of regions whose countries will be included. - - "regions_excluded": List of regions whose countries will be excluded. - - "countries_included": List of additional individual countries to be included. - - "countries_excluded": List of additional individual countries to be excluded. - NOTE: All regions and countries defined in this dictionary should be in OWID's countries-regions dataset. - - Returns - ------- - countries : list - List of countries in the specified region. - - """ - if region_modifications is None: - region_modifications = {} - - # Check that the fields in the regions_modifications dictionary are well defined. - expected_fields = ["regions_included", "regions_excluded", "countries_included", "countries_excluded"] - assert all([field in expected_fields for field in region_modifications]) - - # Get lists of regions whose countries will be included and excluded. 
-    regions_included = region_modifications.get("regions_included", [region])
-    regions_excluded = region_modifications.get("regions_excluded", [])
-    # Get lists of additional individual countries to include and exclude.
-    countries_included = region_modifications.get("countries_included", [])
-    countries_excluded = region_modifications.get("countries_excluded", [])
-
-    # List countries from the list of regions included.
-    countries_set = set(
-        sum([geo.list_countries_in_region(region_included) for region_included in regions_included], [])
-    )
-
-    # Remove all countries from the list of regions excluded.
-    countries_set -= set(
-        sum([geo.list_countries_in_region(region_excluded) for region_excluded in regions_excluded], [])
-    )
-
-    # Add the list of individual countries to be included.
-    countries_set |= set(countries_included)
-
-    # Remove the list of individual countries to be excluded.
-    countries_set -= set(countries_excluded)
-
-    # Convert set of countries into a sorted list.
-    countries = sorted(countries_set)
-
-    return countries
-
-
-def load_population(regions: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
-    """Load OWID population dataset, and add historical regions to it.
-
-    Returns
-    -------
-    population : pd.DataFrame
-        Population dataset.
-
-    """
-    # Load population dataset.
-    population = catalog.Dataset(DATA_DIR / "garden/owid/latest/key_indicators/")["population"].reset_index()[
-        ["country", "year", "population"]
-    ]
-
-    # Add data for historical regions (if not in population) by adding the population of their current successors.
-    countries_with_population = population["country"].unique()
-
-    # Consider additional regions (e.g. historical regions).
-    if regions is None:
-        regions = {}
-    missing_countries = [country for country in regions if country not in countries_with_population]
-    for country in missing_countries:
-        members = regions[country]["regions_included"]
-        _population = (
-            population[population["country"].isin(members)]
-            .groupby("year")
-            .agg({"population": "sum", "country": "nunique"})
-            .reset_index()
-        )
-        # Select only years for which we have data for all member countries.
-        _population = _population[_population["country"] == len(members)].reset_index(drop=True)
-        _population["country"] = country
-        population = pd.concat([population, _population], ignore_index=True).reset_index(drop=True)
-
-    error = "Duplicate country-years found in population. Check if historical regions changed."
-    assert population[population.duplicated(subset=["country", "year"])].empty, error
-
-    return cast(pd.DataFrame, population)
-
-
-def load_income_groups() -> pd.DataFrame:
-    """Load dataset of income groups and add historical regions to it.
-
-    Returns
-    -------
-    income_groups : pd.DataFrame
-        Income groups data.
-
-    """
-    # Load the World Bank dataset for income groups.
-    income_groups = catalog.Dataset(DATA_DIR / "garden/wb/2021-07-01/wb_income")["wb_income_group"].reset_index()
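A stand-alone sketch of the successor-summing rule in load_population above: a historical region's population is the sum of its members' populations, kept only for years in which every member reports data. Toy numbers and an ahistorical year range, for illustration only:

import pandas as pd

population = pd.DataFrame(
    {"country": ["Czechia", "Czechia", "Slovakia"], "year": [1995, 1996, 1995], "population": [10.3, 10.3, 5.4]}
)
members = ["Czechia", "Slovakia"]

_population = (
    population[population["country"].isin(members)]
    .groupby("year")
    .agg({"population": "sum", "country": "nunique"})
    .reset_index()
)
# Keep only years where all members have data; 1996 is dropped (one member missing).
_population = _population[_population["country"] == len(members)]
print(_population)  # year 1995, population 15.7, country 2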
-    # Add historical regions to income groups.
-    for historic_region in HISTORIC_TO_CURRENT_REGION:
-        historic_region_income_group = HISTORIC_TO_CURRENT_REGION[historic_region]["income_group"]
-        # NOTE: Membership must be checked against the column values; `in income_groups["country"]` alone would
-        # check the (integer) index of the series instead.
-        if historic_region not in income_groups["country"].values:
-            historic_region_df = pd.DataFrame(
-                {
-                    "country": [historic_region],
-                    "income_group": [historic_region_income_group],
-                }
-            )
-            income_groups = pd.concat([income_groups, historic_region_df], ignore_index=True)
-
-    return cast(pd.DataFrame, income_groups)
-
-
-def add_population(
-    df: pd.DataFrame,
-    country_col: str = "country",
-    year_col: str = "year",
-    population_col: str = "population",
-    interpolate_missing_population: bool = False,
-    warn_on_missing_countries: bool = True,
-    show_full_warning: bool = True,
-    regions: Optional[Dict[Any, Any]] = None,
-    expected_countries_without_population: List[str] = [],
-) -> pd.DataFrame:
-    """Add a column of OWID population to the countries in the data, including population of historical regions.
-
-    This function has been adapted from datautils.geo, because population currently does not include historic regions.
-    We include them in this function.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Data without a column for population (after harmonizing country names).
-    country_col : str
-        Name of country column in data.
-    year_col : str
-        Name of year column in data.
-    population_col : str
-        Name for new population column in data.
-    interpolate_missing_population : bool
-        True to linearly interpolate population on years that are present in df, but for which we do not have
-        population data; otherwise False to keep missing population data as nans.
-        For example, if interpolate_missing_population is True and df has data for all years between 1900 and 1910,
-        but population is only given for 1900 and 1910, population will be linearly interpolated between those years.
-    warn_on_missing_countries : bool
-        True to warn if population is not found for any of the countries in the data.
-    show_full_warning : bool
-        True to show affected countries if the previous warning is raised.
-    regions : dict
-        Definitions of regions whose population also needs to be included.
-    expected_countries_without_population : list
-        Countries that are expected to not have population (that should be ignored if warnings are activated).
-
-    Returns
-    -------
-    df_with_population : pd.DataFrame
-        Data after adding a column for population for all countries in the data.
-
-    """
-
-    # Load population dataset.
-    population = load_population(regions=regions).rename(
-        columns={
-            "country": country_col,
-            "year": year_col,
-            "population": population_col,
-        }
-    )[[country_col, year_col, population_col]]
-
-    # Check if there is any missing country.
-    missing_countries = set(df[country_col]) - set(population[country_col])
-    if len(missing_countries) > 0:
-        if warn_on_missing_countries:
-            geo.warn_on_list_of_entities(
-                list_of_entities=missing_countries,
-                warning_message=(
-                    f"{len(missing_countries)} countries not found in population"
-                    " dataset. They will remain in the dataset, but have nan"
-                    " population."
-                ),
-                show_list=show_full_warning,
-            )
-
-    if interpolate_missing_population:
-        # For some countries we have population data only on certain years, e.g. 1900, 1910, etc.
-        # Optionally fill missing years linearly (a stand-alone sketch of this step follows below).
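Here is that sketch: the population series is reindexed onto every (country, year) pair present in the data, and gaps are filled linearly per country. Toy numbers; note that method="linear" treats rows as equally spaced.

import pandas as pd

population = pd.DataFrame({"country": ["France", "France"], "year": [1900, 1910], "population": [40.0, 41.0]})
countries_in_data = ["France"]
years_in_data = [1900, 1905, 1910]

# Reindex onto all country-years in the data, creating NaNs for missing years.
population = population.set_index(["country", "year"]).reindex(
    pd.MultiIndex.from_product([countries_in_data, years_in_data], names=["country", "year"])
)
# Interpolate within each country.
population = population.groupby("country").transform(
    lambda x: x.interpolate(method="linear", limit_direction="both")
)
print(population)  # 1905 filled with 40.5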
-        countries_in_data = df[country_col].unique()
-        years_in_data = df[year_col].unique()
-
-        population = population.set_index([country_col, year_col]).reindex(
-            pd.MultiIndex.from_product([countries_in_data, years_in_data], names=[country_col, year_col])
-        )
-
-        population = population.groupby(country_col).transform(
-            lambda x: x.interpolate(method="linear", limit_direction="both")
-        )
-
-    error = "The list of countries without population data differs from the list of expected countries without population data."
-    assert set(population[population[population_col].isnull()].reset_index()[country_col]) == set(
-        expected_countries_without_population
-    ), error
-
-    # Add population to original dataframe.
-    df_with_population = pd.merge(df, population, on=[country_col, year_col], how="left")
-
-    return df_with_population
-
-
-def detect_overlapping_regions(
-    df, index_columns, region_and_members, country_col="country", year_col="year", ignore_zeros=True
-):
-    """Detect years in which the data for two regions overlap, e.g. a historical region and one of its successors.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Data (with a dummy index).
-    index_columns : List[str]
-        Names of index columns.
-    region_and_members : Dict[str, Dict[str, Union[str, List[str]]]]
-        Regions to check for overlaps. Each region must have a dictionary "regions_included", listing the subregions
-        contained. If the region is historical, "regions_included" would be the list of successor countries.
-    country_col : str, optional
-        Name of country column (usually "country").
-    year_col : str, optional
-        Name of year column (usually "year").
-    ignore_zeros : bool, optional
-        True to ignore overlaps of zeros.
-
-    Returns
-    -------
-    all_overlaps : dict
-        All overlaps found.
-
-    """
-    # Sum over all columns to get the total sum of each column for each country-year.
-    df_total = (
-        df.groupby([country_col, year_col])
-        .agg({column: "sum" for column in df.columns if column not in index_columns})
-        .reset_index()
-    )
-    # Create a list of values that will be ignored in overlaps (usually zero or nothing).
-    if ignore_zeros:
-        overlapping_values_to_ignore = [0]
-    else:
-        overlapping_values_to_ignore = []
-    # List all variables in data (ignoring index columns).
-    variables = [column for column in df.columns if column not in index_columns]
-    # List all country names found in data.
-    countries_in_data = df[country_col].unique().tolist()
-    # List all regions found in data.
-    regions = [country for country in list(region_and_members) if country in countries_in_data]
-    # Initialize a dictionary that will store all overlaps found.
-    all_overlaps = {}
-    for region in regions:
-        # List members of current region.
-        members = [member for member in region_and_members[region]["regions_included"] if member in countries_in_data]
-        for member in members:
-            # Select data for current region.
-            region_values = (
-                df_total[df_total[country_col] == region]
-                .replace(overlapping_values_to_ignore, np.nan)
-                .dropna(subset=variables, how="all")
-            )
-            # Select data for current member.
-            member_values = (
-                df_total[df_total[country_col] == member]
-                .replace(overlapping_values_to_ignore, np.nan)
-                .dropna(subset=variables, how="all")
-            )
-            # Concatenate both selections of data, and select duplicated rows.
-            combined = pd.concat([region_values, member_values])
-            overlaps = combined[combined.duplicated(subset=[year_col], keep=False)]  # type: ignore
-            if len(overlaps) > 0:
-                # Add the overlap found to the dictionary of all overlaps.
-                all_overlaps.update({year: set(overlaps[country_col]) for year in overlaps[year_col].unique()})
-
-    # Sort overlaps conveniently.
-    all_overlaps = {year: all_overlaps[year] for year in sorted(list(all_overlaps))}
-
-    return all_overlaps
-
-
-def add_region_aggregates(
-    data: pd.DataFrame,
-    regions: Dict[Any, Any],
-    index_columns: List[str],
-    country_column: str = "country",
-    aggregates: Optional[Dict[str, str]] = None,
-) -> pd.DataFrame:
-    """Add region aggregates for all regions (which may include continents and income groups).
-
-    Parameters
-    ----------
-    data : pd.DataFrame
-        Data.
-    regions : dict
-        Regions to add, mapping each region name to its modifications (if any).
-    index_columns : list
-        Names of index columns.
-    country_column : str
-        Name of country column.
-    aggregates : dict or None
-        Dictionary of type of aggregation to use for each variable. If None, variables will be aggregated by summing.
-
-    Returns
-    -------
-    data : pd.DataFrame
-        Data after adding aggregate regions.
-
-    """
-    data = data.copy()
-
-    all_overlaps = detect_overlapping_regions(
-        df=data, region_and_members=HISTORIC_TO_CURRENT_REGION, index_columns=index_columns
-    )
-
-    # Check whether all accepted overlaps are found in the data, and that there are no new unknown overlaps.
-    error = "Either the list of accepted overlaps is not found in the data, or there are new unknown overlaps."
-    assert ACCEPTED_OVERLAPS == all_overlaps, error
-
-    if aggregates is None:
-        # If aggregations are not specified, assume all variables are to be aggregated by summing.
-        aggregates = {column: "sum" for column in data.columns if column not in index_columns}
-
-    for region in regions:
-        # List of countries in region.
-        countries_in_region = get_countries_in_region(region=region, region_modifications=regions[region])
-        # Select rows of data for member countries.
-        data_region = data[data[country_column].isin(countries_in_region)]
-
-        # Add region aggregates (sketched in stand-alone form below).
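Here is that sketch: member rows are grouped by the remaining index columns, aggregated (by default with "sum"), and relabelled as the region before being appended. Toy numbers; "Europe" stands in for whatever region is being built:

import pandas as pd

data = pd.DataFrame({"country": ["Spain", "France"], "year": [2020, 2020], "generation__wind__twh": [56.0, 40.0]})
aggregates = {"generation__wind__twh": "sum"}

region_df = (
    data.groupby("year")  # the index columns other than country
    .agg(aggregates)
    .reset_index()
    .assign(country="Europe")
)
data = pd.concat([data, region_df], ignore_index=True)
print(data)  # adds a "Europe" row with 96.0 TWh for 2020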
- region_df = ( - data_region.groupby([column for column in index_columns if column != country_column]) - .agg(aggregates) - .reset_index() - .assign(**{country_column: region}) - ) - data = pd.concat([data, region_df], ignore_index=True) - - return data diff --git a/etl/steps/archive/garden/ember/2022-12-13/yearly_electricity.countries.json b/etl/steps/archive/garden/ember/2022-12-13/yearly_electricity.countries.json deleted file mode 100644 index af909746415..00000000000 --- a/etl/steps/archive/garden/ember/2022-12-13/yearly_electricity.countries.json +++ /dev/null @@ -1,228 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "American Samoa": "American Samoa", - "Angola": "Angola", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas (the)": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Bosnia Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cayman Islands (the)": "Cayman Islands", - "Central African Republic (the)": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros (the)": "Comoros", - "Congo (the Democratic Republic of the)": "Democratic Republic of Congo", - "Congo (the)": "Congo", - "Cook Islands (the)": "Cook Islands", - "Costa Rica": "Costa Rica", - "Cote d'Ivoire": "Cote d'Ivoire", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cyprus": "Cyprus", - "Czechia": "Czechia", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic (the)": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Falkland Islands (the) [Malvinas]": "Falkland Islands", - "Faroe Islands (the)": "Faroe Islands", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Guiana": "French Guiana", - "French Polynesia": "French Polynesia", - "Gabon": "Gabon", - "Gambia (the)": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Gibraltar": "Gibraltar", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guam": "Guam", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hong Kong": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran (Islamic Republic of)": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Korea (the Democratic People's Republic of)": "North Korea", - "Kosovo": "Kosovo", - "Kuwait": "Kuwait", - 
"Kyrgyzstan": "Kyrgyzstan", - "Lao People's Democratic Republic (the)": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macao": "Macao", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mexico": "Mexico", - "Moldova": "Moldova", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger (the)": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palestine, State of": "Palestine", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines (the)": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Reunion": "Reunion", - "Romania": "Romania", - "Russian Federation (the)": "Russia", - "Rwanda": "Rwanda", - "Saint Helena, Ascension and Tristan da Cunha": "Saint Helena", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Korea": "South Korea", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan (the)": "Sudan", - "Suriname": "Suriname", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syrian Arab Republic (the)": "Syria", - "Taiwan": "Taiwan", - "Tajikistan": "Tajikistan", - "Tanzania, the United Republic of": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Turks and Caicos Islands (the)": "Turks and Caicos Islands", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United States of America": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela (Bolivarian Republic of)": "Venezuela", - "Viet Nam": "Vietnam", - "Virgin Islands (British)": "British Virgin Islands", - "Virgin Islands (U.S.)": "United States Virgin Islands", - "Western Sahara": "Western Sahara", - "World": "World", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "OECD": "OECD (Ember)", - "Africa": "Africa (Ember)", - "Asia": "Asia (Ember)", - "EU": "European Union (27) (Ember)", - "Europe": "Europe (Ember)", - "G20": "G20 (Ember)", - 
"G7": "G7 (Ember)", - "Latin America and Caribbean": "Latin America and Caribbean (Ember)", - "North America": "North America (Ember)", - "Oceania": "Oceania (Ember)" -} diff --git a/etl/steps/archive/garden/ember/2022-12-13/yearly_electricity.meta.yml b/etl/steps/archive/garden/ember/2022-12-13/yearly_electricity.meta.yml deleted file mode 100644 index 2e8d5986346..00000000000 --- a/etl/steps/archive/garden/ember/2022-12-13/yearly_electricity.meta.yml +++ /dev/null @@ -1,434 +0,0 @@ -dataset: - namespace: ember - version: 2022-12-13 - title: Yearly Electricity Data (Ember, 2022) - short_name: yearly_electricity - description: | - [Ember's region definitions](https://ember-climate.org/countries-and-regions/), denoted with "(Ember)", are: - * "G20 (Ember)" - Group of Twenty: Argentina, Australia, Brazil, Canada, China, France, Germany, India, Indonesia, Italy, Japan, Mexico, Russia, Saudi Arabia, South Africa, South Korea, Turkey, United Kingdom, United States and the 27 members of the European Union. - * "G7 (Ember)" - Group of Seven: Canada, France, Germany, Italy, Japan, United Kingdom and United States. - * "Latin America and Caribbean (Ember)": Antigua and Barbuda, Argentina, Bahamas, Barbados, Belize, Bolivia, Brazil, Chile, Colombia, Costa Rica, Cuba, Dominica, Dominican Republic, Ecuador, El Salvador, Grenada, Guatemala, Guyana, Haiti, Honduras, Jamaica, Mexico, Nicaragua, Panama, Paraguay, Peru, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Suriname, Trinidad and Tobago, Uruguay, Venezuela, Aruba, British Virgin Islands, Cayman Islands, Falkland Islands, French Guiana, Guadeloupe, Martinique, Montserrat, Puerto Rico, Turks and Caicos Islands and United States Virgin Islands. - * "OECD (Ember)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, United Kingdom, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, and United States. - sources: - - name: Our World in Data based on Ember's Yearly Electricity Data (2022). 
- published_by: Ember - publication_year: 2022 - date_accessed: 2022-12-13 - url: https://ember-climate.org/data-catalogue/yearly-electricity-data/ -tables: - capacity: - variables: - clean__gw: - title: Clean (GW) - short_unit: GW - unit: gigawatts - display: - name: Clean - fossil__gw: - title: Fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Fossil - gas_and_other_fossil__gw: - title: Gas and Other Fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas and Other Fossil - hydro__bioenergy_and_other_renewables__gw: - title: Hydro, Bioenergy and Other Renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro, Bioenergy and Other Renewables - renewables__gw: - title: Renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Renewables - wind_and_solar__gw: - title: Wind and Solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind and Solar - bioenergy__gw: - title: Bioenergy (GW) - short_unit: GW - unit: gigawatts - display: - name: Bioenergy - coal__gw: - title: Coal (GW) - short_unit: GW - unit: gigawatts - display: - name: Coal - gas__gw: - title: Gas (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas - hydro__gw: - title: Hydro (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro - nuclear__gw: - title: Nuclear (GW) - short_unit: GW - unit: gigawatts - display: - name: Nuclear - other_fossil__gw: - title: Other Fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Other Fossil - other_renewables__gw: - title: Other Renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Other Renewables - solar__gw: - title: Solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Solar - wind__gw: - title: Wind (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind - electricity_demand: - variables: - demand__twh: - title: Demand (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Demand - population: - title: Population - short_unit: people - unit: people - demand_per_capita__kwh: - title: Demand per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Demand per capita - electricity_generation: - variables: - clean__pct: - title: Clean (%) - short_unit: '%' - unit: '%' - display: - name: Clean - fossil__pct: - title: Fossil (%) - short_unit: '%' - unit: '%' - display: - name: Fossil - gas_and_other_fossil__pct: - title: Gas and Other Fossil (%) - short_unit: '%' - unit: '%' - display: - name: Gas and Other Fossil - hydro__bioenergy_and_other_renewables__pct: - title: Hydro, Bioenergy and Other Renewables (%) - short_unit: '%' - unit: '%' - display: - name: Hydro, Bioenergy and Other Renewables - renewables__pct: - title: Renewables (%) - short_unit: '%' - unit: '%' - display: - name: Renewables - wind_and_solar__pct: - title: Wind and Solar (%) - short_unit: '%' - unit: '%' - display: - name: Wind and Solar - clean__twh: - title: Clean (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Clean - fossil__twh: - title: Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Fossil - gas_and_other_fossil__twh: - title: Gas and Other Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas and Other Fossil - hydro__bioenergy_and_other_renewables__twh: - title: Hydro, Bioenergy and Other Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro, Bioenergy and Other Renewables - renewables__twh: - title: Renewables (TWh) - short_unit: TWh - unit: 
terawatt-hours - display: - name: Renewables - wind_and_solar__twh: - title: Wind and Solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind and Solar - bioenergy__pct: - title: Bioenergy (%) - short_unit: '%' - unit: '%' - display: - name: Bioenergy - coal__pct: - title: Coal (%) - short_unit: '%' - unit: '%' - display: - name: Coal - gas__pct: - title: Gas (%) - short_unit: '%' - unit: '%' - display: - name: Gas - hydro__pct: - title: Hydro (%) - short_unit: '%' - unit: '%' - display: - name: Hydro - nuclear__pct: - title: Nuclear (%) - short_unit: '%' - unit: '%' - display: - name: Nuclear - other_fossil__pct: - title: Other Fossil (%) - short_unit: '%' - unit: '%' - display: - name: Other Fossil - other_renewables__pct: - title: Other Renewables (%) - short_unit: '%' - unit: '%' - display: - name: Other Renewables - solar__pct: - title: Solar (%) - short_unit: '%' - unit: '%' - display: - name: Solar - wind__pct: - title: Wind (%) - short_unit: '%' - unit: '%' - display: - name: Wind - bioenergy__twh: - title: Bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Bioenergy - coal__twh: - title: Coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - gas__twh: - title: Gas (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas - hydro__twh: - title: Hydro (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro - nuclear__twh: - title: Nuclear (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - other_fossil__twh: - title: Other Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other Fossil - other_renewables__twh: - title: Other Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other Renewables - solar__twh: - title: Solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - wind__twh: - title: Wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - total_generation__twh: - title: Total Generation (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total Generation - electricity_imports: - variables: - net_imports__twh: - title: Net Imports (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Net Imports - power_sector_emissions: - variables: - clean__mtco2: - title: Clean (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Clean - fossil__mtco2: - title: Fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Fossil - gas_and_other_fossil__mtco2: - title: Gas and Other Fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Gas and Other Fossil - hydro__bioenergy_and_other_renewables__mtco2: - title: Hydro, Bioenergy and Other Renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro, Bioenergy and Other Renewables - renewables__mtco2: - title: Renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Renewables - wind_and_solar__mtco2: - title: Wind and Solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind and Solar - bioenergy__mtco2: - title: Bioenergy (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Bioenergy - coal__mtco2: - title: Coal (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Coal - gas__mtco2: - title: Gas (mtCO2) - short_unit: mtCO2 - unit: 
megatonnes of CO2 equivalent - display: - name: Gas - hydro__mtco2: - title: Hydro (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro - nuclear__mtco2: - title: Nuclear (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Nuclear - other_fossil__mtco2: - title: Other Fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other Fossil - other_renewables__mtco2: - title: Other Renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other Renewables - solar__mtco2: - title: Solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Solar - wind__mtco2: - title: Wind (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind - total_emissions__mtco2: - title: Total emissions (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Total emissions - total_generation__twh: - title: Total Generation (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total Generation - co2_intensity__gco2_kwh: - title: CO2 intensity (gCO2/kWh) - short_unit: gCO2/kWh - unit: grams of CO2 equivalent per kilowatt-hour - display: - name: CO2 intensity diff --git a/etl/steps/archive/garden/ember/2022-12-13/yearly_electricity.py b/etl/steps/archive/garden/ember/2022-12-13/yearly_electricity.py deleted file mode 100644 index f8fe6004c25..00000000000 --- a/etl/steps/archive/garden/ember/2022-12-13/yearly_electricity.py +++ /dev/null @@ -1,387 +0,0 @@ -"""Garden step for Ember's Yearly Electricity Data. - -""" - -import pandas as pd -from owid import catalog -from shared import CURRENT_DIR, add_population, add_region_aggregates, log - -from etl.data_helpers import geo -from etl.paths import DATA_DIR - -# Details for dataset to export. -DATASET_SHORT_NAME = "yearly_electricity" -# Details for dataset to import. -MEADOW_VERSION = "2022-12-13" -MEADOW_DATASET_PATH = DATA_DIR / f"meadow/ember/{MEADOW_VERSION}/{DATASET_SHORT_NAME}" - -COUNTRY_MAPPING_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.countries.json" -METADATA_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.meta.yml" - -# Aggregate regions to add, following OWID definitions. -# Regions and income groups to create by aggregating contributions from member countries. -# In the following dictionary, if nothing is stated, the region is supposed to be a default continent/income group. -# Otherwise, the dictionary can have "regions_included", "regions_excluded", "countries_included", and -# "countries_excluded". The aggregates will be calculated on the resulting countries. -REGIONS = { - # Default continents. - "Africa": {}, - "Asia": {}, - "Europe": {}, - "European Union (27)": {}, - "North America": {}, - "Oceania": {}, - "South America": {}, - # Ember already has data for "World". - # "World": {}, - # Income groups. - "Low-income countries": {}, - "Upper-middle-income countries": {}, - "Lower-middle-income countries": {}, - "High-income countries": {}, -} - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 -# Megatonnes to grams. -MT_TO_G = 1e12 - -# Map units (short version) to unit name (long version). -SHORT_UNIT_TO_UNIT = { - "TWh": "terawatt-hours", - "MWh": "megawatt-hours", - "kWh": "kilowatt-hours", - "mtCO2": "megatonnes of CO2 equivalent", - "gCO2/kWh": "grams of CO2 equivalent per kilowatt-hour", - "GW": "gigawatts", - "%": "%", -} - -# Categories expected to exist in the data. 
-CATEGORIES = [ - "Capacity", - "Electricity demand", - "Electricity generation", - "Electricity imports", - "Power sector emissions", -] - -# Choose columns for which region aggregates should be created. -SUM_AGGREGATES = [ - # "Bioenergy (%)", - "Bioenergy (GW)", - "Bioenergy (TWh)", - "Bioenergy (mtCO2)", - # "CO2 intensity (gCO2/kWh)", - # "Clean (%)", - "Clean (GW)", - "Clean (TWh)", - "Clean (mtCO2)", - # "Coal (%)", - "Coal (GW)", - "Coal (TWh)", - "Coal (mtCO2)", - "Demand (TWh)", - # "Demand per capita (MWh)", - # "Fossil (%)", - "Fossil (GW)", - "Fossil (TWh)", - "Fossil (mtCO2)", - # "Gas (%)", - "Gas (GW)", - "Gas (TWh)", - "Gas (mtCO2)", - "Gas and Other Fossil (%)", - "Gas and Other Fossil (GW)", - "Gas and Other Fossil (TWh)", - "Gas and Other Fossil (mtCO2)", - # "Hydro (%)", - "Hydro (GW)", - "Hydro (TWh)", - "Hydro (mtCO2)", - "Hydro, Bioenergy and Other Renewables (%)", - "Hydro, Bioenergy and Other Renewables (GW)", - "Hydro, Bioenergy and Other Renewables (TWh)", - "Hydro, Bioenergy and Other Renewables (mtCO2)", - "Net Imports (TWh)", - # "Nuclear (%)", - "Nuclear (GW)", - "Nuclear (TWh)", - "Nuclear (mtCO2)", - # "Other Fossil (%)", - "Other Fossil (GW)", - "Other Fossil (TWh)", - "Other Fossil (mtCO2)", - # "Other Renewables (%)", - "Other Renewables (GW)", - "Other Renewables (TWh)", - "Other Renewables (mtCO2)", - # "Renewables (%)", - "Renewables (GW)", - "Renewables (TWh)", - "Renewables (mtCO2)", - # "Solar (%)", - "Solar (GW)", - "Solar (TWh)", - "Solar (mtCO2)", - "Total Generation (TWh)", - "Total emissions (mtCO2)", - # "Wind (%)", - "Wind (GW)", - "Wind (TWh)", - "Wind (mtCO2)", - # "Wind and Solar (%)", - "Wind and Solar (GW)", - "Wind and Solar (TWh)", - "Wind and Solar (mtCO2)", -] - - -def prepare_yearly_electricity_data(tb_meadow: catalog.Table) -> pd.DataFrame: - """Prepare yearly electricity data using the raw table from meadow. - - Parameters - ---------- - tb_meadow : catalog.Table - Table from the yearly electricity dataset in meadow. - - Returns - ------- - df : pd.DataFrame - Yearly electricity data, in a dataframe format, with a dummy index, and only required columns. - - """ - # Make a dataframe out of the data in the table. - raw = pd.DataFrame(tb_meadow) - - # Select and rename columns conveniently. - columns = { - "area": "country", - "year": "year", - "variable": "variable", - "value": "value", - "unit": "unit", - "category": "category", - "subcategory": "subcategory", - } - df = raw.reset_index()[list(columns)].rename(columns=columns) - - # Sanity check. - assert set(df["category"]) == set(CATEGORIES), "Categories have changed in data." - - return df - - -def make_wide_table(df: pd.DataFrame, category: str) -> catalog.Table: - """Convert data from long to wide format for a specific category. - - This is a common processing for all categories in the data. - - Parameters - ---------- - df : pd.DataFrame - Data, after harmonizing country names. - category : str - Name of category (as defined above in CATEGORIES) to process. - - Returns - ------- - table : catalog.Table - Table in wide format. - - """ - # Select data for given category. - _df = df[df["category"] == category].copy() - - # Pivot dataframe to have a column for each variable. - table = catalog.Table(_df.pivot(index=["country", "year"], columns=["variable", "unit"], values="value")) - - # Get variable names, units, and variable-units (a name that combines both) for each column. - variable_units = [f"{variable} ({unit})" for variable, unit in table.columns] - - # Sanity check. 
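[Editor's aside] make_wide_table above reshapes the long (country, year, variable, unit, value) records into one column per variable, then collapses the (variable, unit) column MultiIndex into flat "variable (unit)" names, which is what the sanity check and column collapse just below operate on. A minimal self-contained sketch of that pattern, with made-up toy data:

import pandas as pd

# Toy long-format records mirroring the pivot in make_wide_table (values are illustrative only).
df = pd.DataFrame(
    {
        "country": ["Spain", "Spain"],
        "year": [2020, 2020],
        "variable": ["Coal", "Gas"],
        "unit": ["TWh", "TWh"],
        "value": [60.0, 70.0],
    }
)
wide = df.pivot(index=["country", "year"], columns=["variable", "unit"], values="value")
# Collapse the (variable, unit) column MultiIndex into flat "variable (unit)" names.
wide.columns = [f"{variable} ({unit})" for variable, unit in wide.columns]
print(list(wide.columns))  # ['Coal (TWh)', 'Gas (TWh)']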
- variables = table.columns.get_level_values(0).tolist() - units = table.columns.get_level_values(1).tolist() - assert len(variable_units) == len(units) == len(variables) - - # Collapse the two column levels into one, with the naming "variable (unit)" (except for country and year, that - have no units and are the indexes of the table). - table.columns = variable_units - - # Add region aggregates. - aggregates = {column: "sum" for column in SUM_AGGREGATES if column in table.columns} - - table = add_region_aggregates( - data=table.reset_index(), - index_columns=["country", "year"], - regions=REGIONS, - aggregates=aggregates, - ) - - return table - - -def make_table_electricity_generation(df: pd.DataFrame) -> catalog.Table: - """Create table with processed data of category "Electricity generation". - - Parameters - ---------- - df : pd.DataFrame - Data in long format for all categories, after harmonizing country names. - - Returns - ------- - table : catalog.Table - Table of processed data for the given category. - - """ - # Prepare wide table. - table = make_wide_table(df=df, category="Electricity generation") - - # Recalculate the share of electricity generated for region aggregates. - for column in table.columns: - if "(%)" in column: - # Find corresponding column with units instead of percentages. - value_column = column.replace("(%)", "(TWh)") - if value_column not in table.columns: - raise ValueError(f"Column {value_column} not found.") - # Select only regions. - select_regions = table["country"].isin(list(REGIONS)) - table.loc[select_regions, column] = table[value_column] / table["Total Generation (TWh)"] * 100 - - return table - - -def make_table_electricity_demand(df: pd.DataFrame) -> catalog.Table: - """Create table with processed data of category "Electricity demand". - - Parameters - ---------- - df : pd.DataFrame - Data in long format for all categories, after harmonizing country names. - - Returns - ------- - table : catalog.Table - Table of processed data for the given category. - - """ - # Prepare wide table. - table = make_wide_table(df=df, category="Electricity demand") - - # Add population to data. - table = add_population(df=table, warn_on_missing_countries=False) - - # Recalculate demand per capita. - # We could do this only for region aggregates (since they do not have per capita values), - # but we do this for all countries, to ensure per-capita variables are consistent with our population data. - table["Demand per capita (kWh)"] = ( - pd.DataFrame(table)["Demand (TWh)"] * TWH_TO_KWH / pd.DataFrame(table)["population"] - ) - - # Delete the original demand per capita column. - table = table.drop(columns=["Demand per capita (MWh)"]) - - return table - - -def make_table_power_sector_emissions(df: pd.DataFrame) -> catalog.Table: - """Create table with processed data of category "Power sector emissions". - - Parameters - ---------- - df : pd.DataFrame - Data in long format for all categories, after harmonizing country names. - - Returns - ------- - table : catalog.Table - Table of processed data for the given category. - - """ - # Prepare wide table of emissions data. - table = make_wide_table(df=df, category="Power sector emissions") - - # Add carbon intensity. - # In principle this only needs to be done for region aggregates, but we do it for all countries and check that - the results are consistent with the original data. - # Prepare wide table also for electricity generation (required to calculate carbon intensity).
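[Editor's aside] The merge and recalculation that continue just below divide total emissions by total generation using the MT_TO_G and TWH_TO_KWH factors defined earlier in this file. The unit arithmetic on its own, as a standalone sketch with hypothetical numbers (values are illustrative only, not real data):

# Unit arithmetic behind the carbon-intensity recalculation below.
MT_TO_G = 1e12  # megatonnes -> grams
TWH_TO_KWH = 1e9  # terawatt-hours -> kilowatt-hours

emissions_mtco2 = 100.0  # hypothetical total power-sector emissions
generation_twh = 250.0  # hypothetical total generation
intensity_gco2_kwh = emissions_mtco2 * MT_TO_G / (generation_twh * TWH_TO_KWH)
print(intensity_gco2_kwh)  # 400.0 gCO2/kWh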
- electricity = make_wide_table(df=df, category="Electricity generation")[ - ["country", "year", "Total Generation (TWh)"] - ] - # Add total electricity generation to emissions table. - table = pd.merge(table, electricity, on=["country", "year"], how="left") - # Rename the original carbon intensity column as a temporary column called "check". - intensity_col = "CO2 intensity (gCO2/kWh)" - table = table.rename(columns={intensity_col: "check"}) - # Calculate carbon intensity for all countries and regions. - table[intensity_col] = ( - pd.DataFrame(table)["Total emissions (mtCO2)"] * MT_TO_G / (table["Total Generation (TWh)"] * TWH_TO_KWH) - ) - - # Check that the new carbon intensities agree (within 1% mean absolute percentage error, aka MAPE) with the - original ones (where carbon intensity was given, namely for countries, not aggregate regions). - mape = 100 * abs(table.dropna(subset="check")[intensity_col] - table["check"].dropna()) / table["check"].dropna() - assert mape.max() < 1, "Calculated carbon intensities differ from original ones by more than 1 percent." - - # Remove temporary column. - table = table.drop(columns=["check"]) - - return table - - -def run(dest_dir: str) -> None: - log.info(f"{DATASET_SHORT_NAME}.start") - - # - # Load data. - # - # Read dataset from meadow. - ds_meadow = catalog.Dataset(MEADOW_DATASET_PATH) - # Get table from dataset. - tb_meadow = ds_meadow[DATASET_SHORT_NAME] - # Make a dataframe out of the data in the table, with the required columns. - df = prepare_yearly_electricity_data(tb_meadow) - - # - # Process data. - # - # Harmonize country names. - log.info(f"{DATASET_SHORT_NAME}.harmonize_countries") - df = geo.harmonize_countries(df=df, countries_file=str(COUNTRY_MAPPING_PATH)) - - # Split data into different tables, one per category, and process each one individually. - log.info(f"{DATASET_SHORT_NAME}.prepare_wide_tables") - tables = { - "Capacity": make_wide_table(df=df, category="Capacity"), - "Electricity demand": make_table_electricity_demand(df=df), - "Electricity generation": make_table_electricity_generation(df=df), - "Electricity imports": make_wide_table(df=df, category="Electricity imports"), - "Power sector emissions": make_table_power_sector_emissions(df=df), - } - - # - # Save outputs. - # - # Create a new dataset with the same metadata as in Meadow. - ds_garden = catalog.Dataset.create_empty(dest_dir, metadata=ds_meadow.metadata) - - # Add all tables to dataset. - for table_name, table in tables.items(): - # Set index and sort conveniently. - table = table.set_index(["country", "year"], verify_integrity=True).sort_index() - # Convert column names to lower snake case. - table = catalog.utils.underscore_table(table) - # Prepare table basic metadata. - table.metadata.title = table_name - table.metadata.short_name = catalog.utils.underscore(table_name) - # Add table to dataset. - ds_garden.add(table) - - # Update metadata attributes using the yaml file. - ds_garden.update_metadata(METADATA_PATH) - # Create dataset.
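[Editor's aside, before the ds_garden.save() call that follows] Stepping back to the tolerance check in make_table_power_sector_emissions above: it computes a per-row absolute percentage error between original and recomputed intensities and requires the maximum to stay under 1%. A tiny hedged illustration with made-up values:

import pandas as pd

# Illustrative only: original vs recomputed carbon intensities.
original = pd.Series([400.0, 250.0, 180.0])
recomputed = pd.Series([401.0, 249.0, 180.5])
mape = 100 * abs(recomputed - original) / original
assert mape.max() < 1  # all rows agree within 1%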
- ds_garden.save() - - log.info(f"{DATASET_SHORT_NAME}.end") diff --git a/etl/steps/archive/garden/ember/2023-02-20/combined_electricity.meta.yml b/etl/steps/archive/garden/ember/2023-02-20/combined_electricity.meta.yml deleted file mode 100644 index b40a4832b59..00000000000 --- a/etl/steps/archive/garden/ember/2023-02-20/combined_electricity.meta.yml +++ /dev/null @@ -1,1096 +0,0 @@ -dataset: - namespace: ember - version: 2023-02-20 - title: Combined Electricity Data (Ember, 2023) - short_name: combined_electricity - sources: - - - name: Our World in Data based on Ember's Yearly Electricity Data (2023). - published_by: Ember - publication_year: 2023 - publication_date: 2023-01-31 - date_accessed: 2023-02-20 - url: https://ember-climate.org/data-catalogue/yearly-electricity-data/ - - - name: Our World in Data based on Ember's European Electricity Review (2022). - published_by: Ember - publication_year: 2022 - publication_date: 2022-02-01 - date_accessed: 2022-08-01 - url: https://ember-climate.org/insights/research/european-electricity-review-2022/ - -tables: - european_electricity_review: - title: European electricity review - variables: - demand__total_demand__twh: - title: Demand - Total demand (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total demand - demand__total_demand_per_capita__kwh: - title: Demand - Total demand per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Total per capita demand - emissions__co2_intensity__gco2_kwh: - title: Emissions - CO2 intensity (gCO2/kWh) - short_unit: gCO2/kWh - unit: grams of CO2 equivalent per kilowatt-hour - display: - name: CO2 intensity - emissions__total_emissions__mtco2: - title: Emissions - Total emissions (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Total emissions - generation__bioenergy__pct: - title: Generation - Bioenergy (%) - short_unit: "%" - unit: "%" - display: - name: Bioenergy generation - generation__bioenergy__twh: - title: Generation - Bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Bioenergy generation - generation__clean__pct: - title: Generation - Clean (%) - short_unit: "%" - unit: "%" - display: - name: Clean generation - generation__clean__twh: - title: Generation - Clean (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Clean generation - generation__coal__pct: - title: Generation - Coal (%) - short_unit: "%" - unit: "%" - display: - name: Coal generation - generation__coal__twh: - title: Generation - Coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal generation - generation__fossil__pct: - title: Generation - Fossil (%) - short_unit: "%" - unit: "%" - display: - name: Fossil generation - generation__fossil__twh: - title: Generation - Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Fossil generation - generation__gas__pct: - title: Generation - Gas (%) - short_unit: "%" - unit: "%" - display: - name: Gas generation - generation__gas__twh: - title: Generation - Gas (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas generation - generation__gas_and_other_fossil__pct: - title: Generation - Gas and other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Gas and other fossil generation - generation__gas_and_other_fossil__twh: - title: Generation - Gas and other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas and other fossil generation - generation__hard_coal__pct: - title: Generation - 
Hard coal (%) - short_unit: "%" - unit: "%" - display: - name: Hard coal generation - generation__hard_coal__twh: - title: Generation - Hard coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hard coal generation - generation__hydro__bioenergy_and_other_renewables__pct: - title: Generation - Hydro bioenergy and other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__bioenergy_and_other_renewables__twh: - title: Generation - Hydro bioenergy and other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__pct: - title: Generation - Hydro (%) - short_unit: "%" - unit: "%" - display: - name: Hydro generation - generation__hydro__twh: - title: Generation - Hydro (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro generation - generation__lignite__pct: - title: Generation - Lignite (%) - short_unit: "%" - unit: "%" - display: - name: Lignite generation - generation__lignite__twh: - title: Generation - Lignite (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Lignite generation - generation__nuclear__pct: - title: Generation - Nuclear (%) - short_unit: "%" - unit: "%" - display: - name: Nuclear generation - generation__nuclear__twh: - title: Generation - Nuclear (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear generation - generation__other_fossil__pct: - title: Generation - Other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Other fossil generation - generation__other_fossil__twh: - title: Generation - Other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other fossil generation - generation__other_renewables__pct: - title: Generation - Other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Other renewables generation - generation__other_renewables__twh: - title: Generation - Other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables generation - generation__renewables__pct: - title: Generation - Renewables (%) - short_unit: "%" - unit: "%" - display: - name: Renewables generation - generation__renewables__twh: - title: Generation - Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Renewables generation - generation__solar__pct: - title: Generation - Solar (%) - short_unit: "%" - unit: "%" - display: - name: Solar generation - generation__solar__twh: - title: Generation - Solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar generation - generation__total_generation__twh: - title: Generation - Total - short_unit: TWh - unit: terawatt-hours - display: - name: Total generation - generation__wind__pct: - title: Generation - Wind (%) - short_unit: "%" - unit: "%" - display: - name: Wind generation - generation__wind__twh: - title: Generation - Wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind generation - generation__wind_and_solar__pct: - title: Generation - Wind and solar (%) - short_unit: "%" - unit: "%" - display: - name: Wind and solar generation - generation__wind_and_solar__twh: - title: Generation - Wind and solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind and solar generation - imports__total_net_imports__twh: - title: Imports - Total net imports (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total net imports - 
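[Editor's aside] The yearly_electricity and combined_electricity tables that follow repeat the same schema as the block above (tables, then variables, then title/short_unit/unit/display fields per variable). For orientation, a minimal sketch of reading one entry with plain PyYAML; the ETL itself applies this file through the dataset's metadata update step, and the file path here is an assumption for illustration:

import yaml

# Navigate the tables -> variables -> fields hierarchy of a .meta.yml file.
with open("combined_electricity.meta.yml") as f:
    meta = yaml.safe_load(f)

var = meta["tables"]["european_electricity_review"]["variables"]["demand__total_demand__twh"]
print(var["title"])  # Demand - Total demand (TWh)
print(var["unit"])  # terawatt-hours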
yearly_electricity: - title: Yearly Electricity Data - variables: - capacity__bioenergy__gw: - title: Capacity - Bioenergy (GW) - short_unit: GW - unit: gigawatts - display: - name: Bioenergy capacity - capacity__clean__gw: - title: Capacity - Clean (GW) - short_unit: GW - unit: gigawatts - display: - name: Clean capacity - capacity__coal__gw: - title: Capacity - Coal (GW) - short_unit: GW - unit: gigawatts - display: - name: Coal capacity - capacity__fossil__gw: - title: Capacity - Fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Fossil capacity - capacity__gas__gw: - title: Capacity - Gas (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas capacity - capacity__gas_and_other_fossil__gw: - title: Capacity - Gas and other fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas and other fossil capacity - capacity__hydro__bioenergy_and_other_renewables__gw: - title: Capacity - Hydro bioenergy and other renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro bioenergy and other renewables capacity - capacity__hydro__gw: - title: Capacity - Hydro (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro capacity - capacity__nuclear__gw: - title: Capacity - Nuclear (GW) - short_unit: GW - unit: gigawatts - display: - name: Nuclear capacity - capacity__other_fossil__gw: - title: Capacity - Other fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Other fossil capacity - capacity__other_renewables__gw: - title: Capacity - Other renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Other renewables capacity - capacity__renewables__gw: - title: Capacity - Renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Renewables capacity - capacity__solar__gw: - title: Capacity - Solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Solar capacity - capacity__wind__gw: - title: Capacity - Wind (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind capacity - capacity__wind_and_solar__gw: - title: Capacity - Wind and solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind and solar capacity - demand__total_demand__twh: - title: Demand - Total demand (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total demand - demand__total_demand_per_capita__kwh: - title: Demand - Total demand per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Total per capita demand - emissions__bioenergy__mtco2: - title: Emissions - Bioenergy (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Bioenergy emissions - emissions__clean__mtco2: - title: Emissions - Clean (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Clean emissions - emissions__co2_intensity__gco2_kwh: - title: Emissions - CO2 intensity (gCO2/kWh) - short_unit: gCO2/kWh - unit: grams of CO2 equivalent per kilowatt-hour - display: - name: CO2 intensity - emissions__coal__mtco2: - title: Emissions - Coal (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Coal emissions - emissions__fossil__mtco2: - title: Emissions - Fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Fossil emissions - emissions__gas__mtco2: - title: Emissions - Gas (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Gas emissions - emissions__gas_and_other_fossil__mtco2: - title: Emissions - Gas and other fossil (mtCO2) - short_unit: mtCO2 - unit: 
megatonnes of CO2 equivalent - display: - name: Gas and other fossil emissions - emissions__hydro__bioenergy_and_other_renewables__mtco2: - title: Emissions - Hydro bioenergy and other renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro bioenergy and other renewables emissions - emissions__hydro__mtco2: - title: Emissions - Hydro (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro emissions - emissions__nuclear__mtco2: - title: Emissions - Nuclear (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Nuclear emissions - emissions__other_fossil__mtco2: - title: Emissions - Other fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other fossil emissions - emissions__other_renewables__mtco2: - title: Emissions - Other renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other renewables emissions - emissions__renewables__mtco2: - title: Emissions - Renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Renewables emissions - emissions__solar__mtco2: - title: Emissions - Solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Solar emissions - emissions__total_emissions__mtco2: - title: Emissions - Total - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Total emissions - emissions__wind__mtco2: - title: Emissions - Wind (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind emissions - emissions__wind_and_solar__mtco2: - title: Emissions - Wind and solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind and solar emissions - generation__bioenergy__pct: - title: Generation - Bioenergy (%) - short_unit: "%" - unit: "%" - display: - name: Bioenergy generation - generation__bioenergy__twh: - title: Generation - Bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Bioenergy generation - generation__clean__pct: - title: Generation - Clean (%) - short_unit: "%" - unit: "%" - display: - name: Clean generation - generation__clean__twh: - title: Generation - Clean (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Clean generation - generation__coal__pct: - title: Generation - Coal (%) - short_unit: "%" - unit: "%" - display: - name: Coal generation - generation__coal__twh: - title: Generation - Coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal generation - generation__fossil__pct: - title: Generation - Fossil (%) - short_unit: "%" - unit: "%" - display: - name: Fossil generation - generation__fossil__twh: - title: Generation - Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Fossil generation - generation__gas__pct: - title: Generation - Gas (%) - short_unit: "%" - unit: "%" - display: - name: Gas generation - generation__gas__twh: - title: Generation - Gas (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas generation - generation__gas_and_other_fossil__pct: - title: Generation - Gas and other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Gas and other fossil generation - generation__gas_and_other_fossil__twh: - title: Generation - Gas and other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas and other fossil generation - generation__hydro__bioenergy_and_other_renewables__pct: - title: 
Generation - Hydro bioenergy and other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__bioenergy_and_other_renewables__twh: - title: Generation - Hydro bioenergy and other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__pct: - title: Generation - Hydro (%) - short_unit: "%" - unit: "%" - display: - name: Hydro generation - generation__hydro__twh: - title: Generation - Hydro (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro generation - generation__nuclear__pct: - title: Generation - Nuclear (%) - short_unit: "%" - unit: "%" - display: - name: Nuclear generation - generation__nuclear__twh: - title: Generation - Nuclear (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear generation - generation__other_fossil__pct: - title: Generation - Other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Other fossil generation - generation__other_fossil__twh: - title: Generation - Other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other fossil generation - generation__other_renewables__pct: - title: Generation - Other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Other renewables generation - generation__other_renewables__twh: - title: Generation - Other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables generation - generation__renewables__pct: - title: Generation - Renewables (%) - short_unit: "%" - unit: "%" - display: - name: Renewables generation - generation__renewables__twh: - title: Generation - Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Renewables generation - generation__solar__pct: - title: Generation - Solar (%) - short_unit: "%" - unit: "%" - display: - name: Solar generation - generation__solar__twh: - title: Generation - Solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar generation - generation__total_generation__twh: - title: Generation - Total - short_unit: TWh - unit: terawatt-hours - display: - name: Total generation - generation__wind__pct: - title: Generation - Wind (%) - short_unit: "%" - unit: "%" - display: - name: Wind generation - generation__wind__twh: - title: Generation - Wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind generation - generation__wind_and_solar__pct: - title: Generation - Wind and solar (%) - short_unit: "%" - unit: "%" - display: - name: Wind and solar generation - generation__wind_and_solar__twh: - title: Generation - Wind and solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind and solar generation - imports__total_net_imports__twh: - title: Imports - Total net imports (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total net imports - combined_electricity: - title: Electricity - variables: - capacity__bioenergy__gw: - title: Capacity - Bioenergy (GW) - short_unit: GW - unit: gigawatts - display: - name: Bioenergy capacity - capacity__clean__gw: - title: Capacity - Clean (GW) - short_unit: GW - unit: gigawatts - display: - name: Clean capacity - capacity__coal__gw: - title: Capacity - Coal (GW) - short_unit: GW - unit: gigawatts - display: - name: Coal capacity - capacity__fossil__gw: - title: Capacity - Fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Fossil capacity - capacity__gas__gw: - 
title: Capacity - Gas (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas capacity - capacity__gas_and_other_fossil__gw: - title: Capacity - Gas and other fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas and other fossil capacity - capacity__hydro__bioenergy_and_other_renewables__gw: - title: Capacity - Hydro bioenergy and other renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro bioenergy and other renewables capacity - capacity__hydro__gw: - title: Capacity - Hydro (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro capacity - capacity__nuclear__gw: - title: Capacity - Nuclear (GW) - short_unit: GW - unit: gigawatts - display: - name: Nuclear capacity - capacity__other_fossil__gw: - title: Capacity - Other fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Other fossil capacity - capacity__other_renewables__gw: - title: Capacity - Other renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Other renewables capacity - capacity__renewables__gw: - title: Capacity - Renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Renewables capacity - capacity__solar__gw: - title: Capacity - Solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Solar capacity - capacity__wind__gw: - title: Capacity - Wind (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind capacity - capacity__wind_and_solar__gw: - title: Capacity - Wind and solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind and solar capacity - demand__total_demand__twh: - title: Demand - Total demand (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total demand - demand__total_demand_per_capita__kwh: - title: Demand - Total demand per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Total per capita demand - emissions__bioenergy__mtco2: - title: Emissions - Bioenergy (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Bioenergy emissions - emissions__clean__mtco2: - title: Emissions - Clean (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Clean emissions - emissions__co2_intensity__gco2_kwh: - title: Emissions - CO2 intensity (gCO2/kWh) - short_unit: gCO2/kWh - unit: grams of CO2 equivalent per kilowatt-hour - display: - name: CO2 intensity - emissions__coal__mtco2: - title: Emissions - Coal (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Coal emissions - emissions__fossil__mtco2: - title: Emissions - Fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Fossil emissions - emissions__gas__mtco2: - title: Emissions - Gas (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Gas emissions - emissions__gas_and_other_fossil__mtco2: - title: Emissions - Gas and other fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Gas and other fossil emissions - emissions__hydro__bioenergy_and_other_renewables__mtco2: - title: Emissions - Hydro bioenergy and other renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro bioenergy and other renewables emissions - emissions__hydro__mtco2: - title: Emissions - Hydro (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro emissions - emissions__nuclear__mtco2: - title: Emissions - Nuclear (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 
equivalent - display: - name: Nuclear emissions - emissions__other_fossil__mtco2: - title: Emissions - Other fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other fossil emissions - emissions__other_renewables__mtco2: - title: Emissions - Other renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other renewables emissions - emissions__renewables__mtco2: - title: Emissions - Renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Renewables emissions - emissions__solar__mtco2: - title: Emissions - Solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Solar emissions - emissions__total_emissions__mtco2: - title: Emissions - Total - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Total emissions - emissions__wind__mtco2: - title: Emissions - Wind (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind emissions - emissions__wind_and_solar__mtco2: - title: Emissions - Wind and solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind and solar emissions - generation__bioenergy__pct: - title: Generation - Bioenergy (%) - short_unit: "%" - unit: "%" - display: - name: Bioenergy generation - generation__bioenergy__twh: - title: Generation - Bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Bioenergy generation - generation__clean__pct: - title: Generation - Clean (%) - short_unit: "%" - unit: "%" - display: - name: Clean generation - generation__clean__twh: - title: Generation - Clean (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Clean generation - generation__coal__pct: - title: Generation - Coal (%) - short_unit: "%" - unit: "%" - display: - name: Coal generation - generation__coal__twh: - title: Generation - Coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal generation - generation__fossil__pct: - title: Generation - Fossil (%) - short_unit: "%" - unit: "%" - display: - name: Fossil generation - generation__fossil__twh: - title: Generation - Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Fossil generation - generation__gas__pct: - title: Generation - Gas (%) - short_unit: "%" - unit: "%" - display: - name: Gas generation - generation__gas__twh: - title: Generation - Gas (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas generation - generation__gas_and_other_fossil__pct: - title: Generation - Gas and other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Gas and other fossil generation - generation__gas_and_other_fossil__twh: - title: Generation - Gas and other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas and other fossil generation - generation__hard_coal__pct: - title: Generation - Hard coal (%) - short_unit: "%" - unit: "%" - display: - name: Hard coal generation - generation__hard_coal__twh: - title: Generation - Hard coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hard coal generation - generation__hydro__bioenergy_and_other_renewables__pct: - title: Generation - Hydro bioenergy and other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__bioenergy_and_other_renewables__twh: - title: Generation - Hydro bioenergy and other renewables (TWh) - short_unit: TWh - unit: 
terawatt-hours - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__pct: - title: Generation - Hydro (%) - short_unit: "%" - unit: "%" - display: - name: Hydro generation - generation__hydro__twh: - title: Generation - Hydro (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro generation - generation__lignite__pct: - title: Generation - Lignite (%) - short_unit: "%" - unit: "%" - display: - name: Lignite generation - generation__lignite__twh: - title: Generation - Lignite (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Lignite generation - generation__nuclear__pct: - title: Generation - Nuclear (%) - short_unit: "%" - unit: "%" - display: - name: Nuclear generation - generation__nuclear__twh: - title: Generation - Nuclear (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear generation - generation__other_fossil__pct: - title: Generation - Other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Other fossil generation - generation__other_fossil__twh: - title: Generation - Other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other fossil generation - generation__other_renewables__pct: - title: Generation - Other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Other renewables generation - generation__other_renewables__twh: - title: Generation - Other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables generation - generation__renewables__pct: - title: Generation - Renewables (%) - short_unit: "%" - unit: "%" - display: - name: Renewables generation - generation__renewables__twh: - title: Generation - Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Renewables generation - generation__solar__pct: - title: Generation - Solar (%) - short_unit: "%" - unit: "%" - display: - name: Solar generation - generation__solar__twh: - title: Generation - Solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar generation - generation__total_generation__twh: - title: Generation - Total - short_unit: TWh - unit: terawatt-hours - display: - name: Total generation - generation__wind__pct: - title: Generation - Wind (%) - short_unit: "%" - unit: "%" - display: - name: Wind generation - generation__wind__twh: - title: Generation - Wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind generation - generation__wind_and_solar__pct: - title: Generation - Wind and solar (%) - short_unit: "%" - unit: "%" - display: - name: Wind and solar generation - generation__wind_and_solar__twh: - title: Generation - Wind and solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind and solar generation - imports__total_net_imports__twh: - title: Imports - Total net imports (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total net imports - net_flows: - title: Net flows - variables: - net_flow__twh: - title: Net flow (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Net flow diff --git a/etl/steps/archive/garden/ember/2023-02-20/combined_electricity.py b/etl/steps/archive/garden/ember/2023-02-20/combined_electricity.py deleted file mode 100644 index d2b1bde5c5d..00000000000 --- a/etl/steps/archive/garden/ember/2023-02-20/combined_electricity.py +++ /dev/null @@ -1,300 +0,0 @@ -"""Garden step that combines Ember's European Electricity Review (EER) 2022 and the latest Ember's Yearly Electricity -Data (YED). 
- -The YED dataset contains data for all countries in EER 2022. -However, YED starts in 2000, while EER 2022 starts in 1990. - -Therefore, to gather as much data as possible, we combine both datasets, prioritizing YED. - -This way, we'll have data from 1990-1999 from EER 2022, and data from 2000-2022 from YED. - -NOTES: -* This step used to combine Ember's Global Electricity Review and the EER, but now we have replaced the former with - the YED. However, there may be instances in the code where "global" refers to the YED. -* We don't use the latest EER 2023 because it does not contain data prior to 2000. - -""" - -import pandas as pd -from owid.catalog import Dataset, Table, utils -from owid.datautils import dataframes - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Define aggregates, following their Ember-Electricity-Data-Methodology document: -# https://ember-climate.org/app/uploads/2022/03/GER22-Methodology.pdf -# The European review also has its own methodology document: -# https://ember-climate.org/app/uploads/2022/02/EER-Methodology.pdf -# but it does not explicitly define aggregates. We assume they are consistent with each other. -# This will also be checked, along with other sanity checks, in a separate analysis. -AGGREGATES = { - "coal__twh": [ - "hard_coal__twh", - "lignite__twh", - ], - "wind_and_solar__twh": ["wind__twh", "solar__twh"], - "hydro__bioenergy_and_other_renewables__twh": [ - "hydro__twh", - "bioenergy__twh", - "other_renewables__twh", - ], - "renewables__twh": [ - "wind_and_solar__twh", - "hydro__bioenergy_and_other_renewables__twh", - ], - "clean__twh": [ - "renewables__twh", - "nuclear__twh", - ], - "gas_and_other_fossil__twh": [ - "gas__twh", - "other_fossil__twh", - ], - "fossil__twh": ["gas_and_other_fossil__twh", "coal__twh"], - "total_generation__twh": [ - "clean__twh", - "fossil__twh", - ], -} - - -def combine_yearly_electricity_data(ds_global: Dataset) -> Table: - """Combine all tables in Ember's Yearly Electricity Data into one table. - - Parameters - ---------- - ds_global : Dataset - Yearly Electricity dataset (containing tables for capacity, electricity demand, generation, imports and - emissions). - - Returns - ------- - combined_global : Table - Combined table containing all data in the Yearly Electricity dataset. - - """ - category_renaming = { - "capacity": "Capacity - ", - "electricity_demand": "", - "electricity_generation": "Generation - ", - "electricity_imports": "", - "power_sector_emissions": "Emissions - ", - } - error = "Tables in yearly electricity dataset have changed" - assert set(category_renaming) == set(ds_global.table_names), error - index_columns = ["country", "year"] - tables = [] - for category in category_renaming: - table = ds_global[category].copy() - table = table.rename( - columns={ - column: utils.underscore(category_renaming[category] + column) - for column in table.columns - if column not in index_columns - } - ) - table = table.reset_index() - tables.append(table) - - # Merge all tables into one, with an appropriate short name. - combined_global = dataframes.multi_merge(dfs=tables, on=index_columns, how="outer") - combined_global.metadata.short_name = "yearly_electricity" - - # Rename certain columns for consistency.
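[Editor's aside, before the consistency renames that continue below] A concrete example of the prefixing scheme above: utils.underscore turns the human-readable "Category - Variable (unit)" strings into the double-underscore column names seen throughout the meta.yml in this diff. The expected outputs below are inferred from those column names, not run output:

from owid.catalog import utils

# Per the column names documented in combined_electricity.meta.yml above:
print(utils.underscore("Capacity - Coal (GW)"))  # capacity__coal__gw
print(utils.underscore("Emissions - CO2 intensity (gCO2/kWh)"))  # emissions__co2_intensity__gco2_kwh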
- - # Rename certain columns for consistency. - combined_global = combined_global.rename( - columns={ - "net_imports__twh": "imports__total_net_imports__twh", - "demand__twh": "demand__total_demand__twh", - "demand_per_capita__kwh": "demand__total_demand_per_capita__kwh", - }, - errors="raise", - ) - - # Sanity check. - error = "Total generation columns in emissions and generation tables are not identical." - assert all( - combined_global["emissions__total_generation__twh"].fillna(-1) - == combined_global["generation__total_generation__twh"].fillna(-1) - ), error - - # Remove unnecessary columns and any possible rows with no data. - combined_global = combined_global.drop(columns=["population", "emissions__total_generation__twh"]).dropna(how="all") - - # Set a convenient index and sort rows and columns conveniently. - combined_global = ( - combined_global.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - ) - - return combined_global - - -def combine_european_electricity_review_data(ds_european: Dataset) -> Table: - """Combine tables in Ember's European Electricity Review dataset into one table. - - The tables to be combined are 'country_overview', 'generation', and 'emissions'. The remaining table on net flows - has a different structure and cannot be combined with the others, so it will remain as a separate table. - - Parameters - ---------- - ds_european : Dataset - European Electricity Review dataset. - - Returns - ------- - combined_european : Table - Combined table containing all data in the European Electricity Review dataset (except net flows). - - """ - index_columns = ["country", "year"] - # Extract the necessary tables from the dataset. - country_overview = ds_european["country_overview"].copy() - generation = ds_european["generation"].copy() - emissions = ds_european["emissions"].copy() - - # Create aggregates (defined in AGGREGATES) that are in yearly electricity but not in the european review. - for aggregate in AGGREGATES: - generation[aggregate] = pd.DataFrame(generation)[AGGREGATES[aggregate]].sum(axis=1) - - # Create a column for each of those new aggregates, giving percentage share of total generation. - for aggregate in AGGREGATES: - column = aggregate.replace("__twh", "__pct") - generation[column] = pd.DataFrame(generation)[aggregate] / generation["total_generation__twh"] * 100 - - # Check that total generation adds up to 100%. - error = "Total generation does not add up to 100%." - assert set(generation["total_generation__pct"]) == {100}, error - - # Check that the constructed "total generation" column agrees with the one given in table "country_overview". - columns = ["country", "year", "total_generation__twh"] - check = pd.merge( - ds_european["country_overview"].reset_index()[columns], - generation.reset_index()[columns], - on=index_columns, - ) - # Assert that the percentage change is smaller than 1%. - error = "Total generation does not agree with the one in country_overview." - assert all( - (abs(check["total_generation__twh_x"] - check["total_generation__twh_y"]) / check["total_generation__twh_x"]) - < 0.01 - ), error - - # Remove unnecessary columns. - generation = generation.drop(columns=["total_generation__pct", "total_generation__twh"])
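The aggregate and share construction above reduces to simple sums and ratios; a toy example with made-up numbers:

import pandas as pd

# Hypothetical generation data for one country-year.
generation = pd.DataFrame({"hard_coal__twh": [30.0], "lignite__twh": [20.0], "total_generation__twh": [200.0]})
# Aggregate as defined in AGGREGATES: coal = hard coal + lignite.
generation["coal__twh"] = generation[["hard_coal__twh", "lignite__twh"]].sum(axis=1)  # 50.0
# Share of total generation, as constructed above: 50 / 200 * 100 = 25%.
generation["coal__pct"] = generation["coal__twh"] / generation["total_generation__twh"] * 100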
- - # Rename all column names to start with the category, before combining all categories. - generation = generation.rename(columns={column: "generation__" + column for column in generation.columns}) - emissions = emissions.rename(columns={column: "emissions__" + column for column in emissions.columns}) - country_overview = country_overview.rename( - columns={ - "total_generation__twh": "generation__total_generation__twh", - "demand__twh": "demand__total_demand__twh", - "demand_per_capita__kwh": "demand__total_demand_per_capita__kwh", - "net_imports__twh": "imports__total_net_imports__twh", - }, - errors="raise", - ) - - # Combine tables into one dataframe. - combined_european = dataframes.multi_merge( - [ - country_overview.reset_index(), - emissions.reset_index(), - generation.reset_index(), - ], - on=index_columns, - how="outer", - ) - - # Assign an appropriate short name to the table. - combined_european.metadata.short_name = "european_electricity_review" - - # If any column was repeated in the merge, it will have a "_x" at the end of the name. - # Check that no other columns were repeated. - error = "There are repeated columns in the combined dataframe." - assert len([column for column in combined_european.columns if column.endswith("_x")]) == 0, error - - # Remove any possible rows with no data. - combined_european = combined_european.dropna(how="all") - - # Ensure that the index is well constructed. - combined_european = ( - combined_european.set_index(index_columns, verify_integrity=True).sort_index().sort_index(axis=1) - ) - - return combined_european - - -def combine_yearly_electricity_data_and_european_electricity_review( - combined_global: Table, combined_european: Table -) -> Table: - """Combine the combined table of the Yearly Electricity Data with the combined table of the European Electricity - Review. - - Parameters - ---------- - combined_global : Table - Table that combines all tables of the Yearly Electricity Data. - combined_european : Table - Table that combines all tables of the European Electricity Review (except net flows). - - Returns - ------- - combined : Table - Combined data. - - """ - # Combine (global) yearly electricity data with European data, prioritizing the former. - index_columns = ["country", "year"] - combined = dataframes.combine_two_overlapping_dataframes( - df1=combined_global.reset_index(), df2=combined_european.reset_index(), index_columns=index_columns - ) - - # Create a table (with no metadata) and sort data appropriately. - combined = ( - Table(combined, short_name="combined_electricity") - .set_index(index_columns, verify_integrity=True) - .sort_index() - .sort_index(axis=1) - ) - - return combined - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read yearly electricity data and european electricity review datasets from garden. - ds_global: Dataset = paths.load_dependency("yearly_electricity") - ds_european: Dataset = paths.load_dependency("european_electricity_review") - - # - # Process data. - # - # Combine all tables of the yearly electricity data into one. - combined_global = combine_yearly_electricity_data(ds_global=ds_global) - - # Combine all tables of the european electricity review into one. - combined_european = combine_european_electricity_review_data(ds_european=ds_european) - - # Combine yearly electricity and european reviews. - combined = combine_yearly_electricity_data_and_european_electricity_review( - combined_global=combined_global, combined_european=combined_european - ) - - # Create an additional table with the electricity net flows (only available in european review).
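To see the prioritization in combine_two_overlapping_dataframes above, here is a toy sketch with made-up values: values from df1 (YED) win wherever both inputs overlap, and df2 (EER 2022) fills the rest, which is how 1990-1999 survives from EER while 2000 onwards comes from YED.

import pandas as pd
from owid.datautils import dataframes

yed = pd.DataFrame({"country": ["Spain", "Spain"], "year": [2000, 2001], "coal__twh": [70.0, 68.0]})
eer = pd.DataFrame({"country": ["Spain", "Spain", "Spain"], "year": [1999, 2000, 2001], "coal__twh": [75.0, 71.0, 69.0]})
combined = dataframes.combine_two_overlapping_dataframes(df1=yed, df2=eer, index_columns=["country", "year"])
# Expected: 1999 keeps the EER value (75.0), while 2000 and 2001 keep the YED values (70.0, 68.0).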
- net_flows = ds_european["net_flows"].copy() - - # - # Save outputs. - # - # Create new garden dataset. - ds_garden = create_dataset( - dest_dir=dest_dir, - tables=[combined_global, combined_european, combined, net_flows], - default_metadata=ds_global.metadata, - ) - ds_garden.save() diff --git a/etl/steps/archive/garden/ember/2023-02-20/yearly_electricity.countries.json b/etl/steps/archive/garden/ember/2023-02-20/yearly_electricity.countries.json deleted file mode 100644 index af909746415..00000000000 --- a/etl/steps/archive/garden/ember/2023-02-20/yearly_electricity.countries.json +++ /dev/null @@ -1,228 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "American Samoa": "American Samoa", - "Angola": "Angola", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas (the)": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Bosnia Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cayman Islands (the)": "Cayman Islands", - "Central African Republic (the)": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros (the)": "Comoros", - "Congo (the Democratic Republic of the)": "Democratic Republic of Congo", - "Congo (the)": "Congo", - "Cook Islands (the)": "Cook Islands", - "Costa Rica": "Costa Rica", - "Cote d'Ivoire": "Cote d'Ivoire", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cyprus": "Cyprus", - "Czechia": "Czechia", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic (the)": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Falkland Islands (the) [Malvinas]": "Falkland Islands", - "Faroe Islands (the)": "Faroe Islands", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Guiana": "French Guiana", - "French Polynesia": "French Polynesia", - "Gabon": "Gabon", - "Gambia (the)": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Gibraltar": "Gibraltar", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guam": "Guam", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hong Kong": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran (Islamic Republic of)": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Korea (the Democratic People's Republic of)": "North Korea", - "Kosovo": 
"Kosovo", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Lao People's Democratic Republic (the)": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macao": "Macao", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mexico": "Mexico", - "Moldova": "Moldova", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger (the)": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palestine, State of": "Palestine", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines (the)": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Reunion": "Reunion", - "Romania": "Romania", - "Russian Federation (the)": "Russia", - "Rwanda": "Rwanda", - "Saint Helena, Ascension and Tristan da Cunha": "Saint Helena", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Korea": "South Korea", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan (the)": "Sudan", - "Suriname": "Suriname", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syrian Arab Republic (the)": "Syria", - "Taiwan": "Taiwan", - "Tajikistan": "Tajikistan", - "Tanzania, the United Republic of": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Turks and Caicos Islands (the)": "Turks and Caicos Islands", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United States of America": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela (Bolivarian Republic of)": "Venezuela", - "Viet Nam": "Vietnam", - "Virgin Islands (British)": "British Virgin Islands", - "Virgin Islands (U.S.)": "United States Virgin Islands", - "Western Sahara": "Western Sahara", - "World": "World", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "OECD": "OECD (Ember)", - "Africa": "Africa (Ember)", - "Asia": "Asia (Ember)", - "EU": "European Union (27) (Ember)", - "Europe": "Europe 
(Ember)", - "G20": "G20 (Ember)", - "G7": "G7 (Ember)", - "Latin America and Caribbean": "Latin America and Caribbean (Ember)", - "North America": "North America (Ember)", - "Oceania": "Oceania (Ember)" -} diff --git a/etl/steps/archive/garden/ember/2023-02-20/yearly_electricity.meta.yml b/etl/steps/archive/garden/ember/2023-02-20/yearly_electricity.meta.yml deleted file mode 100644 index 3229696770d..00000000000 --- a/etl/steps/archive/garden/ember/2023-02-20/yearly_electricity.meta.yml +++ /dev/null @@ -1,440 +0,0 @@ -dataset: - namespace: ember - version: 2023-02-20 - title: Yearly Electricity Data (Ember, 2023) - short_name: yearly_electricity - description: | - [Ember's region definitions](https://ember-climate.org/countries-and-regions/), denoted with "(Ember)", are: - * "G20 (Ember)" - Group of Twenty: Argentina, Australia, Brazil, Canada, China, France, Germany, India, Indonesia, Italy, Japan, Mexico, Russia, Saudi Arabia, South Africa, South Korea, Turkey, United Kingdom, United States and the 27 members of the European Union. - * "G7 (Ember)" - Group of Seven: Canada, France, Germany, Italy, Japan, United Kingdom and United States. - * "Latin America and Caribbean (Ember)": Antigua and Barbuda, Argentina, Bahamas, Barbados, Belize, Bolivia, Brazil, Chile, Colombia, Costa Rica, Cuba, Dominica, Dominican Republic, Ecuador, El Salvador, Grenada, Guatemala, Guyana, Haiti, Honduras, Jamaica, Mexico, Nicaragua, Panama, Paraguay, Peru, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Suriname, Trinidad and Tobago, Uruguay, Venezuela, Aruba, British Virgin Islands, Cayman Islands, Falkland Islands, French Guiana, Guadeloupe, Martinique, Montserrat, Puerto Rico, Turks and Caicos Islands and United States Virgin Islands. - * "OECD (Ember)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, United Kingdom, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, and United States. - sources: - - name: Our World in Data based on Ember's Yearly Electricity Data (2023). 
- published_by: Ember - publication_year: 2023 - publication_date: 2023-01-31 - date_accessed: 2023-02-20 - url: https://ember-climate.org/data-catalogue/yearly-electricity-data/ -tables: - capacity: - title: Capacity - variables: - clean__gw: - title: Clean (GW) - short_unit: GW - unit: gigawatts - display: - name: Clean - fossil__gw: - title: Fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Fossil - gas_and_other_fossil__gw: - title: Gas and Other Fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas and Other Fossil - hydro__bioenergy_and_other_renewables__gw: - title: Hydro, Bioenergy and Other Renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro, Bioenergy and Other Renewables - renewables__gw: - title: Renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Renewables - wind_and_solar__gw: - title: Wind and Solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind and Solar - bioenergy__gw: - title: Bioenergy (GW) - short_unit: GW - unit: gigawatts - display: - name: Bioenergy - coal__gw: - title: Coal (GW) - short_unit: GW - unit: gigawatts - display: - name: Coal - gas__gw: - title: Gas (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas - hydro__gw: - title: Hydro (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro - nuclear__gw: - title: Nuclear (GW) - short_unit: GW - unit: gigawatts - display: - name: Nuclear - other_fossil__gw: - title: Other Fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Other Fossil - other_renewables__gw: - title: Other Renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Other Renewables - solar__gw: - title: Solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Solar - wind__gw: - title: Wind (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind - electricity_demand: - title: Electricity demand - variables: - demand__twh: - title: Demand (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Demand - population: - title: Population - short_unit: people - unit: people - demand_per_capita__kwh: - title: Demand per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Demand per capita - electricity_generation: - title: Electricity generation - variables: - clean__pct: - title: Clean (%) - short_unit: '%' - unit: '%' - display: - name: Clean - fossil__pct: - title: Fossil (%) - short_unit: '%' - unit: '%' - display: - name: Fossil - gas_and_other_fossil__pct: - title: Gas and Other Fossil (%) - short_unit: '%' - unit: '%' - display: - name: Gas and Other Fossil - hydro__bioenergy_and_other_renewables__pct: - title: Hydro, Bioenergy and Other Renewables (%) - short_unit: '%' - unit: '%' - display: - name: Hydro, Bioenergy and Other Renewables - renewables__pct: - title: Renewables (%) - short_unit: '%' - unit: '%' - display: - name: Renewables - wind_and_solar__pct: - title: Wind and Solar (%) - short_unit: '%' - unit: '%' - display: - name: Wind and Solar - clean__twh: - title: Clean (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Clean - fossil__twh: - title: Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Fossil - gas_and_other_fossil__twh: - title: Gas and Other Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas and Other Fossil - hydro__bioenergy_and_other_renewables__twh: - title: Hydro, Bioenergy and Other Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro, 
Bioenergy and Other Renewables - renewables__twh: - title: Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Renewables - wind_and_solar__twh: - title: Wind and Solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind and Solar - bioenergy__pct: - title: Bioenergy (%) - short_unit: '%' - unit: '%' - display: - name: Bioenergy - coal__pct: - title: Coal (%) - short_unit: '%' - unit: '%' - display: - name: Coal - gas__pct: - title: Gas (%) - short_unit: '%' - unit: '%' - display: - name: Gas - hydro__pct: - title: Hydro (%) - short_unit: '%' - unit: '%' - display: - name: Hydro - nuclear__pct: - title: Nuclear (%) - short_unit: '%' - unit: '%' - display: - name: Nuclear - other_fossil__pct: - title: Other Fossil (%) - short_unit: '%' - unit: '%' - display: - name: Other Fossil - other_renewables__pct: - title: Other Renewables (%) - short_unit: '%' - unit: '%' - display: - name: Other Renewables - solar__pct: - title: Solar (%) - short_unit: '%' - unit: '%' - display: - name: Solar - wind__pct: - title: Wind (%) - short_unit: '%' - unit: '%' - display: - name: Wind - bioenergy__twh: - title: Bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Bioenergy - coal__twh: - title: Coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - gas__twh: - title: Gas (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas - hydro__twh: - title: Hydro (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro - nuclear__twh: - title: Nuclear (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - other_fossil__twh: - title: Other Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other Fossil - other_renewables__twh: - title: Other Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other Renewables - solar__twh: - title: Solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - wind__twh: - title: Wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - total_generation__twh: - title: Total Generation (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total Generation - electricity_imports: - title: Electricity imports - variables: - net_imports__twh: - title: Net Imports (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Net Imports - power_sector_emissions: - title: Power sector emissions - variables: - clean__mtco2: - title: Clean (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Clean - fossil__mtco2: - title: Fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Fossil - gas_and_other_fossil__mtco2: - title: Gas and Other Fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Gas and Other Fossil - hydro__bioenergy_and_other_renewables__mtco2: - title: Hydro, Bioenergy and Other Renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro, Bioenergy and Other Renewables - renewables__mtco2: - title: Renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Renewables - wind_and_solar__mtco2: - title: Wind and Solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind and Solar - bioenergy__mtco2: - title: Bioenergy (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Bioenergy - coal__mtco2: - title: 
Coal (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Coal - gas__mtco2: - title: Gas (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Gas - hydro__mtco2: - title: Hydro (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro - nuclear__mtco2: - title: Nuclear (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Nuclear - other_fossil__mtco2: - title: Other Fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other Fossil - other_renewables__mtco2: - title: Other Renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other Renewables - solar__mtco2: - title: Solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Solar - wind__mtco2: - title: Wind (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind - total_emissions__mtco2: - title: Total emissions (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Total emissions - total_generation__twh: - title: Total Generation (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total Generation - co2_intensity__gco2_kwh: - title: CO2 intensity (gCO2/kWh) - short_unit: gCO2/kWh - unit: grams of CO2 equivalent per kilowatt-hour - display: - name: CO2 intensity diff --git a/etl/steps/archive/garden/ember/2023-02-20/yearly_electricity.py b/etl/steps/archive/garden/ember/2023-02-20/yearly_electricity.py deleted file mode 100644 index c09d31d4822..00000000000 --- a/etl/steps/archive/garden/ember/2023-02-20/yearly_electricity.py +++ /dev/null @@ -1,534 +0,0 @@ -"""Garden step for Ember's Yearly Electricity Data. - -""" - -from typing import cast - -import numpy as np -import pandas as pd -from owid import catalog -from owid.catalog import Dataset, Table -from shared import add_population, add_region_aggregates, correct_data_points - -from etl.data_helpers import geo -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Corrections to the output tables. -# They are all the same correction: Remove aggregates for 2022, given that only EU countries have data for that year.
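The AMENDMENTS below are (row selection, values to overwrite) pairs. The actual correct_data_points lives in this step's shared.py, so this is only a rough, assumed sketch of its semantics:

import pandas as pd

def apply_one_correction(df: pd.DataFrame, selection: dict, correction: dict) -> pd.DataFrame:
    # Match rows where every selection column takes one of the listed values,
    # e.g. country in ["Europe", ...] and year in [2022].
    mask = pd.Series(True, index=df.index)
    for column, values in selection.items():
        mask &= df[column].isin(values)
    # Overwrite the affected cells; here, setting the 2022 aggregates to NA.
    for column, value in correction.items():
        df.loc[mask, column] = value
    return df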
-AMENDMENTS = { - "Capacity": [ - ( - { - "country": ["Europe", "Upper-middle-income countries", "High-income countries"], - "year": [2022], - }, - { - "Clean (GW)": pd.NA, - "Fossil (GW)": pd.NA, - "Gas and Other Fossil (GW)": pd.NA, - "Hydro, Bioenergy and Other Renewables (GW)": pd.NA, - "Renewables (GW)": pd.NA, - "Wind and Solar (GW)": pd.NA, - "Bioenergy (GW)": pd.NA, - "Coal (GW)": pd.NA, - "Gas (GW)": pd.NA, - "Hydro (GW)": pd.NA, - "Nuclear (GW)": pd.NA, - "Other Fossil (GW)": pd.NA, - "Other Renewables (GW)": pd.NA, - "Solar (GW)": pd.NA, - "Wind (GW)": pd.NA, - }, - ) - ], - "Electricity demand": [ - ( - { - "country": ["Europe", "Upper-middle-income countries", "High-income countries"], - "year": [2022], - }, - { - "Demand (TWh)": pd.NA, - "population": pd.NA, - "Demand per capita (kWh)": pd.NA, - }, - ) - ], - "Electricity generation": [ - ( - { - "country": ["Europe", "Upper-middle-income countries", "High-income countries"], - "year": [2022], - }, - { - "Clean (%)": pd.NA, - "Fossil (%)": pd.NA, - "Gas and Other Fossil (%)": pd.NA, - "Hydro, Bioenergy and Other Renewables (%)": pd.NA, - "Renewables (%)": pd.NA, - "Wind and Solar (%)": pd.NA, - "Clean (TWh)": pd.NA, - "Fossil (TWh)": pd.NA, - "Gas and Other Fossil (TWh)": pd.NA, - "Hydro, Bioenergy and Other Renewables (TWh)": pd.NA, - "Renewables (TWh)": pd.NA, - "Wind and Solar (TWh)": pd.NA, - "Bioenergy (%)": pd.NA, - "Coal (%)": pd.NA, - "Gas (%)": pd.NA, - "Hydro (%)": pd.NA, - "Nuclear (%)": pd.NA, - "Other Fossil (%)": pd.NA, - "Other Renewables (%)": pd.NA, - "Solar (%)": pd.NA, - "Wind (%)": pd.NA, - "Bioenergy (TWh)": pd.NA, - "Coal (TWh)": pd.NA, - "Gas (TWh)": pd.NA, - "Hydro (TWh)": pd.NA, - "Nuclear (TWh)": pd.NA, - "Other Fossil (TWh)": pd.NA, - "Other Renewables (TWh)": pd.NA, - "Solar (TWh)": pd.NA, - "Wind (TWh)": pd.NA, - "Total Generation (TWh)": pd.NA, - }, - ), - ], - "Electricity imports": [ - ( - { - "country": ["Europe", "Upper-middle-income countries", "High-income countries"], - "year": [2022], - }, - { - "Net Imports (TWh)": np.nan, - }, - ), - ], - "Power sector emissions": [ - ( - { - "country": ["Europe", "Upper-middle-income countries", "High-income countries"], - "year": [2022], - }, - { - "Clean (mtCO2)": pd.NA, - "Fossil (mtCO2)": pd.NA, - "Gas and Other Fossil (mtCO2)": pd.NA, - "Hydro, Bioenergy and Other Renewables (mtCO2)": pd.NA, - "Renewables (mtCO2)": pd.NA, - "Wind and Solar (mtCO2)": pd.NA, - "Bioenergy (mtCO2)": pd.NA, - "Coal (mtCO2)": pd.NA, - "Gas (mtCO2)": pd.NA, - "Hydro (mtCO2)": pd.NA, - "Nuclear (mtCO2)": pd.NA, - "Other Fossil (mtCO2)": pd.NA, - "Other Renewables (mtCO2)": pd.NA, - "Solar (mtCO2)": pd.NA, - "Wind (mtCO2)": pd.NA, - "Total emissions (mtCO2)": pd.NA, - "Total Generation (TWh)": pd.NA, - "CO2 intensity (gCO2/kWh)": pd.NA, - }, - ), - ], -} - -# Aggregate regions to add, following OWID definitions. -# Regions and income groups to create by aggregating contributions from member countries. -# In the following dictionary, if nothing is stated, the region is supposed to be a default continent/income group. -# Otherwise, the dictionary can have "regions_included", "regions_excluded", "countries_included", and -# "countries_excluded". The aggregates will be calculated on the resulting countries. -REGIONS = { - # Default continents. - "Africa": {}, - "Asia": {}, - "Europe": {}, - "European Union (27)": {}, - "North America": {}, - "Oceania": {}, - "South America": {}, - # Ember already has data for "World". - # "World": {}, - # Income groups. 
- "Low-income countries": {}, - "Upper-middle-income countries": {}, - "Lower-middle-income countries": {}, - "High-income countries": {}, -} - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 -# Megatonnes to grams. -MT_TO_G = 1e12 - -# Map units (short version) to unit name (long version). -SHORT_UNIT_TO_UNIT = { - "TWh": "terawatt-hours", - "MWh": "megawatt-hours", - "kWh": "kilowatt-hours", - "mtCO2": "megatonnes of CO2 equivalent", - "gCO2/kWh": "grams of CO2 equivalent per kilowatt-hour", - "GW": "gigawatts", - "%": "%", -} - -# Categories expected to exist in the data. -CATEGORIES = [ - "Capacity", - "Electricity demand", - "Electricity generation", - "Electricity imports", - "Power sector emissions", -] - -# Choose columns for which region aggregates should be created. -SUM_AGGREGATES = [ - # "Bioenergy (%)", - "Bioenergy (GW)", - "Bioenergy (TWh)", - "Bioenergy (mtCO2)", - # "CO2 intensity (gCO2/kWh)", - # "Clean (%)", - "Clean (GW)", - "Clean (TWh)", - "Clean (mtCO2)", - # "Coal (%)", - "Coal (GW)", - "Coal (TWh)", - "Coal (mtCO2)", - "Demand (TWh)", - # "Demand per capita (MWh)", - # "Fossil (%)", - "Fossil (GW)", - "Fossil (TWh)", - "Fossil (mtCO2)", - # "Gas (%)", - "Gas (GW)", - "Gas (TWh)", - "Gas (mtCO2)", - "Gas and Other Fossil (%)", - "Gas and Other Fossil (GW)", - "Gas and Other Fossil (TWh)", - "Gas and Other Fossil (mtCO2)", - # "Hydro (%)", - "Hydro (GW)", - "Hydro (TWh)", - "Hydro (mtCO2)", - "Hydro, Bioenergy and Other Renewables (%)", - "Hydro, Bioenergy and Other Renewables (GW)", - "Hydro, Bioenergy and Other Renewables (TWh)", - "Hydro, Bioenergy and Other Renewables (mtCO2)", - "Net Imports (TWh)", - # "Nuclear (%)", - "Nuclear (GW)", - "Nuclear (TWh)", - "Nuclear (mtCO2)", - # "Other Fossil (%)", - "Other Fossil (GW)", - "Other Fossil (TWh)", - "Other Fossil (mtCO2)", - # "Other Renewables (%)", - "Other Renewables (GW)", - "Other Renewables (TWh)", - "Other Renewables (mtCO2)", - # "Renewables (%)", - "Renewables (GW)", - "Renewables (TWh)", - "Renewables (mtCO2)", - # "Solar (%)", - "Solar (GW)", - "Solar (TWh)", - "Solar (mtCO2)", - "Total Generation (TWh)", - "Total emissions (mtCO2)", - # "Wind (%)", - "Wind (GW)", - "Wind (TWh)", - "Wind (mtCO2)", - # "Wind and Solar (%)", - "Wind and Solar (GW)", - "Wind and Solar (TWh)", - "Wind and Solar (mtCO2)", -] - - -def prepare_yearly_electricity_data(tb_meadow: Table) -> pd.DataFrame: - """Prepare yearly electricity data using the raw table from meadow. - - Parameters - ---------- - tb_meadow : Table - Table from the yearly electricity dataset in meadow. - - Returns - ------- - df : pd.DataFrame - Yearly electricity data, in a dataframe format, with a dummy index, and only required columns. - - """ - # Make a dataframe out of the data in the table. - raw = pd.DataFrame(tb_meadow) - - # Select and rename columns conveniently. - columns = { - "area": "country", - "year": "year", - "variable": "variable", - "value": "value", - "unit": "unit", - "category": "category", - "subcategory": "subcategory", - } - df = raw.reset_index()[list(columns)].rename(columns=columns) - - # Sanity check. - assert set(df["category"]) == set(CATEGORIES), "Categories have changed in data." - - return df - - -def make_wide_table(df: pd.DataFrame, category: str, df_regions: pd.DataFrame, df_income: pd.DataFrame) -> Table: - """Convert data from long to wide format for a specific category. - - This is a common processing for all categories in the data. 
- - Parameters - ---------- - df : pd.DataFrame - Data, after harmonizing country names. - category : str - Name of category (as defined above in CATEGORIES) to process. - df_regions : pd.DataFrame - Countries-regions data. - df_income : pd.DataFrame - Data on income group definitions. - - Returns - ------- - table : Table - Table in wide format. - - """ - # Select data for given category. - _df = df[df["category"] == category].copy() - - # Pivot dataframe to have a column for each variable. - table = Table(_df.pivot(index=["country", "year"], columns=["variable", "unit"], values="value")) - - # Get variable names, units, and variable-units (a name that combines both) for each column. - variable_units = [f"{variable} ({unit})" for variable, unit in table.columns] - - # Sanity check. - variables = table.columns.get_level_values(0).tolist() - units = table.columns.get_level_values(1).tolist() - assert len(variable_units) == len(units) == len(variables) - - # Collapse the two column levels into one, with the naming "variable (unit)" (except for country and year, which - # have no units and are the indexes of the table). - table.columns = variable_units - - # Add region aggregates. - aggregates = {column: "sum" for column in SUM_AGGREGATES if column in table.columns} - - table = add_region_aggregates( - data=table.reset_index(), - index_columns=["country", "year"], - regions_to_add=REGIONS, - aggregates=aggregates, - df_regions=df_regions, - df_income=df_income, - ) - - return table - - -def make_table_electricity_generation(df: pd.DataFrame, df_regions: pd.DataFrame, df_income: pd.DataFrame) -> Table: - """Create table with processed data of category "Electricity generation". - - Parameters - ---------- - df : pd.DataFrame - Data in long format for all categories, after harmonizing country names. - df_regions : pd.DataFrame - Countries-regions data. - df_income : pd.DataFrame - Data on income group definitions. - - Returns - ------- - table : Table - Table of processed data for the given category. - - """ - # Prepare wide table. - table = make_wide_table(df=df, category="Electricity generation", df_regions=df_regions, df_income=df_income) - - # Recalculate the share of electricity generated for region aggregates. - for column in table.columns: - if "(%)" in column: - # Find corresponding column with units instead of percentages. - value_column = column.replace("(%)", "(TWh)") - if value_column not in table.columns: - raise ValueError(f"Column {value_column} not found.") - # Select only regions. - select_regions = table["country"].isin(list(REGIONS)) - table.loc[select_regions, column] = table[value_column] / table["Total Generation (TWh)"] * 100 - - return table - - -def make_table_electricity_demand( - df: pd.DataFrame, population: pd.DataFrame, df_regions: pd.DataFrame, df_income: pd.DataFrame -) -> Table: - """Create table with processed data of category "Electricity demand". - - Parameters - ---------- - df : pd.DataFrame - Data in long format for all categories, after harmonizing country names. - df_regions : pd.DataFrame - Countries-regions data. - df_income : pd.DataFrame - Data on income group definitions. - - Returns - ------- - table : Table - Table of processed data for the given category. - - """ - # Prepare wide table. - table = make_wide_table(df=df, category="Electricity demand", df_regions=df_regions, df_income=df_income) - - # Add population to data. - table = add_population(df=table, population=population, warn_on_missing_countries=False)
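The per-capita recalculation that follows is plain unit arithmetic; a worked example with round, made-up numbers:

TWH_TO_KWH = 1e9  # as defined above

demand_twh = 3.0
population = 1.5e6
demand_per_capita_kwh = demand_twh * TWH_TO_KWH / population  # 3e9 kWh / 1.5e6 people = 2000.0 kWh per person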
- - # Recalculate demand per capita. - # We could do this only for region aggregates (since they do not have per capita values), - # but we do this for all countries, to ensure per-capita variables are consistent with our population data. - table["Demand per capita (kWh)"] = ( - pd.DataFrame(table)["Demand (TWh)"] * TWH_TO_KWH / pd.DataFrame(table)["population"] - ) - - # Delete the original demand per capita column. - table = table.drop(columns=["Demand per capita (MWh)"]) - - return table - - -def make_table_power_sector_emissions(df: pd.DataFrame, df_regions: pd.DataFrame, df_income: pd.DataFrame) -> Table: - """Create table with processed data of category "Power sector emissions". - - Parameters - ---------- - df : pd.DataFrame - Data in long format for all categories, after harmonizing country names. - df_regions : pd.DataFrame - Countries-regions data. - df_income : pd.DataFrame - Data on income group definitions. - - Returns - ------- - table : Table - Table of processed data for the given category. - - """ - # Prepare wide table of emissions data. - table = make_wide_table(df=df, category="Power sector emissions", df_regions=df_regions, df_income=df_income) - - # Add carbon intensity. - # In principle this only needs to be done for region aggregates, but we do it for all countries and check that - # the results are consistent with the original data. - # Prepare wide table also for electricity generation (required to calculate carbon intensity). - electricity = make_wide_table(df=df, category="Electricity generation", df_regions=df_regions, df_income=df_income)[ - ["country", "year", "Total Generation (TWh)"] - ] - # Add total electricity generation to emissions table. - table = pd.merge(table, electricity, on=["country", "year"], how="left") - # Rename the original carbon intensity column as a temporary column called "check". - intensity_col = "CO2 intensity (gCO2/kWh)" - table = table.rename(columns={intensity_col: "check"}) - # Calculate carbon intensity for all countries and regions. - table[intensity_col] = ( - pd.DataFrame(table)["Total emissions (mtCO2)"] * MT_TO_G / (table["Total Generation (TWh)"] * TWH_TO_KWH) - ) - - # Check that the new carbon intensities agree (within 1% mean absolute percentage error, aka MAPE) with the - # original ones (where carbon intensity was given, namely for countries, not aggregate regions). - mape = 100 * abs(table.dropna(subset="check")[intensity_col] - table["check"].dropna()) / table["check"].dropna() - assert mape.max() < 1, "Calculated carbon intensities differ from original ones by more than 1 percent." - - # Remove temporary column. - table = table.drop(columns=["check"]) - - return table - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read dataset from meadow. - ds_meadow: Dataset = paths.load_dependency("yearly_electricity") - # Get table from dataset. - tb_meadow = ds_meadow["yearly_electricity"] - # Make a dataframe out of the data in the table, with the required columns. - df = prepare_yearly_electricity_data(tb_meadow) - - # Read population dataset from garden. - ds_population: Dataset = paths.load_dependency("population") - # Get table from dataset. - tb_population = ds_population["population"] - # Make a dataframe out of the data in the table, with the required columns. - df_population = pd.DataFrame(tb_population) - - # Load regions dataset. - tb_regions = cast(Dataset, paths.load_dependency("regions"))["regions"] - df_regions = pd.DataFrame(tb_regions)
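The carbon-intensity calculation in make_table_power_sector_emissions above is likewise a unit conversion (gCO2/kWh = mtCO2 / TWh * 1000); with made-up numbers:

MT_TO_G = 1e12  # as defined above
TWH_TO_KWH = 1e9

emissions_mtco2 = 100.0
generation_twh = 250.0
co2_intensity_gco2_kwh = emissions_mtco2 * MT_TO_G / (generation_twh * TWH_TO_KWH)  # 1e14 g / 2.5e11 kWh = 400.0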
- - # Load income groups dataset. - ds_income: Dataset = paths.load_dependency("wb_income") - # Get main table from dataset. - tb_income = ds_income["wb_income_group"] - # Create a dataframe out of the table. - df_income = pd.DataFrame(tb_income).reset_index() - - # - # Process data. - # - # Harmonize country names. - df = geo.harmonize_countries(df=df, countries_file=paths.country_mapping_path) - - # Split data into different tables, one per category, and process each one individually. - tables = { - "Capacity": make_wide_table(df=df, category="Capacity", df_regions=df_regions, df_income=df_income), - "Electricity demand": make_table_electricity_demand( - df=df, population=df_population, df_regions=df_regions, df_income=df_income - ), - "Electricity generation": make_table_electricity_generation(df=df, df_regions=df_regions, df_income=df_income), - "Electricity imports": make_wide_table( - df=df, category="Electricity imports", df_regions=df_regions, df_income=df_income - ), - "Power sector emissions": make_table_power_sector_emissions(df=df, df_regions=df_regions, df_income=df_income), - } - - # Apply amendments, set an appropriate index and short name for each table, and sort conveniently. - for table_name in tables: - tables[table_name] = correct_data_points(df=tables[table_name], corrections=AMENDMENTS[table_name]) - tables[table_name] = tables[table_name].set_index(["country", "year"], verify_integrity=True).sort_index() - tables[table_name].metadata.short_name = catalog.utils.underscore(table_name) - - # - # Save outputs. - # - # Create a new dataset with the same metadata as in Meadow. - ds_garden = create_dataset(dest_dir, tables=list(tables.values()), default_metadata=ds_meadow.metadata) - ds_garden.save() diff --git a/etl/steps/archive/garden/ember/2023-06-01/combined_electricity.meta.yml b/etl/steps/archive/garden/ember/2023-06-01/combined_electricity.meta.yml deleted file mode 100644 index 2d06c084fb0..00000000000 --- a/etl/steps/archive/garden/ember/2023-06-01/combined_electricity.meta.yml +++ /dev/null @@ -1,1078 +0,0 @@ -dataset: - title: Combined Electricity Data (Ember, 2023b) - -tables: - european_electricity_review: - title: European electricity review - variables: - demand__total_demand__twh: - title: Demand - Total demand (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total demand - demand__total_demand_per_capita__kwh: - title: Demand - Total demand per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Total per capita demand - emissions__co2_intensity__gco2_kwh: - title: Emissions - CO2 intensity (gCO2/kWh) - short_unit: gCO2/kWh - unit: grams of CO2 equivalent per kilowatt-hour - display: - name: CO2 intensity - emissions__total_emissions__mtco2: - title: Emissions - Total emissions (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Total emissions - generation__bioenergy__pct: - title: Generation - Bioenergy (%) - short_unit: "%" - unit: "%" - display: - name: Bioenergy generation - generation__bioenergy__twh: - title: Generation - Bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Bioenergy generation - generation__clean__pct: - title: Generation - Clean (%) - short_unit: "%" - unit: "%" - display: - name: Clean generation - generation__clean__twh: - title: Generation - Clean (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Clean generation - generation__coal__pct: - title: Generation - Coal (%) - short_unit: "%" - unit: "%" - display: - name: Coal generation -
generation__coal__twh: - title: Generation - Coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal generation - generation__fossil__pct: - title: Generation - Fossil (%) - short_unit: "%" - unit: "%" - display: - name: Fossil generation - generation__fossil__twh: - title: Generation - Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Fossil generation - generation__gas__pct: - title: Generation - Gas (%) - short_unit: "%" - unit: "%" - display: - name: Gas generation - generation__gas__twh: - title: Generation - Gas (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas generation - generation__gas_and_other_fossil__pct: - title: Generation - Gas and other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Gas and other fossil generation - generation__gas_and_other_fossil__twh: - title: Generation - Gas and other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas and other fossil generation - generation__hard_coal__pct: - title: Generation - Hard coal (%) - short_unit: "%" - unit: "%" - display: - name: Hard coal generation - generation__hard_coal__twh: - title: Generation - Hard coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hard coal generation - generation__hydro__bioenergy_and_other_renewables__pct: - title: Generation - Hydro bioenergy and other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__bioenergy_and_other_renewables__twh: - title: Generation - Hydro bioenergy and other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__pct: - title: Generation - Hydro (%) - short_unit: "%" - unit: "%" - display: - name: Hydro generation - generation__hydro__twh: - title: Generation - Hydro (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro generation - generation__lignite__pct: - title: Generation - Lignite (%) - short_unit: "%" - unit: "%" - display: - name: Lignite generation - generation__lignite__twh: - title: Generation - Lignite (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Lignite generation - generation__nuclear__pct: - title: Generation - Nuclear (%) - short_unit: "%" - unit: "%" - display: - name: Nuclear generation - generation__nuclear__twh: - title: Generation - Nuclear (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear generation - generation__other_fossil__pct: - title: Generation - Other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Other fossil generation - generation__other_fossil__twh: - title: Generation - Other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other fossil generation - generation__other_renewables__pct: - title: Generation - Other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Other renewables generation - generation__other_renewables__twh: - title: Generation - Other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables generation - generation__renewables__pct: - title: Generation - Renewables (%) - short_unit: "%" - unit: "%" - display: - name: Renewables generation - generation__renewables__twh: - title: Generation - Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Renewables generation - generation__solar__pct: - title: Generation - Solar (%) - short_unit: "%" - unit: "%" - 
display: - name: Solar generation - generation__solar__twh: - title: Generation - Solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar generation - generation__total_generation__twh: - title: Generation - Total - short_unit: TWh - unit: terawatt-hours - display: - name: Total generation - generation__wind__pct: - title: Generation - Wind (%) - short_unit: "%" - unit: "%" - display: - name: Wind generation - generation__wind__twh: - title: Generation - Wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind generation - generation__wind_and_solar__pct: - title: Generation - Wind and solar (%) - short_unit: "%" - unit: "%" - display: - name: Wind and solar generation - generation__wind_and_solar__twh: - title: Generation - Wind and solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind and solar generation - imports__total_net_imports__twh: - title: Imports - Total net imports (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total net imports - yearly_electricity: - title: Yearly Electricity Data - variables: - capacity__bioenergy__gw: - title: Capacity - Bioenergy (GW) - short_unit: GW - unit: gigawatts - display: - name: Bioenergy capacity - capacity__clean__gw: - title: Capacity - Clean (GW) - short_unit: GW - unit: gigawatts - display: - name: Clean capacity - capacity__coal__gw: - title: Capacity - Coal (GW) - short_unit: GW - unit: gigawatts - display: - name: Coal capacity - capacity__fossil__gw: - title: Capacity - Fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Fossil capacity - capacity__gas__gw: - title: Capacity - Gas (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas capacity - capacity__gas_and_other_fossil__gw: - title: Capacity - Gas and other fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas and other fossil capacity - capacity__hydro__bioenergy_and_other_renewables__gw: - title: Capacity - Hydro bioenergy and other renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro bioenergy and other renewables capacity - capacity__hydro__gw: - title: Capacity - Hydro (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro capacity - capacity__nuclear__gw: - title: Capacity - Nuclear (GW) - short_unit: GW - unit: gigawatts - display: - name: Nuclear capacity - capacity__other_fossil__gw: - title: Capacity - Other fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Other fossil capacity - capacity__other_renewables__gw: - title: Capacity - Other renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Other renewables capacity - capacity__renewables__gw: - title: Capacity - Renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Renewables capacity - capacity__solar__gw: - title: Capacity - Solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Solar capacity - capacity__wind__gw: - title: Capacity - Wind (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind capacity - capacity__wind_and_solar__gw: - title: Capacity - Wind and solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind and solar capacity - demand__total_demand__twh: - title: Demand - Total demand (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total demand - demand__total_demand_per_capita__kwh: - title: Demand - Total demand per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Total per capita demand - emissions__bioenergy__mtco2: - title: 
Emissions - Bioenergy (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Bioenergy emissions - emissions__clean__mtco2: - title: Emissions - Clean (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Clean emissions - emissions__co2_intensity__gco2_kwh: - title: Emissions - CO2 intensity (gCO2/kWh) - short_unit: gCO2/kWh - unit: grams of CO2 equivalent per kilowatt-hour - display: - name: CO2 intensity - emissions__coal__mtco2: - title: Emissions - Coal (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Coal emissions - emissions__fossil__mtco2: - title: Emissions - Fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Fossil emissions - emissions__gas__mtco2: - title: Emissions - Gas (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Gas emissions - emissions__gas_and_other_fossil__mtco2: - title: Emissions - Gas and other fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Gas and other fossil emissions - emissions__hydro__bioenergy_and_other_renewables__mtco2: - title: Emissions - Hydro bioenergy and other renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro bioenergy and other renewables emissions - emissions__hydro__mtco2: - title: Emissions - Hydro (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro emissions - emissions__nuclear__mtco2: - title: Emissions - Nuclear (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Nuclear emissions - emissions__other_fossil__mtco2: - title: Emissions - Other fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other fossil emissions - emissions__other_renewables__mtco2: - title: Emissions - Other renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other renewables emissions - emissions__renewables__mtco2: - title: Emissions - Renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Renewables emissions - emissions__solar__mtco2: - title: Emissions - Solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Solar emissions - emissions__total_emissions__mtco2: - title: Emissions - Total - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Total emissions - emissions__wind__mtco2: - title: Emissions - Wind (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind emissions - emissions__wind_and_solar__mtco2: - title: Emissions - Wind and solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind and solar emissions - generation__bioenergy__pct: - title: Generation - Bioenergy (%) - short_unit: "%" - unit: "%" - display: - name: Bioenergy generation - generation__bioenergy__twh: - title: Generation - Bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Bioenergy generation - generation__clean__pct: - title: Generation - Clean (%) - short_unit: "%" - unit: "%" - display: - name: Clean generation - generation__clean__twh: - title: Generation - Clean (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Clean generation - generation__coal__pct: - title: Generation - Coal (%) - short_unit: "%" - unit: "%" - display: - name: Coal generation - generation__coal__twh: - 
title: Generation - Coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal generation - generation__fossil__pct: - title: Generation - Fossil (%) - short_unit: "%" - unit: "%" - display: - name: Fossil generation - generation__fossil__twh: - title: Generation - Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Fossil generation - generation__gas__pct: - title: Generation - Gas (%) - short_unit: "%" - unit: "%" - display: - name: Gas generation - generation__gas__twh: - title: Generation - Gas (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas generation - generation__gas_and_other_fossil__pct: - title: Generation - Gas and other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Gas and other fossil generation - generation__gas_and_other_fossil__twh: - title: Generation - Gas and other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas and other fossil generation - generation__hydro__bioenergy_and_other_renewables__pct: - title: Generation - Hydro bioenergy and other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__bioenergy_and_other_renewables__twh: - title: Generation - Hydro bioenergy and other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__pct: - title: Generation - Hydro (%) - short_unit: "%" - unit: "%" - display: - name: Hydro generation - generation__hydro__twh: - title: Generation - Hydro (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro generation - generation__nuclear__pct: - title: Generation - Nuclear (%) - short_unit: "%" - unit: "%" - display: - name: Nuclear generation - generation__nuclear__twh: - title: Generation - Nuclear (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear generation - generation__other_fossil__pct: - title: Generation - Other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Other fossil generation - generation__other_fossil__twh: - title: Generation - Other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other fossil generation - generation__other_renewables__pct: - title: Generation - Other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Other renewables generation - generation__other_renewables__twh: - title: Generation - Other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables generation - generation__renewables__pct: - title: Generation - Renewables (%) - short_unit: "%" - unit: "%" - display: - name: Renewables generation - generation__renewables__twh: - title: Generation - Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Renewables generation - generation__solar__pct: - title: Generation - Solar (%) - short_unit: "%" - unit: "%" - display: - name: Solar generation - generation__solar__twh: - title: Generation - Solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar generation - generation__total_generation__twh: - title: Generation - Total - short_unit: TWh - unit: terawatt-hours - display: - name: Total generation - generation__wind__pct: - title: Generation - Wind (%) - short_unit: "%" - unit: "%" - display: - name: Wind generation - generation__wind__twh: - title: Generation - Wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind generation - 
generation__wind_and_solar__pct: - title: Generation - Wind and solar (%) - short_unit: "%" - unit: "%" - display: - name: Wind and solar generation - generation__wind_and_solar__twh: - title: Generation - Wind and solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind and solar generation - imports__total_net_imports__twh: - title: Imports - Total net imports (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total net imports - combined_electricity: - title: Electricity - variables: - capacity__bioenergy__gw: - title: Capacity - Bioenergy (GW) - short_unit: GW - unit: gigawatts - display: - name: Bioenergy capacity - capacity__clean__gw: - title: Capacity - Clean (GW) - short_unit: GW - unit: gigawatts - display: - name: Clean capacity - capacity__coal__gw: - title: Capacity - Coal (GW) - short_unit: GW - unit: gigawatts - display: - name: Coal capacity - capacity__fossil__gw: - title: Capacity - Fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Fossil capacity - capacity__gas__gw: - title: Capacity - Gas (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas capacity - capacity__gas_and_other_fossil__gw: - title: Capacity - Gas and other fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas and other fossil capacity - capacity__hydro__bioenergy_and_other_renewables__gw: - title: Capacity - Hydro bioenergy and other renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro bioenergy and other renewables capacity - capacity__hydro__gw: - title: Capacity - Hydro (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro capacity - capacity__nuclear__gw: - title: Capacity - Nuclear (GW) - short_unit: GW - unit: gigawatts - display: - name: Nuclear capacity - capacity__other_fossil__gw: - title: Capacity - Other fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Other fossil capacity - capacity__other_renewables__gw: - title: Capacity - Other renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Other renewables capacity - capacity__renewables__gw: - title: Capacity - Renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Renewables capacity - capacity__solar__gw: - title: Capacity - Solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Solar capacity - capacity__wind__gw: - title: Capacity - Wind (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind capacity - capacity__wind_and_solar__gw: - title: Capacity - Wind and solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind and solar capacity - demand__total_demand__twh: - title: Demand - Total demand (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total demand - demand__total_demand_per_capita__kwh: - title: Demand - Total demand per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Total per capita demand - emissions__bioenergy__mtco2: - title: Emissions - Bioenergy (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Bioenergy emissions - emissions__clean__mtco2: - title: Emissions - Clean (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Clean emissions - emissions__co2_intensity__gco2_kwh: - title: Emissions - CO2 intensity (gCO2/kWh) - short_unit: gCO2/kWh - unit: grams of CO2 equivalent per kilowatt-hour - display: - name: CO2 intensity - emissions__coal__mtco2: - title: Emissions - Coal (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent 
- display: - name: Coal emissions - emissions__fossil__mtco2: - title: Emissions - Fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Fossil emissions - emissions__gas__mtco2: - title: Emissions - Gas (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Gas emissions - emissions__gas_and_other_fossil__mtco2: - title: Emissions - Gas and other fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Gas and other fossil emissions - emissions__hydro__bioenergy_and_other_renewables__mtco2: - title: Emissions - Hydro bioenergy and other renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro bioenergy and other renewables emissions - emissions__hydro__mtco2: - title: Emissions - Hydro (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro emissions - emissions__nuclear__mtco2: - title: Emissions - Nuclear (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Nuclear emissions - emissions__other_fossil__mtco2: - title: Emissions - Other fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other fossil emissions - emissions__other_renewables__mtco2: - title: Emissions - Other renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other renewables emissions - emissions__renewables__mtco2: - title: Emissions - Renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Renewables emissions - emissions__solar__mtco2: - title: Emissions - Solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Solar emissions - emissions__total_emissions__mtco2: - title: Emissions - Total - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Total emissions - emissions__wind__mtco2: - title: Emissions - Wind (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind emissions - emissions__wind_and_solar__mtco2: - title: Emissions - Wind and solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind and solar emissions - generation__bioenergy__pct: - title: Generation - Bioenergy (%) - short_unit: "%" - unit: "%" - display: - name: Bioenergy generation - generation__bioenergy__twh: - title: Generation - Bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Bioenergy generation - generation__clean__pct: - title: Generation - Clean (%) - short_unit: "%" - unit: "%" - display: - name: Clean generation - generation__clean__twh: - title: Generation - Clean (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Clean generation - generation__coal__pct: - title: Generation - Coal (%) - short_unit: "%" - unit: "%" - display: - name: Coal generation - generation__coal__twh: - title: Generation - Coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal generation - generation__fossil__pct: - title: Generation - Fossil (%) - short_unit: "%" - unit: "%" - display: - name: Fossil generation - generation__fossil__twh: - title: Generation - Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Fossil generation - generation__gas__pct: - title: Generation - Gas (%) - short_unit: "%" - unit: "%" - display: - name: Gas generation - generation__gas__twh: - title: Generation - Gas (TWh) - short_unit: TWh - unit: 
terawatt-hours - display: - name: Gas generation - generation__gas_and_other_fossil__pct: - title: Generation - Gas and other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Gas and other fossil generation - generation__gas_and_other_fossil__twh: - title: Generation - Gas and other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas and other fossil generation - generation__hard_coal__pct: - title: Generation - Hard coal (%) - short_unit: "%" - unit: "%" - display: - name: Hard coal generation - generation__hard_coal__twh: - title: Generation - Hard coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hard coal generation - generation__hydro__bioenergy_and_other_renewables__pct: - title: Generation - Hydro bioenergy and other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__bioenergy_and_other_renewables__twh: - title: Generation - Hydro bioenergy and other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro bioenergy and other renewables generation - generation__hydro__pct: - title: Generation - Hydro (%) - short_unit: "%" - unit: "%" - display: - name: Hydro generation - generation__hydro__twh: - title: Generation - Hydro (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro generation - generation__lignite__pct: - title: Generation - Lignite (%) - short_unit: "%" - unit: "%" - display: - name: Lignite generation - generation__lignite__twh: - title: Generation - Lignite (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Lignite generation - generation__nuclear__pct: - title: Generation - Nuclear (%) - short_unit: "%" - unit: "%" - display: - name: Nuclear generation - generation__nuclear__twh: - title: Generation - Nuclear (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear generation - generation__other_fossil__pct: - title: Generation - Other fossil (%) - short_unit: "%" - unit: "%" - display: - name: Other fossil generation - generation__other_fossil__twh: - title: Generation - Other fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other fossil generation - generation__other_renewables__pct: - title: Generation - Other renewables (%) - short_unit: "%" - unit: "%" - display: - name: Other renewables generation - generation__other_renewables__twh: - title: Generation - Other renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables generation - generation__renewables__pct: - title: Generation - Renewables (%) - short_unit: "%" - unit: "%" - display: - name: Renewables generation - generation__renewables__twh: - title: Generation - Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Renewables generation - generation__solar__pct: - title: Generation - Solar (%) - short_unit: "%" - unit: "%" - display: - name: Solar generation - generation__solar__twh: - title: Generation - Solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar generation - generation__total_generation__twh: - title: Generation - Total - short_unit: TWh - unit: terawatt-hours - display: - name: Total generation - generation__wind__pct: - title: Generation - Wind (%) - short_unit: "%" - unit: "%" - display: - name: Wind generation - generation__wind__twh: - title: Generation - Wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind generation - generation__wind_and_solar__pct: 
- title: Generation - Wind and solar (%) - short_unit: "%" - unit: "%" - display: - name: Wind and solar generation - generation__wind_and_solar__twh: - title: Generation - Wind and solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind and solar generation - imports__total_net_imports__twh: - title: Imports - Total net imports (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total net imports - net_flows: - title: Net flows - variables: - net_flow__twh: - title: Net flow (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Net flow diff --git a/etl/steps/archive/garden/ember/2023-06-01/combined_electricity.py b/etl/steps/archive/garden/ember/2023-06-01/combined_electricity.py deleted file mode 100644 index 111b6661137..00000000000 --- a/etl/steps/archive/garden/ember/2023-06-01/combined_electricity.py +++ /dev/null @@ -1,305 +0,0 @@ -"""Garden step that combines Ember's European Electricity Review (EER) and the latest Ember's Yearly Electricity -Data (YED). - -The YED dataset contains data for all countries in EER 2022. -However, YED starts in 2000, while EER 2022 starts in 1990. - -Therefore, to gather as much data as possible, we combine both datasets, prioritizing YED. - -This way, we'll have data from 1990-1999 from EER 2022, and data from 2000-2022 from YED. - -NOTES: -* This step used to combine Ember's Global Electricity Review and the EER, but now we have replaced the former by - the YED. However, there may be instances in the code where "global" refers to the YED. -* We don't use the latest EER 2023 because it does not contain data prior to 2000. - -""" - -import pandas as pd -from owid.catalog import Dataset, Table, utils -from owid.datautils import dataframes - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Define aggregates, following their Ember-Electricity-Data-Methodology document: -# https://ember-climate.org/app/uploads/2022/03/GER22-Methodology.pdf -# The European review also has its own methodology document: -# https://ember-climate.org/app/uploads/2022/02/EER-Methodology.pdf -# but it does not explicitly define aggregates. We assume they are consistent with each other. -# This will be also checked, along with other sanity checks, in a separate analysis. -AGGREGATES = { - "coal__twh": [ - "hard_coal__twh", - "lignite__twh", - ], - "wind_and_solar__twh": ["wind__twh", "solar__twh"], - "hydro__bioenergy_and_other_renewables__twh": [ - "hydro__twh", - "bioenergy__twh", - "other_renewables__twh", - ], - "renewables__twh": [ - "wind_and_solar__twh", - "hydro__bioenergy_and_other_renewables__twh", - ], - "clean__twh": [ - "renewables__twh", - "nuclear__twh", - ], - "gas_and_other_fossil__twh": [ - "gas__twh", - "other_fossil__twh", - ], - "fossil__twh": ["gas_and_other_fossil__twh", "coal__twh"], - "total_generation__twh": [ - "clean__twh", - "fossil__twh", - ], -} - - -def combine_yearly_electricity_data(ds_global: Dataset) -> Table: - """Combine all tables in Ember's Yearly Electricity Data into one table. - - Parameters - ---------- - ds_global : Dataset - Yearly Electricity dataset (containing tables for capacity, electricity demand, generation, imports and - emissions). - - Returns - ------- - combined_global : Table - Combined table containing all data in the Yearly Electricity dataset. 
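The AGGREGATES mapping above is deliberately ordered: later entries such as fossil__twh sum columns that are themselves aggregates, so applying the mapping in insertion order builds the hierarchy bottom-up. A minimal, self-contained sketch of that idea with hypothetical toy values (plain pandas, column names following the *__twh convention):

import pandas as pd

# Toy generation table; the numbers are made up for illustration.
df = pd.DataFrame({"hard_coal__twh": [10.0], "lignite__twh": [5.0], "gas__twh": [20.0], "other_fossil__twh": [1.0]})

AGGREGATES = {
    "coal__twh": ["hard_coal__twh", "lignite__twh"],
    "gas_and_other_fossil__twh": ["gas__twh", "other_fossil__twh"],
    # This entry only works because the two above were computed first.
    "fossil__twh": ["gas_and_other_fossil__twh", "coal__twh"],
}

for aggregate, members in AGGREGATES.items():
    df[aggregate] = df[members].sum(axis=1)

print(df["fossil__twh"].iloc[0])  # 36.0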
- - """ - category_renaming = { - "capacity": "Capacity - ", - "electricity_demand": "", - "electricity_generation": "Generation - ", - "electricity_imports": "", - "power_sector_emissions": "Emissions - ", - } - error = "Tables in yearly electricity dataset have changed" - assert set(category_renaming) == set(ds_global.table_names), error - index_columns = ["country", "year"] - tables = [] - for category in category_renaming: - table = ds_global[category].copy() - table = table.rename( - columns={ - column: utils.underscore(category_renaming[category] + column) - for column in table.columns - if column not in index_columns - } - ) - table = table.reset_index() - tables.append(table) - - # Merge all tables into one, with an appropriate short name. - combined_global = dataframes.multi_merge(dfs=tables, on=index_columns, how="outer") - combined_global.metadata.short_name = "yearly_electricity" - - # Rename certain columns for consistency. - combined_global = combined_global.rename( - columns={ - "net_imports__twh": "imports__total_net_imports__twh", - "demand__twh": "demand__total_demand__twh", - "demand_per_capita__kwh": "demand__total_demand_per_capita__kwh", - }, - errors="raise", - ) - - # Sanity check. - error = "Total generation column in emissions and generation tables are not identical." - assert all( - combined_global["emissions__total_generation__twh"].fillna(-1) - == combined_global["generation__total_generation__twh"].fillna(-1) - ), error - - # Remove unnecessary columns and any possible rows with no data. - combined_global = combined_global.drop(columns=["population", "emissions__total_generation__twh"]).dropna(how="all") - - # Set a convenient index and sort rows and columns conveniently. - combined_global = ( - combined_global.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - ) - - return combined_global - - -def combine_european_electricity_review_data(ds_european: Dataset) -> Table: - """Combine tables in Ember's European Electricity Review dataset into one table. - - The tables to be combined are 'country_overview', 'generation', and 'emissions'. The remaining table on net flows - has a different structure and cannot be combined with the others, so it will remain as a separate table. - - Parameters - ---------- - ds_european : Dataset - European Electricity Review dataset. - - Returns - ------- - combined_european : Table - Combined table containing all data in the European Electricity Review dataset (except net flows). - - """ - index_columns = ["country", "year"] - # Extract the necessary tables from the dataset. - country_overview = ds_european["country_overview"].copy() - generation = ds_european["generation"].copy() - emissions = ds_european["emissions"].copy() - - # Create aggregates (defined in AGGREGATES) that are in yearly electricity but not in the european review. - for aggregate in AGGREGATES: - generation[aggregate] = pd.DataFrame(generation)[AGGREGATES[aggregate]].sum(axis=1) - - # Create a column for each of those new aggregates, giving percentage share of total generation. - for aggregate in AGGREGATES: - column = aggregate.replace("__twh", "__pct") - generation[column] = pd.DataFrame(generation)[aggregate] / generation["total_generation__twh"] * 100 - - # Check that total generation adds up to 100%. - error = "Total generation does not add up to 100%." 
- assert set(generation["total_generation__pct"]) == {100}, error - - # Check that the constructed "total generation" column agrees with the one given in table "country_overview". - columns = ["country", "year", "total_generation__twh"] - check = pd.merge( - ds_european["country_overview"].reset_index()[columns], - generation.reset_index()[columns], - on=index_columns, - ) - # Assert that the percentage change is smaller than 1% - error = "Total generation does not agree with the on in country_overview." - assert all( - (abs(check["total_generation__twh_x"] - check["total_generation__twh_y"]) / check["total_generation__twh_x"]) - < 0.01 - ), error - - # Remove unnecessary columns. - generation = generation.drop(columns=["total_generation__pct", "total_generation__twh"]) - - # Rename all column names to start with the category, before combining all categories. - generation = generation.rename(columns={column: "generation__" + column for column in generation.columns}) - emissions = emissions.rename(columns={column: "emissions__" + column for column in emissions.columns}) - country_overview = country_overview.rename( - columns={ - "total_generation__twh": "generation__total_generation__twh", - "demand__twh": "demand__total_demand__twh", - "demand_per_capita__kwh": "demand__total_demand_per_capita__kwh", - "net_imports__twh": "imports__total_net_imports__twh", - }, - errors="raise", - ) - - # Combine tables into one dataframe. - combined_european = dataframes.multi_merge( - [ - country_overview.reset_index(), - emissions.reset_index(), - generation.reset_index(), - ], - on=index_columns, - how="outer", - ) - - # Assign an appropriate short name to table. - combined_european.metadata.short_name = "european_electricity_review" - - # If any column was repeated in the merge, it will have a "_x" at the end of the name. - # Check that no other columns were repeated. - error = "There are repeated columns in combined dataframe." - assert len([column for column in combined_european.columns if column.endswith("_x")]) == 0, error - - # Remove any possible rows with no data. - combined_european = combined_european.dropna(how="all") - - # Ensure that the index is well constructed. - combined_european = ( - combined_european.set_index(index_columns, verify_integrity=True).sort_index().sort_index(axis=1) - ) - - return combined_european - - -def combine_yearly_electricity_data_and_european_electricity_review( - combined_global: Table, combined_european: Table -) -> Table: - """Combine the combined table of the Yearly Electricity Data with the combined table of the European Electricity - Review. - - Parameters - ---------- - combined_global : Table - Table that combines all tables of the Yearly Electricity Data. - combined_european : Table - Table that combines all tables of the European Electricity Review (except net flows). - - Returns - ------- - combined : Table - Combined data. - - """ - # Combine (global) yearly electricity data with European data, prioritizing the former. - index_columns = ["country", "year"] - combined = dataframes.combine_two_overlapping_dataframes( - df1=combined_global.reset_index(), df2=combined_european.reset_index(), index_columns=index_columns - ) - - # Create a table (with no metadata) and sort data appropriately. - combined = ( - Table(combined, short_name="combined_electricity") - .set_index(index_columns, verify_integrity=True) - .sort_index() - .sort_index(axis=1) - ) - - return combined - - -def run(dest_dir: str) -> None: - # - # Load data. 
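combine_two_overlapping_dataframes is OWID's helper for the prioritized 1990-1999/2000+ stitching described in this step's docstring. Assuming it behaves like pandas' combine_first on an indexed frame (a simplification; the real helper also reconciles metadata), the idea can be sketched with toy values:

import pandas as pd

# YED (df1, priority) starts in 2000; EER (df2) reaches back further.
yed = pd.DataFrame({"country": ["Spain"], "year": [2000], "coal__twh": [50.0]})
eer = pd.DataFrame({"country": ["Spain", "Spain"], "year": [1999, 2000], "coal__twh": [48.0, 47.0]})

combined = (
    yed.set_index(["country", "year"])
    .combine_first(eer.set_index(["country", "year"]))
    .reset_index()
)
print(combined)  # 1999 comes from EER (48.0); 2000 keeps YED's 50.0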
- # - # Read yearly electricity data and european electricity review datasets from garden. - ds_global: Dataset = paths.load_dependency("yearly_electricity") - ds_european: Dataset = paths.load_dependency("european_electricity_review") - - # - # Process data. - # - # Combine all tables of the yearly electricity data into one. - combined_global = combine_yearly_electricity_data(ds_global=ds_global) - - # Combine all tables of the european electricity review into one. - combined_european = combine_european_electricity_review_data(ds_european=ds_european) - - # Combine yearly electricity and european reviews. - combined = combine_yearly_electricity_data_and_european_electricity_review( - combined_global=combined_global, combined_european=combined_european - ) - - # Create an additional table with the electricity net flows (only available in european review). - net_flows = ds_european["net_flows"].copy() - - # - # Save outputs. - # - # Create new garden dataset. - ds_garden = create_dataset( - dest_dir=dest_dir, - tables=[combined_global, combined_european, combined, net_flows], - default_metadata=ds_global.metadata, - ) - - # Combine sources and licenses from the original datasets. - ds_garden.metadata.sources = sum([ds.metadata.sources for ds in [ds_global, ds_european]], []) - ds_garden.metadata.licenses = sum([ds.metadata.licenses for ds in [ds_global, ds_european]], []) - - ds_garden.save() diff --git a/etl/steps/archive/garden/ember/2023-06-01/shared.py b/etl/steps/archive/garden/ember/2023-06-01/shared.py deleted file mode 100644 index 92f3e7e08da..00000000000 --- a/etl/steps/archive/garden/ember/2023-06-01/shared.py +++ /dev/null @@ -1,516 +0,0 @@ -import itertools -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import pandas as pd -from structlog import get_logger - -from etl.data_helpers import geo - -log = get_logger() - -CURRENT_DIR = Path(__file__).parent -VERSION = CURRENT_DIR.name - -# Aggregate regions to add, following OWID definitions. -# Regions and income groups to create by aggregating contributions from member countries. -# In the following dictionary, if nothing is stated, the region is supposed to be a default continent/income group. -# Otherwise, the dictionary can have "regions_included", "regions_excluded", "countries_included", and -# "countries_excluded". The aggregates will be calculated on the resulting countries. -# REGIONS = { -# # Default continents. -# "Africa": {}, -# "Asia": {}, -# "Europe": {}, -# "European Union (27)": {}, -# "North America": {}, -# "Oceania": {}, -# "South America": {}, -# "World": {}, -# # Income groups. -# "Low-income countries": {}, -# "Upper-middle-income countries": {}, -# "Lower-middle-income countries": {}, -# "High-income countries": {}, -# } - -# When creating region aggregates, decide how to distribute historical regions. -# The following decisions are based on the current location of the countries that succeeded the region, and their income -# group. Continent and income group assigned corresponds to the continent and income group of the majority of the -# population in the member countries. -HISTORIC_TO_CURRENT_REGION: Dict[str, Dict[str, Union[str, List[str]]]] = { - "Czechoslovakia": { - "continent": "Europe", - "income_group": "High-income countries", - "regions_included": [ - # Europe - High-income countries. 
- "Czechia", - "Slovakia", - ], - }, - "East Germany": { - "continent": "Europe", - "income_group": "", - "regions_included": [ - # Europe - High-income countries. - "Germany", - ], - }, - "West Germany": { - "continent": "Europe", - "income_group": "", - "regions_included": [ - # Europe - High-income countries. - "Germany", - ], - }, - "Netherlands Antilles": { - "continent": "North America", - "income_group": "High-income countries", - "regions_included": [ - # North America - High-income countries. - "Aruba", - "Curacao", - "Sint Maarten (Dutch part)", - "Bonaire Sint Eustatius and Saba", - ], - }, - "Serbia and Montenegro": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "regions_included": [ - # Europe - Upper-middle-income countries. - "Serbia", - "Montenegro", - ], - }, - "North Yemen": { - "continent": "Asia", - "income_group": "Low-income countries", - "regions_included": [ - # Asia - Low-income countries. - "Yemen", - ], - }, - "South Yemen": { - "continent": "Asia", - "income_group": "Low-income countries", - "regions_included": [ - # Asia - Low-income countries. - "Yemen", - ], - }, - "USSR": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "regions_included": [ - # Europe - High-income countries. - "Lithuania", - "Estonia", - "Latvia", - # Europe - Upper-middle-income countries. - "Moldova", - "Belarus", - "Russia", - # Europe - Lower-middle-income countries. - "Ukraine", - # Asia - Upper-middle-income countries. - "Georgia", - "Armenia", - "Azerbaijan", - "Turkmenistan", - "Kazakhstan", - # Asia - Lower-middle-income countries. - "Kyrgyzstan", - "Uzbekistan", - "Tajikistan", - ], - }, - "Yugoslavia": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "regions_included": [ - # Europe - High-income countries. - "Croatia", - "Slovenia", - # Europe - Upper-middle-income countries. - "North Macedonia", - "Bosnia and Herzegovina", - "Serbia", - "Montenegro", - ], - }, -} - -# Overlaps found between historical regions and successor countries, that we accept in the data. -# We accept them either because they happened close to the transition, or to avoid needing to introduce new -# countries for which we do not have data (like the Russian Empire). -ACCEPTED_OVERLAPS = { - # 1991: {"Georgia", "USSR"}, -} - - -def get_countries_in_region( - region: str, - df_regions: pd.DataFrame, - df_income: pd.DataFrame, - region_modifications: Optional[Dict[str, Dict[str, List[str]]]] = None, -) -> List[str]: - """Get countries in a region, both for known regions (e.g. "Africa") and custom ones (e.g. "Europe (excl. EU-27)"). - - Parameters - ---------- - region : str - Region name (e.g. "Africa", or "Europe (excl. EU-27)"). - df_regions : pd.DataFrame - Countries-regions data. - df_income : pd.DataFrame - Data on income group definitions. - region_modifications : dict or None - If None (or an empty dictionary), the region should be in OWID's countries-regions dataset. - If not None, it should be a dictionary with any (or all) of the following keys: - - "regions_included": List of regions whose countries will be included. - - "regions_excluded": List of regions whose countries will be excluded. - - "countries_included": List of additional individual countries to be included. - - "countries_excluded": List of additional individual countries to be excluded. - NOTE: All regions and countries defined in this dictionary should be in OWID's countries-regions dataset. 
- - Returns - ------- - countries : list - List of countries in the specified region. - - """ - if region_modifications is None: - region_modifications = {} - - # Check that the fields in the region_modifications dictionary are well defined. - expected_fields = ["regions_included", "regions_excluded", "countries_included", "countries_excluded"] - assert all([field in expected_fields for field in region_modifications]) - - # Get lists of regions whose countries will be included and excluded. - regions_included = region_modifications.get("regions_included", [region]) - regions_excluded = region_modifications.get("regions_excluded", []) - # Get lists of additional individual countries to include and exclude. - countries_included = region_modifications.get("countries_included", []) - countries_excluded = region_modifications.get("countries_excluded", []) - - # List countries from the list of regions included. - countries_set = set( - sum( - [ - geo.list_countries_in_region(region_included, countries_regions=df_regions, income_groups=df_income) - for region_included in regions_included - ], - [], - ) - ) - - # Remove all countries from the list of regions excluded. - countries_set -= set( - sum( - [ - geo.list_countries_in_region(region_excluded, countries_regions=df_regions, income_groups=df_income) - for region_excluded in regions_excluded - ], - [], - ) - ) - - # Add the list of individual countries to be included. - countries_set |= set(countries_included) - - # Remove the list of individual countries to be excluded. - countries_set -= set(countries_excluded) - - # Convert set of countries into a sorted list. - countries = sorted(countries_set) - - return countries - - -def add_population( - df: pd.DataFrame, - population: pd.DataFrame, - country_col: str = "country", - year_col: str = "year", - population_col: str = "population", - interpolate_missing_population: bool = False, - warn_on_missing_countries: bool = True, - show_full_warning: bool = True, - expected_countries_without_population: List[str] = [], -) -> pd.DataFrame: - """Add a column of OWID population to the countries in the data, including population of historical regions. - - Parameters - ---------- - df : pd.DataFrame - Data without a column for population (after harmonizing elements, items and country names). - population : pd.DataFrame - Population data. - country_col : str - Name of country column in data. - year_col : str - Name of year column in data. - population_col : str - Name for new population column in data. - interpolate_missing_population : bool - True to linearly interpolate population on years that are present in df, but for which we do not have - population data; otherwise False to keep missing population data as nans. - For example, if interpolate_missing_population is True and df has data for all years between 1900 and 1910, - but population is only given for 1900 and 1910, population will be linearly interpolated between those years. - warn_on_missing_countries : bool - True to warn if population is not found for any of the countries in the data. - show_full_warning : bool - True to show affected countries if the previous warning is raised. - expected_countries_without_population : list - Countries that are expected to not have population (that should be ignored if warnings are activated).
- - Returns - ------- - df_with_population : pd.DataFrame - Data after adding a column for population for all countries in the data. - - """ - - # Prepare population dataset. - population = population.reset_index().rename( - columns={ - "country": country_col, - "year": year_col, - "population": population_col, - } - )[[country_col, year_col, population_col]] - - # Check if there is any missing country. - missing_countries = set(df[country_col]) - set(population[country_col]) - if len(missing_countries) > 0: - if warn_on_missing_countries: - geo.warn_on_list_of_entities( - list_of_entities=missing_countries, - warning_message=( - f"{len(missing_countries)} countries not found in population" - " dataset. They will remain in the dataset, but have nan" - " population." - ), - show_list=show_full_warning, - ) - - if interpolate_missing_population: - # For some countries we have population data only on certain years, e.g. 1900, 1910, etc. - # Optionally fill missing years linearly. - countries_in_data = df[country_col].unique() - years_in_data = df[year_col].unique() - - population = population.set_index([country_col, year_col]).reindex( - pd.MultiIndex.from_product([countries_in_data, years_in_data], names=[country_col, year_col]) - ) - - population = population.groupby(country_col).transform( - lambda x: x.interpolate(method="linear", limit_direction="both") - ) - - error = "Countries without population data differ from the list of expected countries without population data." - assert set(population[population[population_col].isnull()].reset_index()[country_col]) == set( - expected_countries_without_population - ), error - - # Add population to original dataframe. - df_with_population = pd.merge(df, population, on=[country_col, year_col], how="left") - - return df_with_population - - -def detect_overlapping_regions( - df, index_columns, region_and_members, country_col="country", year_col="year", ignore_zeros=True -): - """Detect years on which the data for two regions overlap, e.g. a historical region and one of its successors. - - Parameters - ---------- - df : pd.DataFrame - Data (with a dummy index). - index_columns : list - Names of index columns. - region_and_members : dict - Regions to check for overlaps. Each region must have a dictionary "regions_included", listing the subregions - contained. If the region is historical, "regions_included" would be the list of successor countries. - country_col : str, optional - Name of country column (usually "country"). - year_col : str, optional - Name of year column (usually "year"). - ignore_zeros : bool, optional - True to ignore overlaps of zeros. - - Returns - ------- - all_overlaps : dict - All overlaps found. - - """ - # Sum over all columns to get the total sum of each column for each country-year. - df_total = ( - df.groupby([country_col, year_col]) - .agg({column: "sum" for column in df.columns if column not in index_columns}) - .reset_index() - ) - # Create a list of values that will be ignored in overlaps (usually zero or nothing). - if ignore_zeros: - overlapping_values_to_ignore = [0] - else: - overlapping_values_to_ignore = [] - # List all variables in data (ignoring index columns). - variables = [column for column in df.columns if column not in index_columns] - # List all country names found in data. - countries_in_data = df[country_col].unique().tolist() - # List all regions found in data.
- regions = [country for country in list(region_and_members) if country in countries_in_data] - # Initialize a dictionary that will store all overlaps found. - all_overlaps = {} - for region in regions: - # List members of current region. - members = [member for member in region_and_members[region]["regions_included"] if member in countries_in_data] - for member in members: - # Select data for current region. - region_values = ( - df_total[df_total[country_col] == region] - .replace(overlapping_values_to_ignore, np.nan) - .dropna(subset=variables, how="all") - ) - # Select data for current member. - member_values = ( - df_total[df_total[country_col] == member] - .replace(overlapping_values_to_ignore, np.nan) - .dropna(subset=variables, how="all") - ) - # Concatenate both selections of data, and select duplicated rows. - combined = pd.concat([region_values, member_values]) - overlaps = combined[combined.duplicated(subset=[year_col], keep=False)] # type: ignore - if len(overlaps) > 0: - # Add the overlap found to the dictionary of all overlaps. - all_overlaps.update({year: set(overlaps[country_col]) for year in overlaps[year_col].unique()}) - - # Sort overlaps conveniently. - all_overlaps = {year: all_overlaps[year] for year in sorted(list(all_overlaps))} - - return all_overlaps - - -def add_region_aggregates( - data: pd.DataFrame, - regions_to_add: Dict[Any, Any], - index_columns: List[str], - df_regions: pd.DataFrame, - df_income: pd.DataFrame, - country_column: str = "country", - aggregates: Optional[Dict[str, str]] = None, -) -> pd.DataFrame: - """Add region aggregates for all regions (which may include continents and income groups). - - Parameters - ---------- - data : pd.DataFrame - Data. - regions_to_add : dict - Regions to add. - index_columns : list - Names of index columns. - df_regions : pd.DataFrame - Countries-regions data. - df_income : pd.DataFrame - Data on income group definitions. - country_column : str - Name of country column. - aggregates : dict or None - Dictionary of type of aggregation to use for each variable. If None, variables will be aggregated by summing. - - Returns - ------- - data : pd.DataFrame - Data after adding aggregate regions. - - """ - data = data.copy() - - all_overlaps = detect_overlapping_regions( - df=data, region_and_members=HISTORIC_TO_CURRENT_REGION, index_columns=index_columns - ) - - # Check whether all accepted overlaps are found in the data, and that there are no new unknown overlaps. - error = "Either the list of accepted overlaps is not found in the data, or there are new unknown overlaps." - assert ACCEPTED_OVERLAPS == all_overlaps, error - - if aggregates is None: - # If aggregations are not specified, assume all variables are to be aggregated, by summing. - aggregates = {column: "sum" for column in data.columns if column not in index_columns} - - for region in regions_to_add: - # List of countries in region. - countries_in_region = get_countries_in_region( - region=region, region_modifications=regions_to_add[region], df_regions=df_regions, df_income=df_income - ) - # Select rows of data for member countries. - data_region = data[data[country_column].isin(countries_in_region)] - - # Add region aggregates.
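For reference, the aggregation this loop performs on the lines that follow (select members, group by the remaining index columns, sum, relabel as the region, append) looks like this as a self-contained toy example:

import pandas as pd

data = pd.DataFrame({"country": ["France", "Germany"], "year": [2021, 2021], "coal__twh": [1.0, 9.0]})

# Aggregate member countries into a single "Europe" row, then append it.
members = ["France", "Germany"]
region_df = (
    data[data["country"].isin(members)]
    .groupby("year", as_index=False)
    .agg({"coal__twh": "sum"})
    .assign(country="Europe")
)
data = pd.concat([data, region_df], ignore_index=True)
print(data)  # France 1.0, Germany 9.0, Europe 10.0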
- region_df = ( - data_region.groupby([column for column in index_columns if column != country_column]) - .agg(aggregates) - .reset_index() - .assign(**{country_column: region}) - ) - data = pd.concat([data, region_df], ignore_index=True) - - return data - - -def _expand_combinations_in_amendments( - amendments: List[Tuple[Dict[Any, Any], Dict[Any, Any]]] -) -> List[Tuple[Dict[Any, Any], Dict[Any, Any]]]: - """When values in amendments are given as lists, explode them to have all possible combinations of values.""" - amendments_expanded = [] - for wrong_row, corrected_row in amendments: - field, values = zip(*wrong_row.items()) - for amendment_single in [dict(zip(field, value)) for value in itertools.product(*values)]: - amendments_expanded.append((amendment_single, corrected_row)) - - return amendments_expanded - - -def correct_data_points(df: pd.DataFrame, corrections: List[Tuple[Dict[Any, Any], Dict[Any, Any]]]) -> pd.DataFrame: - """Make individual corrections to data points in a dataframe. - - Parameters - ---------- - df : pd.DataFrame - Data to be corrected. - corrections : List[Tuple[Dict[Any, Any], Dict[Any, Any]]] - Corrections. - - Returns - ------- - corrected_df : pd.DataFrame - Corrected data. - - """ - corrected_df = df.copy() - - corrections_expanded = _expand_combinations_in_amendments(amendments=corrections) - for wrong_row, corrected_row in corrections_expanded: - # Select the row in the dataframe where the wrong data point is. - # The 'fillna(False)' is added because otherwise rows that do not fulfil the selection will create ambiguity. - selection = corrected_df.loc[(corrected_df[list(wrong_row)] == pd.Series(wrong_row)).fillna(False).all(axis=1)] - # Sanity check. - error = "Either raw data has been corrected, or dictionary selecting wrong row is ambiguous." - assert len(selection) == 1, error - - # Replace wrong fields by the corrected ones. - # Note: Changes to categorical fields will not work. 
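_expand_combinations_in_amendments turns one selector with list-valued fields into one selector per combination via itertools.product; this is what lets the AMENDMENTS in the yearly electricity step (further below) blank out every region-year pair with a single entry. The core expansion, runnable on its own:

import itertools

# One amendment selecting several (country, year) combinations at once.
wrong_row = {"country": ["Africa", "Asia"], "year": [2022]}

fields, values = zip(*wrong_row.items())
expanded = [dict(zip(fields, combo)) for combo in itertools.product(*values)]
print(expanded)  # [{'country': 'Africa', 'year': 2022}, {'country': 'Asia', 'year': 2022}]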
- corrected_df.loc[selection.index, list(corrected_row)] = list(corrected_row.values()) - - return corrected_df diff --git a/etl/steps/archive/garden/ember/2023-06-01/yearly_electricity.countries.json b/etl/steps/archive/garden/ember/2023-06-01/yearly_electricity.countries.json deleted file mode 100644 index acac8f4f0b2..00000000000 --- a/etl/steps/archive/garden/ember/2023-06-01/yearly_electricity.countries.json +++ /dev/null @@ -1,229 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "American Samoa": "American Samoa", - "Angola": "Angola", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas (the)": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Bosnia Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cayman Islands (the)": "Cayman Islands", - "Central African Republic (the)": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros (the)": "Comoros", - "Congo (the Democratic Republic of the)": "Democratic Republic of Congo", - "Congo (the)": "Congo", - "Cook Islands (the)": "Cook Islands", - "Costa Rica": "Costa Rica", - "Cote d'Ivoire": "Cote d'Ivoire", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cyprus": "Cyprus", - "Czechia": "Czechia", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic (the)": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Falkland Islands (the) [Malvinas]": "Falkland Islands", - "Faroe Islands (the)": "Faroe Islands", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Guiana": "French Guiana", - "French Polynesia": "French Polynesia", - "Gabon": "Gabon", - "Gambia (the)": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Gibraltar": "Gibraltar", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guam": "Guam", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hong Kong": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran (Islamic Republic of)": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Korea (the Democratic People's Republic of)": "North Korea", - "Kosovo": "Kosovo", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Lao People's Democratic Republic (the)": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": 
"Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macao": "Macao", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mexico": "Mexico", - "Moldova": "Moldova", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger (the)": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palestine, State of": "Palestine", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines (the)": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Reunion": "Reunion", - "Romania": "Romania", - "Russian Federation (the)": "Russia", - "Rwanda": "Rwanda", - "Saint Helena, Ascension and Tristan da Cunha": "Saint Helena", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Korea": "South Korea", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan (the)": "Sudan", - "Suriname": "Suriname", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syrian Arab Republic (the)": "Syria", - "Taiwan": "Taiwan", - "Tajikistan": "Tajikistan", - "Tanzania, the United Republic of": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Turks and Caicos Islands (the)": "Turks and Caicos Islands", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United States of America": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela (Bolivarian Republic of)": "Venezuela", - "Viet Nam": "Vietnam", - "Virgin Islands (British)": "British Virgin Islands", - "Virgin Islands (U.S.)": "United States Virgin Islands", - "Western Sahara": "Western Sahara", - "World": "World", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "OECD": "OECD (Ember)", - "Africa": "Africa (Ember)", - "Asia": "Asia (Ember)", - "EU": "European Union (27) (Ember)", - "Europe": "Europe (Ember)", - "G20": "G20 (Ember)", - "G7": "G7 (Ember)", - "Latin America and Caribbean": "Latin America and Caribbean (Ember)", - "Middle East": "Middle East (Ember)", - "North 
America": "North America (Ember)", - "Oceania": "Oceania (Ember)" -} diff --git a/etl/steps/archive/garden/ember/2023-06-01/yearly_electricity.meta.yml b/etl/steps/archive/garden/ember/2023-06-01/yearly_electricity.meta.yml deleted file mode 100644 index 1cff8a10da4..00000000000 --- a/etl/steps/archive/garden/ember/2023-06-01/yearly_electricity.meta.yml +++ /dev/null @@ -1,431 +0,0 @@ -dataset: - title: Yearly Electricity Data (Ember, 2023b) - description: | - [Ember's region definitions](https://ember-climate.org/countries-and-regions/), denoted with "(Ember)", are: - * "G20 (Ember)" - Group of Twenty: Argentina, Australia, Brazil, Canada, China, France, Germany, India, Indonesia, Italy, Japan, Mexico, Russia, Saudi Arabia, South Africa, South Korea, Turkey, United Kingdom, United States and the 27 members of the European Union. - * "G7 (Ember)" - Group of Seven: Canada, France, Germany, Italy, Japan, United Kingdom and United States. - * "Latin America and Caribbean (Ember)": Antigua and Barbuda, Argentina, Bahamas, Barbados, Belize, Bolivia, Brazil, Chile, Colombia, Costa Rica, Cuba, Dominica, Dominican Republic, Ecuador, El Salvador, Grenada, Guatemala, Guyana, Haiti, Honduras, Jamaica, Mexico, Nicaragua, Panama, Paraguay, Peru, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Suriname, Trinidad and Tobago, Uruguay, Venezuela, Aruba, British Virgin Islands, Cayman Islands, Falkland Islands, French Guiana, Guadeloupe, Martinique, Montserrat, Puerto Rico, Turks and Caicos Islands and United States Virgin Islands. - * "OECD (Ember)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, United Kingdom, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, and United States. 
- -tables: - capacity: - title: "Capacity" - variables: - clean__gw: - title: Clean (GW) - short_unit: GW - unit: gigawatts - display: - name: Clean - fossil__gw: - title: Fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Fossil - gas_and_other_fossil__gw: - title: Gas and Other Fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas and Other Fossil - hydro__bioenergy_and_other_renewables__gw: - title: Hydro, Bioenergy and Other Renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro, Bioenergy and Other Renewables - renewables__gw: - title: Renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Renewables - wind_and_solar__gw: - title: Wind and Solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind and Solar - bioenergy__gw: - title: Bioenergy (GW) - short_unit: GW - unit: gigawatts - display: - name: Bioenergy - coal__gw: - title: Coal (GW) - short_unit: GW - unit: gigawatts - display: - name: Coal - gas__gw: - title: Gas (GW) - short_unit: GW - unit: gigawatts - display: - name: Gas - hydro__gw: - title: Hydro (GW) - short_unit: GW - unit: gigawatts - display: - name: Hydro - nuclear__gw: - title: Nuclear (GW) - short_unit: GW - unit: gigawatts - display: - name: Nuclear - other_fossil__gw: - title: Other Fossil (GW) - short_unit: GW - unit: gigawatts - display: - name: Other Fossil - other_renewables__gw: - title: Other Renewables (GW) - short_unit: GW - unit: gigawatts - display: - name: Other Renewables - solar__gw: - title: Solar (GW) - short_unit: GW - unit: gigawatts - display: - name: Solar - wind__gw: - title: Wind (GW) - short_unit: GW - unit: gigawatts - display: - name: Wind - electricity_demand: - title: Electricity demand - variables: - demand__twh: - title: Demand (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Demand - population: - title: Population - short_unit: people - unit: people - demand_per_capita__kwh: - title: Demand per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Demand per capita - electricity_generation: - title: Electricity generation - variables: - clean__pct: - title: Clean (%) - short_unit: '%' - unit: '%' - display: - name: Clean - fossil__pct: - title: Fossil (%) - short_unit: '%' - unit: '%' - display: - name: Fossil - gas_and_other_fossil__pct: - title: Gas and Other Fossil (%) - short_unit: '%' - unit: '%' - display: - name: Gas and Other Fossil - hydro__bioenergy_and_other_renewables__pct: - title: Hydro, Bioenergy and Other Renewables (%) - short_unit: '%' - unit: '%' - display: - name: Hydro, Bioenergy and Other Renewables - renewables__pct: - title: Renewables (%) - short_unit: '%' - unit: '%' - display: - name: Renewables - wind_and_solar__pct: - title: Wind and Solar (%) - short_unit: '%' - unit: '%' - display: - name: Wind and Solar - clean__twh: - title: Clean (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Clean - fossil__twh: - title: Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Fossil - gas_and_other_fossil__twh: - title: Gas and Other Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas and Other Fossil - hydro__bioenergy_and_other_renewables__twh: - title: Hydro, Bioenergy and Other Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro, Bioenergy and Other Renewables - renewables__twh: - title: Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Renewables - wind_and_solar__twh: - title: 
Wind and Solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind and Solar - bioenergy__pct: - title: Bioenergy (%) - short_unit: '%' - unit: '%' - display: - name: Bioenergy - coal__pct: - title: Coal (%) - short_unit: '%' - unit: '%' - display: - name: Coal - gas__pct: - title: Gas (%) - short_unit: '%' - unit: '%' - display: - name: Gas - hydro__pct: - title: Hydro (%) - short_unit: '%' - unit: '%' - display: - name: Hydro - nuclear__pct: - title: Nuclear (%) - short_unit: '%' - unit: '%' - display: - name: Nuclear - other_fossil__pct: - title: Other Fossil (%) - short_unit: '%' - unit: '%' - display: - name: Other Fossil - other_renewables__pct: - title: Other Renewables (%) - short_unit: '%' - unit: '%' - display: - name: Other Renewables - solar__pct: - title: Solar (%) - short_unit: '%' - unit: '%' - display: - name: Solar - wind__pct: - title: Wind (%) - short_unit: '%' - unit: '%' - display: - name: Wind - bioenergy__twh: - title: Bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Bioenergy - coal__twh: - title: Coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - gas__twh: - title: Gas (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas - hydro__twh: - title: Hydro (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydro - nuclear__twh: - title: Nuclear (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - other_fossil__twh: - title: Other Fossil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other Fossil - other_renewables__twh: - title: Other Renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other Renewables - solar__twh: - title: Solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - wind__twh: - title: Wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - total_generation__twh: - title: Total Generation (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total Generation - electricity_imports: - title: Electricity imports - variables: - net_imports__twh: - title: Net Imports (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Net Imports - power_sector_emissions: - title: Power sector emissions - variables: - clean__mtco2: - title: Clean (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Clean - fossil__mtco2: - title: Fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Fossil - gas_and_other_fossil__mtco2: - title: Gas and Other Fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Gas and Other Fossil - hydro__bioenergy_and_other_renewables__mtco2: - title: Hydro, Bioenergy and Other Renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro, Bioenergy and Other Renewables - renewables__mtco2: - title: Renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Renewables - wind_and_solar__mtco2: - title: Wind and Solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind and Solar - bioenergy__mtco2: - title: Bioenergy (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Bioenergy - coal__mtco2: - title: Coal (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Coal - gas__mtco2: - title: Gas (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 
equivalent - display: - name: Gas - hydro__mtco2: - title: Hydro (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Hydro - nuclear__mtco2: - title: Nuclear (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Nuclear - other_fossil__mtco2: - title: Other Fossil (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other Fossil - other_renewables__mtco2: - title: Other Renewables (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Other Renewables - solar__mtco2: - title: Solar (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Solar - wind__mtco2: - title: Wind (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Wind - total_emissions__mtco2: - title: Total emissions (mtCO2) - short_unit: mtCO2 - unit: megatonnes of CO2 equivalent - display: - name: Total emissions - total_generation__twh: - title: Total Generation (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Total Generation - co2_intensity__gco2_kwh: - title: CO2 intensity (gCO2/kWh) - short_unit: gCO2/kWh - unit: grams of CO2 equivalent per kilowatt-hour - display: - name: CO2 intensity diff --git a/etl/steps/archive/garden/ember/2023-06-01/yearly_electricity.py b/etl/steps/archive/garden/ember/2023-06-01/yearly_electricity.py deleted file mode 100644 index 2e7178227ed..00000000000 --- a/etl/steps/archive/garden/ember/2023-06-01/yearly_electricity.py +++ /dev/null @@ -1,514 +0,0 @@ -"""Garden step for Ember's Yearly Electricity Data. - -""" - -import numpy as np -import pandas as pd -from owid import catalog -from owid.catalog import Dataset, Table -from shared import add_population, add_region_aggregates, correct_data_points -from structlog import get_logger - -from etl.data_helpers import geo -from etl.helpers import PathFinder, create_dataset - -# Initialize log. -log = get_logger() - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Aggregate regions to add, following OWID definitions. -# Regions and income groups to create by aggregating contributions from member countries. -# In the following dictionary, if nothing is stated, the region is supposed to be a default continent/income group. -# Otherwise, the dictionary can have "regions_included", "regions_excluded", "countries_included", and -# "countries_excluded". The aggregates will be calculated on the resulting countries. -REGIONS = { - # Default continents. - "Africa": {}, - "Asia": {}, - "Europe": {}, - "European Union (27)": {}, - "North America": {}, - "Oceania": {}, - "South America": {}, - # Ember already has data for "World". - # "World": {}, - # Income groups. - "Low-income countries": {}, - "Upper-middle-income countries": {}, - "Lower-middle-income countries": {}, - "High-income countries": {}, -} - -# Corrections to the output tables. -# They are all the same correction: Remove region aggregates for the latest year, given that many countries are not -# informed, which causes the aggregates to be unreliable -# (e.g. generation__total_generation__twh in Africa drops in 2022 because only a few countries are informed). 
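The AMENDMENTS below implement the comment above: for every region in REGIONS, all value columns of the affected tables are set to missing for AFFECTED_YEAR, while country-level rows are kept. In plain pandas the same blanking looks like this (toy frame, hypothetical values; the step itself assigns pd.NA via correct_data_points):

import numpy as np
import pandas as pd

REGIONS = ["Africa", "Asia"]
AFFECTED_YEAR = 2022

df = pd.DataFrame({"country": ["Africa", "Spain"], "year": [2022, 2022], "Clean (GW)": [10.0, 3.0]})

# Blank region aggregates for the unreliable latest year; country rows are untouched.
mask = df["country"].isin(REGIONS) & (df["year"] == AFFECTED_YEAR)
df.loc[mask, "Clean (GW)"] = np.nan
print(df)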
-AFFECTED_YEAR = 2022 -AMENDMENTS = { - "Capacity": [ - ( - { - "country": list(REGIONS), - "year": [AFFECTED_YEAR], - }, - { - "Clean (GW)": pd.NA, - "Fossil (GW)": pd.NA, - "Gas and Other Fossil (GW)": pd.NA, - "Hydro, Bioenergy and Other Renewables (GW)": pd.NA, - "Renewables (GW)": pd.NA, - "Wind and Solar (GW)": pd.NA, - "Bioenergy (GW)": pd.NA, - "Coal (GW)": pd.NA, - "Gas (GW)": pd.NA, - "Hydro (GW)": pd.NA, - "Nuclear (GW)": pd.NA, - "Other Fossil (GW)": pd.NA, - "Other Renewables (GW)": pd.NA, - "Solar (GW)": pd.NA, - "Wind (GW)": pd.NA, - }, - ) - ], - "Electricity demand": [ - ( - { - "country": list(REGIONS), - "year": [AFFECTED_YEAR], - }, - { - "Demand (TWh)": pd.NA, - "population": pd.NA, - "Demand per capita (kWh)": pd.NA, - }, - ) - ], - "Electricity generation": [ - ( - { - "country": list(REGIONS), - "year": [AFFECTED_YEAR], - }, - { - "Clean (%)": pd.NA, - "Fossil (%)": pd.NA, - "Gas and Other Fossil (%)": pd.NA, - "Hydro, Bioenergy and Other Renewables (%)": pd.NA, - "Renewables (%)": pd.NA, - "Wind and Solar (%)": pd.NA, - "Clean (TWh)": pd.NA, - "Fossil (TWh)": pd.NA, - "Gas and Other Fossil (TWh)": pd.NA, - "Hydro, Bioenergy and Other Renewables (TWh)": pd.NA, - "Renewables (TWh)": pd.NA, - "Wind and Solar (TWh)": pd.NA, - "Bioenergy (%)": pd.NA, - "Coal (%)": pd.NA, - "Gas (%)": pd.NA, - "Hydro (%)": pd.NA, - "Nuclear (%)": pd.NA, - "Other Fossil (%)": pd.NA, - "Other Renewables (%)": pd.NA, - "Solar (%)": pd.NA, - "Wind (%)": pd.NA, - "Bioenergy (TWh)": pd.NA, - "Coal (TWh)": pd.NA, - "Gas (TWh)": pd.NA, - "Hydro (TWh)": pd.NA, - "Nuclear (TWh)": pd.NA, - "Other Fossil (TWh)": pd.NA, - "Other Renewables (TWh)": pd.NA, - "Solar (TWh)": pd.NA, - "Wind (TWh)": pd.NA, - "Total Generation (TWh)": pd.NA, - }, - ), - ], - "Electricity imports": [ - ( - { - "country": list(REGIONS), - "year": [AFFECTED_YEAR], - }, - { - "Net Imports (TWh)": np.nan, - }, - ), - ], - "Power sector emissions": [ - ( - { - "country": list(REGIONS), - "year": [AFFECTED_YEAR], - }, - { - "Clean (mtCO2)": pd.NA, - "Fossil (mtCO2)": pd.NA, - "Gas and Other Fossil (mtCO2)": pd.NA, - "Hydro, Bioenergy and Other Renewables (mtCO2)": pd.NA, - "Renewables (mtCO2)": pd.NA, - "Wind and Solar (mtCO2)": pd.NA, - "Bioenergy (mtCO2)": pd.NA, - "Coal (mtCO2)": pd.NA, - "Gas (mtCO2)": pd.NA, - "Hydro (mtCO2)": pd.NA, - "Nuclear (mtCO2)": pd.NA, - "Other Fossil (mtCO2)": pd.NA, - "Other Renewables (mtCO2)": pd.NA, - "Solar (mtCO2)": pd.NA, - "Wind (mtCO2)": pd.NA, - "Total emissions (mtCO2)": pd.NA, - "Total Generation (TWh)": pd.NA, - "CO2 intensity (gCO2/kWh)": pd.NA, - }, - ), - ], -} - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 -# Megatonnes to grams. -MT_TO_G = 1e12 - -# Columns to use from Ember's yearly electricity data, and how to rename them. -COLUMNS_YEARLY_ELECTRICITY = { - "area": "country", - "year": "year", - "variable": "variable", - "value": "value", - "unit": "unit", - "category": "category", - "subcategory": "subcategory", -} - -# Map units (short version) to unit name (long version). -SHORT_UNIT_TO_UNIT = { - "TWh": "terawatt-hours", - "MWh": "megawatt-hours", - "kWh": "kilowatt-hours", - "mtCO2": "megatonnes of CO2 equivalent", - "gCO2/kWh": "grams of CO2 equivalent per kilowatt-hour", - "GW": "gigawatts", - "%": "%", -} - -# Categories expected to exist in the data. 
-CATEGORIES = [ - "Capacity", - "Electricity demand", - "Electricity generation", - "Electricity imports", - "Power sector emissions", -] - -# Choose columns for which region aggregates should be created. -SUM_AGGREGATES = [ - # "Bioenergy (%)", - "Bioenergy (GW)", - "Bioenergy (TWh)", - "Bioenergy (mtCO2)", - # "CO2 intensity (gCO2/kWh)", - # "Clean (%)", - "Clean (GW)", - "Clean (TWh)", - "Clean (mtCO2)", - # "Coal (%)", - "Coal (GW)", - "Coal (TWh)", - "Coal (mtCO2)", - "Demand (TWh)", - # "Demand per capita (MWh)", - # "Fossil (%)", - "Fossil (GW)", - "Fossil (TWh)", - "Fossil (mtCO2)", - # "Gas (%)", - "Gas (GW)", - "Gas (TWh)", - "Gas (mtCO2)", - "Gas and Other Fossil (%)", - "Gas and Other Fossil (GW)", - "Gas and Other Fossil (TWh)", - "Gas and Other Fossil (mtCO2)", - # "Hydro (%)", - "Hydro (GW)", - "Hydro (TWh)", - "Hydro (mtCO2)", - "Hydro, Bioenergy and Other Renewables (%)", - "Hydro, Bioenergy and Other Renewables (GW)", - "Hydro, Bioenergy and Other Renewables (TWh)", - "Hydro, Bioenergy and Other Renewables (mtCO2)", - "Net Imports (TWh)", - # "Nuclear (%)", - "Nuclear (GW)", - "Nuclear (TWh)", - "Nuclear (mtCO2)", - # "Other Fossil (%)", - "Other Fossil (GW)", - "Other Fossil (TWh)", - "Other Fossil (mtCO2)", - # "Other Renewables (%)", - "Other Renewables (GW)", - "Other Renewables (TWh)", - "Other Renewables (mtCO2)", - # "Renewables (%)", - "Renewables (GW)", - "Renewables (TWh)", - "Renewables (mtCO2)", - # "Solar (%)", - "Solar (GW)", - "Solar (TWh)", - "Solar (mtCO2)", - "Total Generation (TWh)", - "Total emissions (mtCO2)", - # "Wind (%)", - "Wind (GW)", - "Wind (TWh)", - "Wind (mtCO2)", - # "Wind and Solar (%)", - "Wind and Solar (GW)", - "Wind and Solar (TWh)", - "Wind and Solar (mtCO2)", -] - - -def make_wide_table(tb: Table, category: str, tb_regions: Table, tb_income: Table) -> Table: - """Convert data from long to wide format for a specific category. - - This is a common processing for all categories in the data. - - Parameters - ---------- - tb : Table - Data, after harmonizing country names. - category : str - Name of category (as defined above in CATEGORIES) to process. - tb_regions : Table - Countries-regions data. - tb_income : Table - Data on income group definitions. - - Returns - ------- - table : Table - Table in wide format. - - """ - # Select data for given category. - _tb = tb[tb["category"] == category].copy() - - # Pivot dataframe to have a column for each variable. - table = Table(_tb.pivot(index=["country", "year"], columns=["variable", "unit"], values="value")) - - # Get variable names, units, and variable-units (a name that combines both) for each column. - variable_units = [f"{variable} ({unit})" for variable, unit in table.columns] - - # Sanity check. - variables = table.columns.get_level_values(0).tolist() - units = table.columns.get_level_values(1).tolist() - assert len(variable_units) == len(units) == len(variables) - - # Collapse the two column levels into one, with the naming "variable (unit)" (except for country and year, that - # have no units and are the indexes of the table). - table.columns = variable_units - - # Add region aggregates. 
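`add_region_aggregates` (also from the archived `shared.py`) resolves region membership through `df_regions` and `df_income`; reduced to the summing behavior it is used for here, the idea is roughly the sketch below, where the membership mapping is a made-up stand-in for the real region and income-group tables:

```python
import pandas as pd


def add_region_sums_sketch(df: pd.DataFrame, region_members: dict, value_columns: list) -> pd.DataFrame:
    """Append one row per region and year holding the sum over its member countries."""
    frames = [df]
    for region, members in region_members.items():
        aggregate = (
            df[df["country"].isin(members)]
            .groupby("year", as_index=False)[value_columns]
            .sum(min_count=1)  # keep NaN when no member country is informed
            .assign(country=region)
        )
        frames.append(aggregate)
    return pd.concat(frames, ignore_index=True)


# Toy usage with a hypothetical membership list.
data = pd.DataFrame({"country": ["Spain", "France"], "year": [2020, 2020], "Coal (TWh)": [10.0, 5.0]})
data = add_region_sums_sketch(data, {"Europe": ["Spain", "France"]}, ["Coal (TWh)"])
```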
- aggregates = {column: "sum" for column in SUM_AGGREGATES if column in table.columns} - - table = add_region_aggregates( - data=table.reset_index(), - index_columns=["country", "year"], - regions_to_add=REGIONS, - aggregates=aggregates, - df_regions=tb_regions, - df_income=tb_income.rename(columns={"classification": "income_group"}), - ) - - return table - - -def make_table_electricity_generation(tb: Table, tb_regions: Table, tb_income: Table) -> Table: - """Create table with processed data of category "Electricity generation". - - Parameters - ---------- - tb : Table - Data in long format for all categories, after harmonizing country names. - tb_regions : Table - Countries-regions data. - tb_income : Table - Data on income group definitions. - - Returns - ------- - table : Table - Table of processed data for the given category. - - """ - # Prepare wide table. - table = make_wide_table(tb=tb, category="Electricity generation", tb_regions=tb_regions, tb_income=tb_income) - - # Recalculate the share of electricity generates for region aggregates. - for column in table.columns: - if "(%)" in column: - # Find corresponding column with units instead of percentages. - value_column = column.replace("(%)", "(TWh)") - if value_column not in table.columns: - raise ValueError(f"Column {value_column} not found.") - # Select only regions. - select_regions = table["country"].isin(list(REGIONS)) - table.loc[select_regions, column] = table[value_column] / table["Total Generation (TWh)"] * 100 - - return table - - -def make_table_electricity_demand(tb: Table, population: Table, tb_regions: Table, tb_income: Table) -> Table: - """Create table with processed data of category "Electricity demand". - - Parameters - ---------- - tb : Table - Data in long format for all categories, after harmonizing country names. - tb_regions : Table - Countries-regions data. - tb_income : Table - Data on income group definitions. - - Returns - ------- - table : Table - Table of processed data for the given category. - - """ - # Prepare wide table. - table = make_wide_table(tb=tb, category="Electricity demand", tb_regions=tb_regions, tb_income=tb_income) - - # Add population to data - table = add_population(df=table, population=population, warn_on_missing_countries=False) - - # Recalculate demand per capita. - # We could do this only for region aggregates (since they do not have per capita values), - # but we do this for all countries, to ensure per-capita variables are consistent with our population data. - table["Demand per capita (kWh)"] = ( - pd.DataFrame(table)["Demand (TWh)"] * TWH_TO_KWH / pd.DataFrame(table)["population"] - ) - - # Delete the original demand per capita column. - table = table.drop(columns=["Demand per capita (MWh)"]) - - return table - - -def make_table_power_sector_emissions(tb: Table, tb_regions: Table, tb_income: Table) -> Table: - """Create table with processed data of category "Power sector emissions". - - Parameters - ---------- - tb : Table - Data in long format for all categories, after harmonizing country names. - tb_regions : Table - Countries-regions data. - tb_income : Table - Data on income group definitions. - - Returns - ------- - table : Table - Table of processed data for the given category. - - """ - # Prepare wide table of emissions data. - table = make_wide_table(tb=tb, category="Power sector emissions", tb_regions=tb_regions, tb_income=tb_income) - - # Add carbon intensity. 
- # In principle this only needs to be done for region aggregates, but we do it for all countries and check that - # the results are consistent with the original data. - # Prepare wide table also for electricity generation (required to calculate carbon intensity). - electricity = make_wide_table(tb=tb, category="Electricity generation", tb_regions=tb_regions, tb_income=tb_income)[ - ["country", "year", "Total Generation (TWh)"] - ] - # Add total electricity generation to emissions table. - table = pd.merge(table, electricity, on=["country", "year"], how="left") - # Rename the original carbon intensity column as a temporary column called "check". - intensity_col = "CO2 intensity (gCO2/kWh)" - table = table.rename(columns={intensity_col: "check"}) - # Calculate carbon intensity for all countries and regions. - table[intensity_col] = ( - pd.DataFrame(table)["Total emissions (mtCO2)"] * MT_TO_G / (table["Total Generation (TWh)"] * TWH_TO_KWH) - ) - - # Check that the new carbon intensities agree (within 1 % of mean absolute percentage error, aka MAPE) with the - # original ones (where carbon intensity was given, namely for countries, not aggregate regions). - mape = 100 * abs(table.dropna(subset="check")[intensity_col] - table["check"].dropna()) / table["check"].dropna() - assert mape.max() < 1, "Calculated carbon intensities differ from original ones by more than 1 percent." - - # Remove temporary column. - table = table.drop(columns=["check"]) - - return table - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load dataset from meadow and read its main table. - ds_meadow: Dataset = paths.load_dependency("yearly_electricity") - tb_meadow = ds_meadow["yearly_electricity"].reset_index() - - # Load population dataset and read its main table. - ds_population: Dataset = paths.load_dependency("population") - tb_population = ds_population["population"] - - # Load regions dataset and read its main table. - ds_regions: Dataset = paths.load_dependency("regions") - tb_regions = ds_regions["regions"] - - # Load income groups dataset and read its main table. - ds_income: Dataset = paths.load_dependency("income_groups") - tb_income = ds_income["income_groups_latest"].reset_index() - - # - # Process data. - # - # Select and rename columns conveniently. - tb = tb_meadow[list(COLUMNS_YEARLY_ELECTRICITY)].rename(columns=COLUMNS_YEARLY_ELECTRICITY, errors="raise") - - # Sanity check. - assert set(tb["category"]) == set(CATEGORIES), "Categories have changed in data." - - # Harmonize country names. - tb = geo.harmonize_countries( - df=tb, countries_file=paths.country_mapping_path, warn_on_missing_countries=True, warn_on_unused_countries=True - ) - - # Split data into different tables, one per category, and process each one individually. - tables = { - "Capacity": make_wide_table(tb=tb, category="Capacity", tb_regions=tb_regions, tb_income=tb_income), - "Electricity demand": make_table_electricity_demand( - tb=tb, population=tb_population, tb_regions=tb_regions, tb_income=tb_income - ), - "Electricity generation": make_table_electricity_generation(tb=tb, tb_regions=tb_regions, tb_income=tb_income), - "Electricity imports": make_wide_table( - tb=tb, category="Electricity imports", tb_regions=tb_regions, tb_income=tb_income - ), - "Power sector emissions": make_table_power_sector_emissions(tb=tb, tb_regions=tb_regions, tb_income=tb_income), - } - - # Apply amendments, and set an appropriate index and short name to each table and sort conveniently.
- for table_name in tables: - if table_name in AMENDMENTS: - log.info(f"Applying amendments to table: {table_name}") - tables[table_name] = correct_data_points(df=tables[table_name], corrections=AMENDMENTS[table_name]) - tables[table_name] = tables[table_name].set_index(["country", "year"], verify_integrity=True).sort_index() - tables[table_name].metadata.short_name = catalog.utils.underscore(table_name) - - # - # Save outputs. - # - # Create a new dataset with the same metadata as in Meadow. - ds_garden = create_dataset(dest_dir, tables=list(tables.values()), default_metadata=ds_meadow.metadata) - ds_garden.save() diff --git a/etl/steps/archive/garden/emdat/2022-11-24/natural_disasters.countries.json b/etl/steps/archive/garden/emdat/2022-11-24/natural_disasters.countries.json deleted file mode 100644 index 415443b2a90..00000000000 --- a/etl/steps/archive/garden/emdat/2022-11-24/natural_disasters.countries.json +++ /dev/null @@ -1,227 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "American Samoa": "American Samoa", - "Angola": "Angola", - "Anguilla": "Anguilla", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas (the)": "Bahamas", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia (Plurinational State of)": "Bolivia", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cayman Islands (the)": "Cayman Islands", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros (the)": "Comoros", - "Congo (the Democratic Republic of the)": "Democratic Republic of Congo", - "Congo (the)": "Congo", - "Cook Islands (the)": "Cook Islands", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cyprus": "Cyprus", - "Czechoslovakia": "Czechoslovakia", - "C\u00f4te d\u2019Ivoire": "Cote d'Ivoire", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic (the)": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Guiana": "French Guiana", - "French Polynesia": "French Polynesia", - "Gabon": "Gabon", - "Gambia (the)": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Greece": "Greece", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guam": "Guam", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hong Kong": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran (Islamic Republic of)": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Isle of Man": "Isle of Man", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - 
"Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Korea (the Democratic People's Republic of)": "North Korea", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Lao People's Democratic Republic (the)": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macao": "Macao", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mexico": "Mexico", - "Micronesia (Federated States of)": "Micronesia (country)", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nepal": "Nepal", - "Netherlands Antilles": "Netherlands Antilles", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger (the)": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Palestine, State of": "Palestine", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines (the)": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Romania": "Romania", - "Russian Federation (the)": "Russia", - "Rwanda": "Rwanda", - "R\u00e9union": "Reunion", - "Saint Barth\u00e9lemy": "Saint Barthelemy", - "Saint Helena, Ascension and Tristan da Cunha": "Saint Helena", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Martin (French Part)": "Saint Martin (French part)", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan (the)": "Sudan", - "Suriname": "Suriname", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syrian Arab Republic": "Syria", - "Taiwan (Province of China)": "Taiwan", - "Tajikistan": "Tajikistan", - "Tanzania, United Republic of": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tokelau": "Tokelau", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Turks and Caicos Islands (the)": "Turks and Caicos Islands", - "Tuvalu": "Tuvalu", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela (Bolivarian Republic of)": "Venezuela", - "Viet Nam": "Vietnam", - "Wallis and Futuna": "Wallis and Futuna", - "Yemen": "Yemen", - "Yugoslavia": "Yugoslavia", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "Azores Islands": "Azores Islands", - "Canary Is": "Canary Islands", - "Czech Republic (the)": 
"Czechia", - "Germany Dem Rep": "East Germany", - "Germany Fed Rep": "West Germany", - "Korea (the Republic of)": "South Korea", - "Macedonia (the former Yugoslav Republic of)": "North Macedonia", - "Marshall Islands (the)": "Marshall Islands", - "Moldova (the Republic of)": "Moldova", - "Netherlands (the)": "Netherlands", - "Northern Mariana Islands (the)": "Northern Mariana Islands", - "Serbia Montenegro": "Serbia and Montenegro", - "Soviet Union": "USSR", - "United Arab Emirates (the)": "United Arab Emirates", - "United Kingdom of Great Britain and Northern Ireland (the)": "United Kingdom", - "United States of America (the)": "United States", - "Virgin Island (British)": "British Virgin Islands", - "Virgin Island (U.S.)": "United States Virgin Islands", - "Yemen Arab Rep": "North Yemen", - "Yemen P Dem Rep": "South Yemen" -} diff --git a/etl/steps/archive/garden/emdat/2022-11-24/natural_disasters.meta.yml b/etl/steps/archive/garden/emdat/2022-11-24/natural_disasters.meta.yml deleted file mode 100644 index 2a62358c499..00000000000 --- a/etl/steps/archive/garden/emdat/2022-11-24/natural_disasters.meta.yml +++ /dev/null @@ -1,220 +0,0 @@ -all_sources: -- emdat: &source-emdat - name: EM-DAT, CRED / UCLouvain, Brussels, Belgium - url: https://emdat.be/ - date_accessed: '2022-11-27' - publication_date: '2022-11-24' - publication_year: 2022 - description: &description-emdat | - EM-DAT data includes all categories classified as "natural disasters" (distinguished from technological disasters, such as oil spills and industrial accidents). This includes those from drought, earthquakes, extreme temperatures, extreme weather, floods, fogs, glacial lake outbursts, landslide, dry mass movements, volcanic activity, and wildfires. - - Disaster-related deaths from EM-DAT have been normalized by Our World in Data to global population size based on [different sources](https://ourworldindata.org/population-sources). This provides data in terms of cases per 100,000 people. - - Our World in Data has also calculated economic damage metrics adjusted for gross domestic product (GDP), using GDP data from [the World Bank's World Development Indicators](http://data.worldbank.org/data-catalog/world-development-indicators). - - Latest update: 2022-12-06. - This dataset is updated regularly. On Our World in Data, given that we only show yearly (or decadal) data, we will update this dataset on a yearly basis. At the link above you can directly access the source page and see the latest available data. - - EM-DAT defines the following variables as: - - + Affected: People requiring immediate assistance during a period of emergency, i.e. requiring basic survival needs such as food, water, shelter, sanitation and immediate medical assistance. - - + Injured: People suffering from physical injuries, trauma or an illness requiring immediate medical assistance as a direct result of a disaster. - - + Homeless: Number of people whose house is destroyed or heavily damaged and therefore need shelter after an event. - - + Total affected: In EM-DAT, it is the sum of the injured, affected and left homeless after a disaster. - - + Estimated economic damage: The amount of damage to property, crops, and livestock. In EM-DAT estimated damage are given in US$ ('000). For each disaster, the registered figure corresponds to the damage value at the moment of the event, i.e. the figures are shown true to the year of the event. - - + Total deaths: In EM-DAT, it is the sum of deaths and missing. 
- - EM-DAT defines the following types of disasters as: - - + Drought: An extended period of unusually low precipitation that produces a shortage of water for people, animals and plants. Drought is different from most other hazards in that it develops slowly, sometimes even over years, and its onset is generally difficult to detect. Drought is not solely a physical phenomenon because its impacts can be exacerbated by human activities and water supply demands. Drought is therefore often defined both conceptually and operationally. Operational definitions of drought, meaning the degree of precipitation reduction that constitutes a drought, vary by locality, climate and environmental sector. - - + Earthquake: Sudden movement of a block of the Earth's crust along a geological fault and associated ground shaking. - - + Extreme temperature: Extreme temperature. - - + Flood: A general term for the overflow of water from a stream channel onto normally dry land in the floodplain (riverine flooding), higher-than-normal levels along the coast and in lakes or reservoirs (coastal flooding) as well as ponding of water at or near the point where the rain fell (flash floods). - - + Fog: Water droplets that are suspended in the air near the Earth's surface. Fog is simply a cloud that is in contact with the ground. - - + Glacial lake outburst: A flood that occurs when water dammed by a glacier or moraine is suddenly released. Glacial lakes can be at the front of the glacier (marginal lake) or below the ice sheet (sub-glacial lake). - - + Landslide: Any kind of moderate to rapid soil movement incl. lahar, mudslide, debris flow. A landslide is the movement of soil or rock controlled by gravity and the speed of the movement usually ranges between slow and rapid, but not very slow. It can be superficial or deep, but the materials have to make up a mass that is a portion of the slope or the slope itself. The movement has to be downward and outward with a free face. - - + Mass movement: Any type of downslope movement of earth materials. - - + Extreme weather: Storm. - - + Volcanic activity: A type of volcanic event near an opening/vent in the Earth's surface including volcanic eruptions of lava, ash, hot vapour, gas, and pyroclastic material. - - + Wildfire: Any uncontrolled and non-prescribed combustion or burning of plants in a natural setting such as a forest, grassland, brush land or tundra, which consumes the natural fuels and spreads based on environmental conditions (e.g., wind, topography). Wildfires can be triggered by lightning or human actions. -- wdi: &source-wdi - name: World Development Indicators - World Bank - url: https://datacatalog.worldbank.org/search/dataset/0037712/World-Development-Indicators - date_accessed: '2022-05-26' - publication_year: 2022 - description: &description-wdi | - The World Development Indicators (WDI) is the primary World Bank collection of development indicators, compiled from officially-recognized international sources. It presents the most current and accurate global development data available, and includes national, regional and global estimates. -- population: &source-population - name: Population (Gapminder, HYDE & UN) - description: &description-population | - Population by country, available from 10,000 BCE to 2100, is based on Gapminder data, HYDE, and UN Population Division (2022) estimates. - - + 10,000 BCE - 1799: Historical estimates by [HYDE (v3.2)](https://dataportaal.pbl.nl/downloads/HYDE/). 
- - + 1800-1949: Historical estimates by [Gapminder (v6)](https://www.gapminder.org/data/documentation/gd003/). - - + 1950-2021: Population records by [the United Nations - Population Division (2022)](https://population.un.org/wpp/Download/Standard/Population/). - - + 2022-2100: Projections based on Medium variant by [the United Nations - Population Division (2022)](https://population.un.org/wpp/Download/Standard/Population/). -dataset: - namespace: emdat - short_name: natural_disasters - title: Natural disasters (EM-DAT, 2022) - description: *description-emdat - licenses: - - name: UCLouvain 2022 - url: https://public.emdat.be/about - version: '2022-11-24' - sources: - - *source-emdat - - *source-wdi - - *source-population - -tables: - natural_disasters_decadal: - variables: &variables-default - total_dead: - title: Total deaths - unit: 'people' - description: "Total number of deaths as a result of a natural disaster. In EM-DAT, it is the sum of deaths and missing." - sources: - - *source-emdat - injured: - title: Injured - unit: 'people' - description: "People suffering from physical injuries, trauma or an illness requiring immediate medical assistance as a direct result of a disaster." - sources: - - *source-emdat - affected: - title: Affected - unit: 'people' - description: "Number of people requiring immediate assistance during a period of emergency, i.e. requiring basic survival needs such as food, water, shelter, sanitation and immediate medical assistance." - sources: - - *source-emdat - homeless: - title: Homeless - unit: 'people' - description: "Number of people whose house is destroyed or heavily damaged and therefore need shelter after an event." - sources: - - *source-emdat - total_affected: - title: Total affected - unit: 'people' - description: "Total number of people affected by a natural disaster. In EM-DAT, it is the sum of the injured, affected and left homeless after a disaster." - sources: - - *source-emdat - reconstruction_costs: - title: Reconstruction costs - short_unit: '$' - unit: 'current US$' - description: "Reconstruction costs." - sources: - - *source-emdat - insured_damages: - title: Insured damages - short_unit: "$" - unit: "current US$" - description: "Insured losses are those which are covered by the insurance sector and paid directly to the owner of the damaged or destroyed property or crops and livestock or the primary insurance company (in case of reinsurance)." - sources: - - *source-emdat - total_damages: - title: "Total economic damage from natural disasters" - short_unit: '$' - unit: 'current US$' - description: "The amount of damage to property, crops, and livestock. In EM-DAT estimated damage are given in US$. For each disaster, the registered figure corresponds to the damage value at the moment of the event, i.e. the figures are shown true to the year of the event." - sources: - - *source-emdat - n_events: - title: Number of reported natural disasters - unit: 'events' - description: "Number of reported natural disasters." 
- sources: - - *source-emdat - population: - title: Population - unit: 'people' - sources: - - *source-population - total_dead_per_100k_people: - title: Total number of deaths per 100,000 people - unit: 'cases per 100k people' - sources: - - *source-emdat - - *source-population - injured_per_100k_people: - title: Number of injured persons per 100,000 people - unit: 'cases per 100k people' - sources: - - *source-emdat - - *source-population - affected_per_100k_people: - title: Number of affected persons per 100,000 people - unit: 'cases per 100k people' - sources: - - *source-emdat - - *source-population - homeless_per_100k_people: - title: Number of homeless persons per 100,000 people - unit: 'cases per 100k people' - sources: - - *source-emdat - - *source-population - total_affected_per_100k_people: - title: Total number of affected persons per 100,000 people - unit: 'cases per 100k people' - sources: - - *source-emdat - - *source-population - n_events_per_100k_people: - title: Number of events per 100,000 people - unit: 'events per 100k people' - sources: - - *source-emdat - - *source-population - gdp: - title: "GDP" - unit: "current US$" - short_unit: "$" - sources: - - *source-wdi - reconstruction_costs_per_gdp: - title: "Reconstruction costs from natural disasters as a share of GDP" - unit: "%" - short_unit: "%" - description: "Reconstruction costs from natural disasters as a share of GDP." - sources: - - *source-emdat - - *source-wdi - insured_damages_per_gdp: - title: "Insured damages from natural disasters as a share of GDP" - unit: "%" - short_unit: "%" - description: "Insured damages from natural disasters as a share of GDP." - sources: - - *source-emdat - - *source-wdi - total_damages_per_gdp: - title: "Total economic damages from natural disasters as a share of GDP" - unit: "%" - short_unit: "%" - description: "Total economic damages from natural disasters as a share of GDP." - sources: - - *source-emdat - - *source-wdi - natural_disasters_yearly: - variables: *variables-default diff --git a/etl/steps/archive/garden/emdat/2022-11-24/natural_disasters.py b/etl/steps/archive/garden/emdat/2022-11-24/natural_disasters.py deleted file mode 100644 index 00c8049742d..00000000000 --- a/etl/steps/archive/garden/emdat/2022-11-24/natural_disasters.py +++ /dev/null @@ -1,695 +0,0 @@ -"""Process and harmonize EM-DAT natural disasters dataset. - -NOTES: -1. We don't have population for some historical regions (e.g. East Germany, or North Yemen). -2. Some issues in the data were detected (see below, we may report them to EM-DAT). Some of them could not be fixed. - Namely, some disasters affect, in one year, a number of people that is larger than the entire population. - For example, the number of people affected by one drought event in Botswana 1981 is 1037300 while population - was 982753. I suppose this could be due to inaccuracies in the estimates of affected people or in the population - (which may not include people living temporarily in the country or visitors). -3. There are some potential issues that can't be fixed: - * On the one hand, we may be underestimating the real impacts of events. The reason is that the original data does - not include zeros. Therefore we can't know if the impacts of a certain event were zero, or unknown. Our only option - is to treat missing data as zeros. - * On the other hand, we may overestimate the real impacts on a given country-year, because events may affect the same - people multiple times during the same year. 
This can't be fixed, but I suppose it's not common. - * Additionally, it is understandable that some values are rough estimates, that some events are not recorded, and - that there may be duplicated events. - -""" - -import datetime - -import numpy as np -import pandas as pd -from owid import catalog -from shared import ( - BUILD_POPULATION_FOR_HISTORICAL_COUNTRIES, - CURRENT_DIR, - EXPECTED_COUNTRIES_WITHOUT_POPULATION, - HISTORIC_TO_CURRENT_REGION, - REGIONS, - add_population, - add_region_aggregates, - correct_data_points, - get_last_day_of_month, -) - -from etl.data_helpers import geo -from etl.helpers import PathFinder -from etl.paths import DATA_DIR - -# Define inputs. -MEADOW_VERSION = "2022-11-24" -WDI_DATASET_PATH = DATA_DIR / "garden/worldbank_wdi/2022-05-26/wdi" -# Define outputs. -VERSION = MEADOW_VERSION - -# List of expected disaster types in the raw data to consider, and how to rename them. -# We consider only natural disasters of subgroups Geophysical, Meteorological, Hydrological and Climatological. -# We therefore ignore Extra-terrestrial (of which there is just one meteorite impact event) and Biological subgroups. -# For completeness, add all existing types here, and rename them as np.nan if they should not be used. -# If new types are included on a data update, simply add them here. -EXPECTED_DISASTER_TYPES = { - "Animal accident": np.nan, - "Drought": "Drought", - "Earthquake": "Earthquake", - "Epidemic": np.nan, - "Extreme temperature": "Extreme temperature", - "Flood": "Flood", - "Fog": "Fog", - "Glacial lake outburst": "Glacial lake outburst", - "Impact": np.nan, - "Insect infestation": np.nan, - "Landslide": "Landslide", - "Mass movement (dry)": "Dry mass movement", - "Storm": "Extreme weather", - "Volcanic activity": "Volcanic activity", - "Wildfire": "Wildfire", -} - -# List of columns to select from raw data, and how to rename them. -COLUMNS = { - "country": "country", - "year": "year", - "type": "type", - "total_dead": "total_dead", - "injured": "injured", - "affected": "affected", - "homeless": "homeless", - "total_affected": "total_affected", - "reconstruction_costs": "reconstruction_costs", - "insured_damages": "insured_damages", - "total_damages": "total_damages", - "start_year": "start_year", - "start_month": "start_month", - "start_day": "start_day", - "end_year": "end_year", - "end_month": "end_month", - "end_day": "end_day", -} - -# Columns of values related to natural disaster impacts. -IMPACT_COLUMNS = [ - "total_dead", - "injured", - "affected", - "homeless", - "total_affected", - "reconstruction_costs", - "insured_damages", - "total_damages", -] - -# Variables related to costs, measured in thousand current US$ (not adjusted for inflation or PPP). -COST_VARIABLES = ["reconstruction_costs", "insured_damages", "total_damages"] - -# Variables to calculate per 100,000 people. -VARIABLES_PER_100K_PEOPLE = [column for column in IMPACT_COLUMNS if column not in COST_VARIABLES] + ["n_events"] - -# New natural disaster type corresponding to the sum of all disasters. -ALL_DISASTERS_TYPE = "all_disasters" - -# List issues found in the data: -# Each element is a tuple with a dictionary that fully identifies the wrong row, -# and another dictionary that specifies the changes. -# Note: Countries here should appear as in the raw data (i.e. not harmonized). -DATA_CORRECTIONS = [ - # The end year of 1969 Morocco earthquake can't be 2019. 
- ({"country": "Morocco", "start_year": 1969, "end_year": 2019, "type": "Earthquake"}, {"end_year": 1969}), - # The date of the 1992 Afghanistan flood can't be September 31. - ({"country": "Afghanistan", "start_year": 1992, "start_month": 9, "start_day": 31}, {"start_day": 3, "end_day": 3}), - # The date of the 1992 India flood can't be September 31. - # Also, there is one entry for 1992 India flood on 1992-09-08 (500 dead) and another for 1992-09 (86 dead). - # They will be treated as separate events (maybe the monthly one refers to other smaller floods that month?). - ({"country": "India", "start_year": 1992, "start_month": 9, "start_day": 8, "end_day": 31}, {"end_day": 8}), - # Sierra Leone epidemic outbreak in november 1996 can't end in April 31. - ( - {"country": "Sierra Leone", "start_year": 1996, "start_month": 11, "end_month": 4, "end_day": 31}, - {"end_day": 30}, - ), - # Peru 1998 epidemic can't end in February 31. - ({"country": "Peru", "start_year": 1998, "start_month": 1, "end_month": 2, "end_day": 31}, {"end_day": 28}), - # India 2017 flood can't end in June 31. - ({"country": "India", "start_year": 2017, "start_month": 6, "end_month": 6, "end_day": 31}, {"end_day": 30}), - # US 2021 wildfires can't end in September 31. - ({"country": "United States of America (the)", "start_year": 2021, "end_month": 9, "end_day": 31}, {"end_day": 30}), - # Cameroon 2012 drought can't end before it started. - # I will remove the month and day, since I can't pinpoint the exact dates. - ( - {"country": "Cameroon", "start_year": 2012, "start_month": 6, "end_month": 1}, - {"start_month": np.nan, "start_day": np.nan, "end_month": np.nan, "end_day": np.nan}, - ), -] -# Other potential issues, where more people were affected than the entire population of the country: -# country | year | type | affected | homeless | population | -# --------------------|-------:|:--------|-----------:|-----------:|-----------------:| -# Antigua and Barbuda | 1983 | Drought | 75000 | 0 | 65426 | -# Botswana | 1981 | Drought | 1037300 | 0 | 982753 | -# Dominica | 2017 | Storm | 71293 | 0 | 70422 | -# Ghana | 1980 | Drought | 12500000 | 0 | 1.18653e+07 | -# Laos | 1977 | Drought | 3500000 | 0 | 3.12575e+06 | -# Mauritania | 1969 | Drought | 1300000 | 0 | 1.08884e+06 | -# Mauritania | 1976 | Drought | 1420000 | 0 | 1.34161e+06 | -# Mauritania | 1980 | Drought | 1600000 | 0 | 1.5067e+06 | -# Montserrat | 1989 | Storm | 0 | 12000 | 10918 | -# Saint Lucia | 2010 | Storm | 181000 | 0 | 170950 | -# Samoa | 1990 | Storm | 170000 | 25000 | 168202 | -# Tonga | 1982 | Storm | 100000 | 46500 | 96951 | -# Finally, there are events registered on the same year for both a historical region and one of its -# successor countries (we are ignoring this issue). -# 1902: {'Azerbaijan', 'USSR'}, -# 1990: {'Tajikistan', 'USSR'}, -# 1991: {'Georgia', 'USSR'}, - -# Get naming conventions. -N = PathFinder(str(CURRENT_DIR / "natural_disasters")) - - -def prepare_input_data(df: pd.DataFrame) -> pd.DataFrame: - """Prepare input data, and fix some known issues.""" - # Select and rename columns. - df = df[list(COLUMNS)].rename(columns=COLUMNS, errors="raise") - - # Correct wrong data points (defined above in DATA_CORRECTIONS). - df = correct_data_points(df=df, corrections=DATA_CORRECTIONS) - - # Remove spurious spaces in entities. - df["type"] = df["type"].str.strip() - - # Sanity check - error = "List of expected disaster types has changed. Consider updating EXPECTED_DISASTER_TYPES." 
- assert set(df["type"]) == set(EXPECTED_DISASTER_TYPES), error - - # Rename disaster types conveniently. - df["type"] = df["type"].replace(EXPECTED_DISASTER_TYPES) - - # Drop rows for disaster types that are not relevant. - df = df.dropna(subset="type").reset_index(drop=True) - - return df - - -def sanity_checks_on_inputs(df: pd.DataFrame) -> None: - """Run sanity checks on input data.""" - error = "All values should be positive." - assert (df.select_dtypes("number").fillna(0) >= 0).all().all(), error - - error = "Column 'total_affected' should be the sum of columns 'injured', 'affected', and 'homeless'." - assert ( - df["total_affected"].fillna(0) >= df[["injured", "affected", "homeless"]].sum(axis=1).fillna(0) - ).all(), error - - error = "Natural disasters are not expected to last more than 9 years." - assert (df["end_year"] - df["start_year"]).max() < 10, error - - error = "Some of the columns that can't have nan do have one or more nans." - assert df[["country", "year", "type", "start_year", "end_year"]].notnull().all().all(), error - - for column in ["year", "start_year", "end_year"]: - error = f"Column '{column}' has a year prior to 1900 or posterior to current year." - assert 1900 < df[column].max() <= datetime.datetime.now().year, error - - error = "Some rows have end_day specified, but not end_month." - assert df[(df["end_month"].isnull()) & (df["end_day"].notnull())].empty, error - - -def fix_faulty_dtypes(df: pd.DataFrame) -> pd.DataFrame: - """Fix an issue related to column dtypes. - - Dividing a UInt32 by float64 results in a faulty Float64 that does not handle nans properly (which may be a bug: - https://github.com/pandas-dev/pandas/issues/49818). - To avoid this, there are various options: - 1. Convert all UInt32 into standard int before dividing by a float. But, if there are nans, int dtype is not valid. - 2. Convert all floats into Float64 before dividing. - 3. Convert all Float64 into float, after dividing. - - We adopt option 3. - - """ - df = df.astype({column: float for column in df[df.columns[df.dtypes == "Float64"]]}) - - return df - - -def harmonize_countries(df: pd.DataFrame) -> pd.DataFrame: - """Harmonize country names.""" - df = df.copy() - - # Harmonize country names. - df = geo.harmonize_countries( - df=df, countries_file=N.country_mapping_path, warn_on_missing_countries=True, warn_on_unused_countries=True - ) - - # Add Azores Islands to Portugal (so that we can attach a population to it). - df = df.replace({"Azores Islands": "Portugal"}) - # Add Canary Islands to Spain (so that we can attach a population to it). - df = df.replace({"Canary Islands": "Spain"}) - - return df - - -def calculate_start_and_end_dates(df: pd.DataFrame) -> pd.DataFrame: - """Calculate start and end dates of disasters. - - The original data had year, month and day of start and end, and some of those fields were missing. This function - deals with those missing fields and creates datetime columns for start and end of events. - - """ - df = df.copy() - - # When start month is not given, assume the beginning of the year. - df["start_month"] = df["start_month"].fillna(1) - # When start day is not given, assume the beginning of the month. - df["start_day"] = df["start_day"].fillna(1) - - # When end month is not given, assume the end of the year. - df["end_month"] = df["end_month"].fillna(12) - - # When end day is not given, assume the last day of the month. 
- last_day_of_month = pd.Series( - [get_last_day_of_month(year=row["end_year"], month=row["end_month"]) for i, row in df.iterrows()] - ) - df["end_day"] = df["end_day"].fillna(last_day_of_month) - - # Create columns for start and end dates. - df["start_date"] = ( - df["start_year"].astype(str) - + "-" - + df["start_month"].astype(str).str.zfill(2) - + "-" - + df["start_day"].astype(str).str.zfill(2) - ) - df["end_date"] = ( - df["end_year"].astype(str) - + "-" - + df["end_month"].astype(str).str.zfill(2) - + "-" - + df["end_day"].astype(str).str.zfill(2) - ) - - # Convert dates into datetime objects. - # Note: This may fail if one of the dates is wrong, e.g. September 31 (if so, check error message for row index). - df["start_date"] = pd.to_datetime(df["start_date"]) - df["end_date"] = pd.to_datetime(df["end_date"]) - - error = "Events can't have an end_date prior to start_date." - assert ((df["end_date"] - df["start_date"]).dt.days >= 0).all(), error - - # Drop unnecessary columns. - df = df.drop(columns=["start_year", "start_month", "start_day", "end_year", "end_month", "end_day"]) - - return df - - -def calculate_yearly_impacts(df: pd.DataFrame) -> pd.DataFrame: - """Equally distribute the impact of disasters lasting longer than one year among the individual years, as separate - events. - - Many disasters last more than one year. Therefore, we need to spread their impact among the different years. - Otherwise, if we assign the impact of a disaster to, say, the first year, we may overestimate the impacts on a - particular country-year. - Hence, for events that started and ended in different years, we distribute their impact equally across the - time spanned by the disaster. - - """ - df = df.copy() - - # There are many rows that have no data on impacts of disasters. - # I suppose those are known disasters for which we don't know the impact. - # Given that we want to count overall impact, fill them with zeros (to count them as disasters that had no victims). - df[IMPACT_COLUMNS] = df[IMPACT_COLUMNS].fillna(0) - - # Select rows of disasters that last more than one year. - multi_year_rows_mask = df["start_date"].dt.year != df["end_date"].dt.year - multi_year_rows = df[multi_year_rows_mask].reset_index(drop=True) - - # Go row by row, and create a new disaster event with the impact normalized by the fraction of days it happened - # in a specific year. - added_events = pd.DataFrame() - for i, row in multi_year_rows.iterrows(): - # Start dataframe for new event. - new_event = pd.DataFrame(row).transpose() - # Years spanned by the disaster. - years = np.arange(row["start_date"].year, row["end_date"].year + 1).tolist() - # Calculate the total number of days spanned by the disaster (and add 1 day to include the day of the end date). - days_total = (row["end_date"] + pd.DateOffset(1) - row["start_date"]).days - - for year in years: - if year == years[0]: - # Get number of days. - days_affected_in_year = (pd.Timestamp(year=year + 1, month=1, day=1) - row["start_date"]).days - # Fraction of days affected this year. - days_fraction = days_affected_in_year / days_total - # Impacts this year. - impacts = (row[IMPACT_COLUMNS] * days_fraction).astype(int) # type: ignore - # Start a series that counts the impacts accumulated over the years. - cumulative_impacts = impacts - # Normalize data by the number of days affected in this year. - new_event[IMPACT_COLUMNS] = impacts - # Correct dates.
- new_event["end_date"] = pd.Timestamp(year=year, month=12, day=31) - elif years[0] < year < years[-1]: - # The entire year was affected by the disaster. - # Note: Ignore leap years. - days_fraction = 365 / days_total - # Impacts this year. - impacts = (row[IMPACT_COLUMNS] * days_fraction).astype(int) # type: ignore - # Add impacts to the cumulative impacts series. - cumulative_impacts += impacts # type: ignore - # Normalize data by the number of days affected in this year. - new_event[IMPACT_COLUMNS] = impacts - # Correct dates. - new_event["start_date"] = pd.Timestamp(year=year, month=1, day=1) - new_event["end_date"] = pd.Timestamp(year=year, month=12, day=31) - else: - # Assign all remaining impacts to the last year. - impacts = row[IMPACT_COLUMNS] - cumulative_impacts # type: ignore - new_event[IMPACT_COLUMNS] = impacts - # Correct dates. - new_event["start_date"] = pd.Timestamp(year=year, month=1, day=1) - added_events = pd.concat([added_events, new_event], ignore_index=True) - - # Remove multi-year rows from main dataframe, and add those rows after separating events year by year. - yearly_df = pd.concat([df[~(multi_year_rows_mask)], added_events], ignore_index=True) # type: ignore - - # Sort conveniently. - yearly_df = yearly_df.sort_values(["country", "year", "type"]).reset_index(drop=True) - - return yearly_df - - -def get_total_count_of_yearly_impacts(df: pd.DataFrame) -> pd.DataFrame: - """Get the total count of impacts in the year, ignoring the individual events. - - We are not interested in each individual event, but the number of events of each kind and their impacts. - This function will produce the total count of impacts per country, year and type of disaster. - - """ - counts = ( - df.reset_index() - .groupby(["country", "year", "type"], observed=True) - .agg({"index": "count"}) - .reset_index() - .rename(columns={"index": "n_events"}) - ) - df = df.groupby(["country", "year", "type"], observed=True).sum(numeric_only=True, min_count=1).reset_index() - df = pd.merge(df, counts, on=["country", "year", "type"], how="left") - - return df - - -def create_a_new_type_for_all_disasters_combined(df: pd.DataFrame) -> pd.DataFrame: - """Add a new disaster type that has the impact of all other disasters combined.""" - all_disasters = ( - df.groupby(["country", "year"], observed=True) - .sum(numeric_only=True, min_count=1) - .assign(**{"type": ALL_DISASTERS_TYPE}) - .reset_index() - ) - df = ( - pd.concat([df, all_disasters], ignore_index=True) - .sort_values(["country", "year", "type"]) - .reset_index(drop=True) - ) - - return df - - -def add_population_including_historical_regions(df: pd.DataFrame) -> pd.DataFrame: - """Add population to the main dataframe, including the population of historical regions. - - For historical regions for which we do not have population data, we construct their population by adding the - population of their successor countries. This is done for countries in BUILD_POPULATION_FOR_HISTORICAL_COUNTRIES, - using the definition of regions in HISTORIC_TO_CURRENT_REGION. - For certain countries we have population data only for certain years (e.g. 1900, 1910, but not the years in - between). In those cases we interpolate population data. - - """ - df = df.copy() - - # Historical regions whose population we want to include. - historical_regions = { - region: HISTORIC_TO_CURRENT_REGION[region] - for region in HISTORIC_TO_CURRENT_REGION - if region in BUILD_POPULATION_FOR_HISTORICAL_COUNTRIES - } - # All regions whose population we want to include (i.e. 
continents and historical countries). - regions = dict(**REGIONS, **historical_regions) - - # Add population to main dataframe. - df = add_population( - df=df, - interpolate_missing_population=True, - warn_on_missing_countries=False, - regions=regions, - expected_countries_without_population=EXPECTED_COUNTRIES_WITHOUT_POPULATION, - ) - - return df - - -def create_additional_variables(df: pd.DataFrame, df_gdp: pd.DataFrame) -> pd.DataFrame: - """Create additional variables, namely damages per GDP, and impacts per 100,000 people.""" - # Combine natural disasters with GDP data. - df = pd.merge(df, df_gdp.rename(columns={"ny_gdp_mktp_cd": "gdp"}), on=["country", "year"], how="left") - # Prepare cost variables. - for variable in COST_VARIABLES: - # Convert costs (given in '000 US$, aka thousand current US$) into current US$. - df[variable] *= 1000 - # Create variables of costs (in current US$) as a share of GDP (in current US$). - df[f"{variable}_per_gdp"] = df[variable] / df["gdp"] * 100 - - # Add rates per 100,000 people. - for column in VARIABLES_PER_100K_PEOPLE: - df[f"{column}_per_100k_people"] = df[column] * 1e5 / df["population"] - - # Fix issue with faulty dtypes (see more details in the function's documentation). - df = fix_faulty_dtypes(df=df) - - return df - - -def create_decade_data(df: pd.DataFrame) -> pd.DataFrame: - """Create data of average impacts over periods of 10 years. - - For example (as explained in the footer of the natural disasters explorer), the value for 1900 of any column should - represent the average of that column between 1900 and 1909. - - """ - decade_df = df.copy() - - # Ensure each country has data for all years (and fill empty rows with zeros). - # Otherwise, the average would only be performed only across years for which we have data. - # For example, if we have data only for 1931 (and no other year in the 1930s) we want that data point to be averaged - # over all years in the decade (assuming they are all zero). - # Note that, for the current decade, since it's not complete, we want to average over the number of current years - # (not the entire decade). - - # List all countries, years and types in the data. - countries = sorted(set(decade_df["country"])) - years = np.arange(decade_df["year"].min(), decade_df["year"].max() + 1).tolist() - types = sorted(set(decade_df["type"])) - - # Create a new index covering all combinations of countries, years and types. - new_indexes = pd.MultiIndex.from_product([countries, years, types], names=["country", "year", "type"]) - - # Reindex data so that all countries and types have data for each year (filling with zeros when there's no data). - decade_df = decade_df.set_index(["country", "year", "type"]).reindex(new_indexes, fill_value=0).reset_index() - - # For each year, calculate the corresponding decade (e.g. 1951 -> 1950, 1929 -> 1920). - decade_df["decade"] = (decade_df["year"] // 10) * 10 - - # Group by that country-decade-type and get the mean for each column. - decade_df = ( - decade_df.drop(columns=["year"]) - .groupby(["country", "decade", "type"], observed=True) - .mean(numeric_only=True) - .reset_index() - .rename(columns={"decade": "year"}) - ) - - return decade_df - - -def sanity_checks_on_outputs(df: pd.DataFrame, is_decade: bool) -> None: - """Run sanity checks on output (yearly or decadal) data. - - Parameters - ---------- - df : pd.DataFrame - Output (yearly or decadal) data. - is_decade : bool - True if df is decadal data; False if it is yearly data. 
- - """ - # Common sanity checks for yearly and decadal data. - error = "All values should be positive." - assert (df.select_dtypes("number").fillna(0) >= 0).all().all(), error - - error = ( - "List of expected disaster types has changed. " - "Consider updating EXPECTED_DISASTER_TYPES (or renaming ALL_DISASTERS_TYPE)." - ) - expected_disaster_types = [ALL_DISASTERS_TYPE] + [ - catalog.utils.underscore(EXPECTED_DISASTER_TYPES[disaster]) - for disaster in EXPECTED_DISASTER_TYPES - if not pd.isna(EXPECTED_DISASTER_TYPES[disaster]) - ] - assert set(df["type"]) == set(expected_disaster_types), error - - columns_that_should_not_have_nans = [ - "country", - "year", - "type", - "total_dead", - "injured", - "affected", - "homeless", - "total_affected", - "reconstruction_costs", - "insured_damages", - "total_damages", - "n_events", - ] - error = "There are unexpected nans in data." - assert df[columns_that_should_not_have_nans].notnull().all(axis=1).all(), error - - # Sanity checks only for yearly data. - if not is_decade: - all_countries = sorted(set(df["country"]) - set(REGIONS) - set(HISTORIC_TO_CURRENT_REGION)) - - # Check that the aggregate of all countries and disasters leads to the same numbers we have for the world. - # This check would not pass when adding historical regions (since we know there are some overlaps between data - # from historical and successor countries). So check for a specific year. - year_to_check = 2022 - all_disasters_for_world = df[ - (df["country"] == "World") & (df["year"] == year_to_check) & (df["type"] == ALL_DISASTERS_TYPE) - ].reset_index(drop=True) - all_disasters_check = ( - df[(df["country"].isin(all_countries)) & (df["year"] == year_to_check) & (df["type"] != ALL_DISASTERS_TYPE)] - .groupby("year") - .sum(numeric_only=True) - .reset_index() - ) - - cols_to_check = [ - "total_dead", - "injured", - "affected", - "homeless", - "total_affected", - "reconstruction_costs", - "insured_damages", - "total_damages", - ] - error = f"Aggregate for the World in {year_to_check} does not coincide with the sum of all countries." - assert all_disasters_for_world[cols_to_check].equals(all_disasters_check[cols_to_check]), error - - error = "Column 'total_affected' should be the sum of columns 'injured', 'affected', and 'homeless'." - assert ( - df["total_affected"].fillna(0) >= df[["injured", "affected", "homeless"]].sum(axis=1).fillna(0) - ).all(), error - - # Another sanity check would be that certain disasters (e.g. an earthquake) cannot last for longer than 1 day. - # However, for some disasters we don't have exact day, or even exact month, just the year. - - # List of columns whose value should not be larger than population. - columns_to_inspect = [ - "total_dead", - "total_dead_per_100k_people", - ] - error = "One disaster should not be able to cause the death of the entire population of a country in one year." - for column in columns_to_inspect: - informed_rows = df[column].notnull() & df["population"].notnull() - assert (df[informed_rows][column] <= df[informed_rows]["population"]).all(), error - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load natural disasters dataset from meadow. - ds_meadow = catalog.Dataset(DATA_DIR / f"meadow/emdat/{MEADOW_VERSION}/natural_disasters") - # Get table from dataset. - tb_meadow = ds_meadow["natural_disasters"] - # Create a dataframe from the table. - df = pd.DataFrame(tb_meadow) - - # Load GDP from WorldBank WDI dataset. 
- ds_gdp = catalog.Dataset(WDI_DATASET_PATH) - # Load main table from WDI dataset, and select variable corresponding to GDP (in current US$). - tb_gdp = ds_gdp["wdi"][["ny_gdp_mktp_cd"]] - # Create a dataframe with GDP. - df_gdp = pd.DataFrame(tb_gdp).reset_index() - - # - # Process data. - # - # Prepare input data (and fix some known issues). - df = prepare_input_data(df=df) - - # Sanity checks. - sanity_checks_on_inputs(df=df) - - # Harmonize country names. - df = harmonize_countries(df=df) - - # Calculate start and end dates of disasters. - df = calculate_start_and_end_dates(df=df) - - # Distribute the impacts of disasters lasting longer than a year among separate yearly events. - df = calculate_yearly_impacts(df=df) - - # Get total count of impacts per year (regardless of the specific individual events during the year). - df = get_total_count_of_yearly_impacts(df=df) - - # Add a new category (or "type") corresponding to the total of all natural disasters. - df = create_a_new_type_for_all_disasters_combined(df=df) - - # Add region aggregates. - df = add_region_aggregates(data=df, index_columns=["country", "year", "type"]) - - # Add population including historical regions. - df = add_population_including_historical_regions(df=df) - - # Add damages per GDP, and rates per 100,000 people. - df = create_additional_variables(df=df, df_gdp=df_gdp) - - # Change disaster types to snake case, lower case. - df["type"] = [catalog.utils.underscore(value) for value in df["type"]] - - # Create data aggregated (using a simple mean) in intervals of 10 years. - decade_df = create_decade_data(df=df) - - # Run sanity checks on output yearly data. - sanity_checks_on_outputs(df=df, is_decade=False) - - # Run sanity checks on output decadal data. - sanity_checks_on_outputs(df=decade_df, is_decade=True) - - # Set an appropriate index to yearly data and sort conveniently. - df = df.set_index(["country", "year", "type"], verify_integrity=True).sort_index() - - # Set an appropriate index to decadal data and sort conveniently. - decade_df = decade_df.set_index(["country", "year", "type"], verify_integrity=True).sort_index() - - # - # Save outputs. - # - # Create new Garden dataset. - ds_garden = catalog.Dataset.create_empty(dest_dir, metadata=ds_meadow.metadata) - - # Ensure all column names are snake case, lower case. - tb_garden = catalog.Table(df, short_name="natural_disasters_yearly", underscore=True) - decade_tb_garden = catalog.Table(decade_df, short_name="natural_disasters_decadal", underscore=True) - - # Add tables to dataset - ds_garden.add(tb_garden) - ds_garden.add(decade_tb_garden) - - # Add metadata from yaml file. - ds_garden.update_metadata(N.metadata_path) - - # Save dataset - ds_garden.save() diff --git a/etl/steps/archive/garden/emdat/2022-11-24/shared.py b/etl/steps/archive/garden/emdat/2022-11-24/shared.py deleted file mode 100644 index fe4c0d33bb0..00000000000 --- a/etl/steps/archive/garden/emdat/2022-11-24/shared.py +++ /dev/null @@ -1,590 +0,0 @@ -import datetime -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union, cast - -import numpy as np -import pandas as pd -from owid import catalog -from structlog import get_logger - -from etl.data_helpers import geo -from etl.paths import DATA_DIR - -CURRENT_DIR = Path(__file__).parent - -log = get_logger() - -# Aggregate regions to add, following OWID definitions. -# Regions and income groups to create by aggregating contributions from member countries.
-# In the following dictionary, if nothing is stated, the region is supposed to be a default continent/income group. -# Otherwise, the dictionary can have "regions_included", "regions_excluded", "countries_included", and -# "countries_excluded". The aggregates will be calculated on the resulting countries. -REGIONS = { - # Default continents. - "Africa": {}, - "Asia": {}, - "Europe": {}, - "European Union (27)": {}, - "North America": {}, - "Oceania": {}, - "South America": {}, - "World": {}, - # Income groups. - "Low-income countries": {}, - "Upper-middle-income countries": {}, - "Lower-middle-income countries": {}, - "High-income countries": {}, -} - -# When creating region aggregates, decide how to distribute historical regions. -# The following decisions are based on the current location of the countries that succeeded the region, and their income -# group. The continent and income group assigned correspond to those of the majority of the -# population in the member countries. -HISTORIC_TO_CURRENT_REGION: Dict[str, Dict[str, Union[str, List[str]]]] = { - "Czechoslovakia": { - "continent": "Europe", - "income_group": "High-income countries", - "regions_included": [ - # Europe - High-income countries. - "Czechia", - "Slovakia", - ], - }, - "East Germany": { - "continent": "Europe", - "income_group": "", - "regions_included": [ - # Europe - High-income countries. - "Germany", - ], - }, - "West Germany": { - "continent": "Europe", - "income_group": "", - "regions_included": [ - # Europe - High-income countries. - "Germany", - ], - }, - "Netherlands Antilles": { - "continent": "North America", - "income_group": "High-income countries", - "regions_included": [ - # North America - High-income countries. - "Aruba", - "Curacao", - "Sint Maarten (Dutch part)", - "Bonaire Sint Eustatius and Saba", - ], - }, - "Serbia and Montenegro": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "regions_included": [ - # Europe - Upper-middle-income countries. - "Serbia", - "Montenegro", - ], - }, - "North Yemen": { - "continent": "Asia", - "income_group": "Low-income countries", - "regions_included": [ - # Asia - Low-income countries. - "Yemen", - ], - }, - "South Yemen": { - "continent": "Asia", - "income_group": "Low-income countries", - "regions_included": [ - # Asia - Low-income countries. - "Yemen", - ], - }, - "USSR": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "regions_included": [ - # Europe - High-income countries. - "Lithuania", - "Estonia", - "Latvia", - # Europe - Upper-middle-income countries. - "Moldova", - "Belarus", - "Russia", - # Europe - Lower-middle-income countries. - "Ukraine", - # Asia - Upper-middle-income countries. - "Georgia", - "Armenia", - "Azerbaijan", - "Turkmenistan", - "Kazakhstan", - # Asia - Lower-middle-income countries. - "Kyrgyzstan", - "Uzbekistan", - "Tajikistan", - ], - }, - "Yugoslavia": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "regions_included": [ - # Europe - High-income countries. - "Croatia", - "Slovenia", - # Europe - Upper-middle-income countries. - "North Macedonia", - "Bosnia and Herzegovina", - "Serbia", - "Montenegro", - ], - }, -} - -# Historical countries whose population can be built by adding up the population of their successor countries. -# Those historical countries not listed here will have no population data.
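-# For example, the population of the USSR (listed below) can be rebuilt by summing the populations of the successor
-# states given for it in HISTORIC_TO_CURRENT_REGION above, for the years in which all successors have data.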
-BUILD_POPULATION_FOR_HISTORICAL_COUNTRIES = [ - # The following regions split into smaller ones, and can be estimated by the population of the successors. - "Czechoslovakia", - "Netherlands Antilles", - "Serbia and Montenegro", - "USSR", - "Yugoslavia", - # The following countries cannot be replaced by the successor countries. - # 'East Germany', - # 'West Germany', - # 'North Yemen', - # 'South Yemen', -] - -# Historical countries for which we don't have population, and can't be built from successor countries. -EXPECTED_COUNTRIES_WITHOUT_POPULATION = list( - set(HISTORIC_TO_CURRENT_REGION) - set(BUILD_POPULATION_FOR_HISTORICAL_COUNTRIES) -) - -# Overlaps found between historical regions and successor countries that we accept in the data. -# We accept them either because they happened close to the transition, or to avoid needing to introduce new -# countries for which we do not have data (like the Russian Empire). -ACCEPTED_OVERLAPS = { - 1902: {"USSR", "Azerbaijan"}, - 1990: {"Tajikistan", "USSR"}, - 1991: {"Georgia", "USSR"}, -} - - -def get_countries_in_region( - region: str, region_modifications: Optional[Dict[str, Dict[str, List[str]]]] = None -) -> List[str]: - """Get countries in a region, both for known regions (e.g. "Africa") and custom ones (e.g. "Europe (excl. EU-27)"). - - Parameters - ---------- - region : str - Region name (e.g. "Africa", or "Europe (excl. EU-27)"). - region_modifications : dict or None - If None (or an empty dictionary), the region should be in OWID's countries-regions dataset. - If not None, it should be a dictionary with any (or all) of the following keys: - - "regions_included": List of regions whose countries will be included. - - "regions_excluded": List of regions whose countries will be excluded. - - "countries_included": List of additional individual countries to be included. - - "countries_excluded": List of additional individual countries to be excluded. - NOTE: All regions and countries defined in this dictionary should be in OWID's countries-regions dataset. - - Returns - ------- - countries : list - List of countries in the specified region. - - """ - if region_modifications is None: - region_modifications = {} - - # Check that the fields in the region_modifications dictionary are well defined. - expected_fields = ["regions_included", "regions_excluded", "countries_included", "countries_excluded"] - assert all([field in expected_fields for field in region_modifications]) - - # Get lists of regions whose countries will be included and excluded. - regions_included = region_modifications.get("regions_included", [region]) - regions_excluded = region_modifications.get("regions_excluded", []) - # Get lists of additional individual countries to include and exclude. - countries_included = region_modifications.get("countries_included", []) - countries_excluded = region_modifications.get("countries_excluded", []) - - # List countries from the list of regions included. - countries_set = set( - sum([geo.list_countries_in_region(region_included) for region_included in regions_included], []) - ) - - # Remove all countries from the list of regions excluded. - countries_set -= set( - sum([geo.list_countries_in_region(region_excluded) for region_excluded in regions_excluded], []) - ) - - # Add the list of individual countries to be included. - countries_set |= set(countries_included) - - # Remove the list of individual countries to be excluded. - countries_set -= set(countries_excluded)
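- # For example (a hypothetical call, not used elsewhere in this step), a custom region such as
- # "Europe (excl. EU-27)" could be built as:
- # get_countries_in_region(region="Europe", region_modifications={"regions_excluded": ["European Union (27)"]})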
- # Convert set of countries into a sorted list. - countries = sorted(countries_set) - - return countries - - -def load_population(regions: Optional[Dict[Any, Any]] = None) -> pd.DataFrame: - """Load OWID population dataset, and add historical regions to it. - - Returns - ------- - population : pd.DataFrame - Population dataset. - - """ - # Load population dataset. - population = catalog.Dataset(DATA_DIR / "garden/owid/latest/key_indicators/")["population"].reset_index()[ - ["country", "year", "population"] - ] - - # Add data for historical regions (if not in population) by adding the population of their current successors. - countries_with_population = population["country"].unique() - - # Consider additional regions (e.g. historical regions). - if regions is None: - regions = {} - missing_countries = [country for country in regions if country not in countries_with_population] - for country in missing_countries: - members = regions[country]["regions_included"] - _population = ( - population[population["country"].isin(members)] - .groupby("year") - .agg({"population": "sum", "country": "nunique"}) - .reset_index() - ) - # Select only years for which we have data for all member countries. - _population = _population[_population["country"] == len(members)].reset_index(drop=True) - _population["country"] = country - population = pd.concat([population, _population], ignore_index=True).reset_index(drop=True) - - error = "Duplicate country-years found in population. Check if historical regions changed." - assert population[population.duplicated(subset=["country", "year"])].empty, error - - return cast(pd.DataFrame, population) - - -def load_income_groups() -> pd.DataFrame: - """Load dataset of income groups and add historical regions to it. - - Returns - ------- - income_groups : pd.DataFrame - Income groups data. - - """ - # Load the WorldBank dataset for income groups. - income_groups = catalog.Dataset(DATA_DIR / "garden/wb/2021-07-01/wb_income")["wb_income_group"].reset_index() - - # Add historical regions to income groups. - for historic_region in HISTORIC_TO_CURRENT_REGION: - historic_region_income_group = HISTORIC_TO_CURRENT_REGION[historic_region]["income_group"] - if historic_region not in income_groups["country"].values: - historic_region_df = pd.DataFrame( - { - "country": [historic_region], - "income_group": [historic_region_income_group], - } - ) - income_groups = pd.concat([income_groups, historic_region_df], ignore_index=True) - - return cast(pd.DataFrame, income_groups) - - -def add_population( - df: pd.DataFrame, - country_col: str = "country", - year_col: str = "year", - population_col: str = "population", - interpolate_missing_population: bool = False, - warn_on_missing_countries: bool = True, - show_full_warning: bool = True, - regions: Optional[Dict[Any, Any]] = None, - expected_countries_without_population: List[str] = [], -) -> pd.DataFrame: - """Add a column of OWID population to the countries in the data, including population of historical regions. - - This function has been adapted from datautils.geo, because population currently does not include historic regions. - We include them in this function. - - Parameters - ---------- - df : pd.DataFrame - Data without a column for population (after harmonizing elements, items and country names). - country_col : str - Name of country column in data. - year_col : str - Name of year column in data. - population_col : str - Name for new population column in data.
- interpolate_missing_population : bool - True to linearly interpolate population for years that are present in df, but for which we do not have - population data; otherwise False to keep missing population data as nans. - For example, if interpolate_missing_population is True and df has data for all years between 1900 and 1910, - but population is only given for 1900 and 1910, population will be linearly interpolated between those years. - warn_on_missing_countries : bool - True to warn if population is not found for any of the countries in the data. - show_full_warning : bool - True to show affected countries if the previous warning is raised. - regions : dict - Definitions of regions whose population also needs to be included. - expected_countries_without_population : list - Countries that are expected to not have population (that should be ignored if warnings are activated). - - Returns - ------- - df_with_population : pd.DataFrame - Data after adding a column for population for all countries in the data. - - """ - - # Load population dataset. - population = load_population(regions=regions).rename( - columns={ - "country": country_col, - "year": year_col, - "population": population_col, - } - )[[country_col, year_col, population_col]] - - # Check if there is any missing country. - missing_countries = set(df[country_col]) - set(population[country_col]) - if len(missing_countries) > 0: - if warn_on_missing_countries: - geo.warn_on_list_of_entities( - list_of_entities=missing_countries, - warning_message=( - f"{len(missing_countries)} countries not found in population" - " dataset. They will remain in the dataset, but have nan" - " population." - ), - show_list=show_full_warning, - ) - - if interpolate_missing_population: - # For some countries we have population data only for certain years, e.g. 1900, 1910, etc. - # Optionally fill missing years linearly. - countries_in_data = df[country_col].unique() - years_in_data = df[year_col].unique() - - population = population.set_index([country_col, year_col]).reindex( - pd.MultiIndex.from_product([countries_in_data, years_in_data], names=[country_col, year_col]) - ) - - population = population.groupby(country_col).transform( - lambda x: x.interpolate(method="linear", limit_direction="both") - ) - - error = "Countries without population data differ from the list of expected countries without population data." - assert set(population[population[population_col].isnull()].reset_index()[country_col]) == set( - expected_countries_without_population - ), error - - # Add population to original dataframe. - df_with_population = pd.merge(df, population, on=[country_col, year_col], how="left") - - return df_with_population - - -def detect_overlapping_regions( - df, index_columns, region_and_members, country_col="country", year_col="year", ignore_zeros=True -): - """Detect years in which the data for two regions overlap, e.g. a historical region and one of its successors. - - Parameters - ---------- - df : pd.DataFrame - Data (with a dummy index). - index_columns : list - Names of index columns. - region_and_members : dict - Regions to check for overlaps. Each region must have a dictionary "regions_included", listing the subregions - contained. If the region is historical, "regions_included" would be the list of successor countries. - country_col : str, optional - Name of country column (usually "country"). - year_col : str, optional - Name of year column (usually "year"). - ignore_zeros : bool, optional - True to ignore overlaps of zeros.
- - Returns - ------- - all_overlaps : dict - All overlaps found. - - """ - # Sum the data over all dimensions other than country and year, to get the total of each column for each country-year. - df_total = ( - df.groupby([country_col, year_col]) - .agg({column: "sum" for column in df.columns if column not in index_columns}) - .reset_index() - ) - # Create a list of values that will be ignored in overlaps (usually zero or nothing). - if ignore_zeros: - overlapping_values_to_ignore = [0] - else: - overlapping_values_to_ignore = [] - # List all variables in data (ignoring index columns). - variables = [column for column in df.columns if column not in index_columns] - # List all country names found in data. - countries_in_data = df[country_col].unique().tolist() - # List all regions found in data. - regions = [country for country in list(region_and_members) if country in countries_in_data] - # Initialize a dictionary that will store all overlaps found. - all_overlaps = {} - for region in regions: - # List members of current region. - members = [member for member in region_and_members[region]["regions_included"] if member in countries_in_data] - for member in members: - # Select data for current region. - region_values = ( - df_total[df_total[country_col] == region] - .replace(overlapping_values_to_ignore, np.nan) - .dropna(subset=variables, how="all") - ) - # Select data for current member. - member_values = ( - df_total[df_total[country_col] == member] - .replace(overlapping_values_to_ignore, np.nan) - .dropna(subset=variables, how="all") - ) - # Concatenate both selections of data, and select duplicated rows. - combined = pd.concat([region_values, member_values]) - overlaps = combined[combined.duplicated(subset=[year_col], keep=False)] # type: ignore - if len(overlaps) > 0: - # Add the overlap found to the dictionary of all overlaps. - all_overlaps.update({year: set(overlaps[country_col]) for year in overlaps[year_col].unique()}) - - # Sort overlaps conveniently. - all_overlaps = {year: all_overlaps[year] for year in sorted(list(all_overlaps))} - - return all_overlaps - - -def add_region_aggregates( - data: pd.DataFrame, - index_columns: List[str], - country_column: str = "country", - aggregates: Optional[Dict[str, str]] = None, -) -> pd.DataFrame: - """Add region aggregates for all regions (which may include continents and income groups). - - Parameters - ---------- - data : pd.DataFrame - Data. - index_columns : list - Names of index columns. - country_column : str - Name of country column. - aggregates : dict or None - Dictionary of type of aggregation to use for each variable. If None, variables will be aggregated by summing. - - Returns - ------- - data : pd.DataFrame - Data after adding aggregate regions. - - """ - data = data.copy() - - all_overlaps = detect_overlapping_regions( - df=data, region_and_members=HISTORIC_TO_CURRENT_REGION, index_columns=index_columns - ) - - # Check whether all accepted overlaps are found in the data, and that there are no new unknown overlaps. - error = "Either the list of accepted overlaps is not found in the data, or there are new unknown overlaps." - assert ACCEPTED_OVERLAPS == all_overlaps, error - - if aggregates is None: - # If aggregations are not specified, assume all variables are to be aggregated, by summing. - aggregates = {column: "sum" for column in data.columns if column not in index_columns} - - for region in REGIONS: - # List of countries in region.
- countries_in_region = get_countries_in_region(region=region, region_modifications=REGIONS[region]) - # Select rows of data for member countries. - data_region = data[data[country_column].isin(countries_in_region)] - - # Add region aggregates. - region_df = ( - data_region.groupby([column for column in index_columns if column != country_column]) - .sum(numeric_only=True) - .reset_index() - .assign(**{country_column: region}) - ) - data = pd.concat([data, region_df], ignore_index=True) # type: ignore - - return data - - -def get_last_day_of_month(year: int, month: int): - """Get the number of days in a specific month of a specific year. - - Parameters - ---------- - year : int - Year. - month : int - Month. - - Returns - ------- - last_day - Number of days in month. - - """ - if month == 12: - last_day = 31 - else: - last_day = (datetime.datetime.strptime(f"{year:04}-{month + 1:02}", "%Y-%m") + datetime.timedelta(days=-1)).day - - return last_day - - -def correct_data_points(df: pd.DataFrame, corrections: List[Tuple[Dict[Any, Any], Dict[Any, Any]]]) -> pd.DataFrame: - """Make individual corrections to data points in a dataframe. - - Parameters - ---------- - df : pd.DataFrame - Data to be corrected. - corrections : List[Tuple[Dict[Any, Any], Dict[Any, Any]]] - Corrections. - - Returns - ------- - corrected_df : pd.DataFrame - Corrected data. - - """ - corrected_df = df.copy() - - for correction in corrections: - wrong_row, corrected_row = correction - - # Select the row in the dataframe where the wrong data point is. - # The 'fillna(False)' is added because otherwise rows that do not fulfil the selection will create ambiguity. - selection = corrected_df.loc[(corrected_df[list(wrong_row)] == pd.Series(wrong_row)).fillna(False).all(axis=1)] - # Sanity check. - error = "Either raw data has been corrected, or dictionary selecting wrong row is ambiguous." - assert len(selection) == 1, error - - # Replace wrong fields by the corrected ones. - # Note: Changes to categorical fields will not work. - corrected_df.loc[selection.index, list(corrected_row)] = list(corrected_row.values()) - - return corrected_df diff --git a/etl/steps/archive/garden/emissions/2023-05-02/national_contributions.meta.yml b/etl/steps/archive/garden/emissions/2023-05-02/national_contributions.meta.yml deleted file mode 100644 index d8a22e4b4da..00000000000 --- a/etl/steps/archive/garden/emissions/2023-05-02/national_contributions.meta.yml +++ /dev/null @@ -1,330 +0,0 @@ -all_sources: - - source_jones: &source-jones - name: Our World in Data based on Jones et al. (2023) - published_by: | - Our World in Data based on Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). National contributions to climate change due to historical emissions of carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data (2023.1). Zenodo. - url: https://zenodo.org/record/7636699#.ZFCy4exBweZ - date_accessed: "2023-05-02" - publication_date: "2023-02-13" - -dataset: - title: National contributions to climate change (Jones et al. (2023), 2023) - description: | - Jones et al. (2023) quantify national and regional contributions to the increase of global mean surface temperature over the last few centuries. - - Original dataset description by Jones et al. 
(2023): - A dataset describing the global warming response to national emissions of CO2, CH4 and N2O from fossil and land use sources during 1851-2021. - - National CO2 emissions data are collated from the Global Carbon Project (Andrew and Peters, 2022; Friedlingstein et al., 2022). - - National CH4 and N2O emissions data are collated from PRIMAP-hist (HISTTP) (Gütschow et al., 2022). - - We construct a time series of cumulative CO2-equivalent emissions for each country, gas, and emissions source (fossil or land use). Emissions of CH4 and N2O are related to cumulative CO2-equivalent emissions using the Global Warming Potential (GWP*) approach, with best-estimates of the coefficients taken from the IPCC AR6 (Forster et al., 2021). - - Warming in response to cumulative CO2-equivalent emissions is estimated using the transient climate response to cumulative carbon emissions (TCRE) approach, with best-estimate value of TCRE taken from the IPCC AR6 (Forster et al., 2021, Canadell et al., 2021). 'Warming' is specifically the change in global mean surface temperature (GMST). - sources: - - *source-jones - -tables: - national_contributions: - variables: - annual_emissions_ch4_fossil: - title: Annual methane emissions from fossil fuels and industry - unit: tonnes - short_unit: t - description: | - Annual methane emissions from fossil fuels and industry. - annual_emissions_ch4_land: - title: Annual methane emissions from agriculture and land use - unit: tonnes - short_unit: t - description: | - Annual methane emissions from agriculture and land use. - annual_emissions_ch4_total: - title: Annual methane emissions - unit: tonnes - short_unit: t - description: | - Annual methane emissions. - annual_emissions_co2_fossil: - title: Annual CO2 emissions from fossil fuels and industry - unit: tonnes - short_unit: t - description: | - Annual CO2 emissions from fossil fuels and industry. - annual_emissions_co2_total: - title: Annual CO2 emissions - unit: tonnes - short_unit: t - description: | - Annual CO2 emissions. - annual_emissions_n2o_fossil: - title: Annual nitrous oxide emissions from fossil fuels and industry - unit: tonnes - short_unit: t - description: | - Annual nitrous oxide emissions from fossil fuels and industry. - annual_emissions_n2o_land: - title: Annual nitrous oxide emissions from agriculture and land use - unit: tonnes - short_unit: t - description: | - Annual nitrous oxide emissions from agriculture and land use. - annual_emissions_n2o_total: - title: Annual nitrous oxide emissions - unit: tonnes - short_unit: t - description: | - Annual nitrous oxide emissions. - annual_emissions_co2_land: - title: Annual CO2 emissions from agriculture and land use - unit: tonnes - short_unit: t - description: | - Annual CO2 emissions from agriculture and land use. - cumulative_emissions_ghg_fossil: - title: Cumulative greenhouse gas emissions from fossil fuels and industry - unit: tonnes - short_unit: t - description: | - Cumulative greenhouse gas emissions from fossil fuels and industry. - cumulative_emissions_ghg_land: - title: Cumulative greenhouse gas emissions from agriculture and land use - unit: tonnes - short_unit: t - description: | - Cumulative greenhouse gas emissions from agriculture and land use. - cumulative_emissions_ghg_total: - title: Cumulative greenhouse gas emissions - unit: tonnes - short_unit: t - description: | - Cumulative greenhouse gas emissions.
- cumulative_emissions_ch4_fossil: - title: Cumulative methane emissions from fossil fuels and industry - unit: tonnes - short_unit: t - description: | - Cumulative methane emissions from fossil fuels and industry. - cumulative_emissions_ch4_land: - title: Cumulative methane emissions from agriculture and land use - unit: tonnes - short_unit: t - description: | - Cumulative methane emissions from agriculture and land use. - cumulative_emissions_ch4_total: - title: Cumulative methane emissions - unit: tonnes - short_unit: t - description: | - Cumulative methane emissions. - cumulative_emissions_co2_fossil: - title: Cumulative CO2 emissions from fossil fuels and industry - unit: tonnes - short_unit: t - description: | - Cumulative CO2 emissions from fossil fuels and industry. - cumulative_emissions_co2_land: - title: Cumulative CO2 emissions from agriculture and land use - unit: tonnes - short_unit: t - description: | - Cumulative CO2 emissions from agriculture and land use. - cumulative_emissions_co2_total: - title: Cumulative CO2 emissions - unit: tonnes - short_unit: t - description: | - Cumulative CO2 emissions. - cumulative_emissions_n2o_fossil: - title: Cumulative nitrous oxide emissions from fossil fuels and industry - unit: tonnes - short_unit: t - description: | - Cumulative nitrous oxide emissions from fossil fuels and industry. - cumulative_emissions_n2o_land: - title: Cumulative nitrous oxide emissions from agriculture and land use - unit: tonnes - short_unit: t - description: | - Cumulative nitrous oxide emissions from agriculture and land use. - cumulative_emissions_n2o_total: - title: Cumulative nitrous oxide emissions - unit: tonnes - short_unit: t - description: | - Cumulative nitrous oxide emissions. - temperature_response_ghg_fossil: - title: Change in global mean surface temperature caused by greenhouse gas emissions from fossil fuels and industry - unit: °C - short_unit: °C - description: | - Change in global mean surface temperature (in °C) caused by greenhouse gas emissions from fossil fuels and industry. - temperature_response_ghg_land: - title: Change in global mean surface temperature caused by greenhouse gas emissions from agriculture and land use - unit: °C - short_unit: °C - description: | - Change in global mean surface temperature (in °C) caused by greenhouse gas emissions from agriculture and land use. - temperature_response_ghg_total: - title: Change in global mean surface temperature caused by greenhouse gas emissions - unit: °C - short_unit: °C - description: | - Change in global mean surface temperature (in °C) caused by greenhouse gas emissions. This measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide, methane and nitrous oxide. The warming effects of each gas are calculated based on cumulative CO2-equivalent emissions using the Global Warming Potential (GWP*) approach. - temperature_response_ch4_fossil: - title: Change in global mean surface temperature caused by methane emissions from fossil fuels and industry - unit: °C - short_unit: °C - description: | - Change in global mean surface temperature (in °C) caused by methane emissions from fossil fuels and industry. - temperature_response_ch4_land: - title: Change in global mean surface temperature caused by methane emissions from agriculture and land use - unit: °C - short_unit: °C - description: | - Change in global mean surface temperature (in °C) caused by methane emissions from agriculture and land use. 
- temperature_response_ch4_total: - title: Change in global mean surface temperature caused by methane emissions - unit: °C - short_unit: °C - description: | - Change in global mean surface temperature (in °C) caused by methane emissions. This measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of methane. The warming effects of each gas are calculated based on cumulative CO2-equivalent emissions using the Global Warming Potential (GWP*) approach. - temperature_response_co2_fossil: - title: Change in global mean surface temperature caused by CO2 emissions from fossil fuels and industry - unit: °C - short_unit: °C - description: | - Change in global mean surface temperature (in °C) caused by CO2 emissions from fossil fuels and industry. - temperature_response_co2_land: - title: Change in global mean surface temperature caused by CO2 emissions from agriculture and land use - unit: °C - short_unit: °C - description: | - Change in global mean surface temperature (in °C) caused by CO2 emissions from agriculture and land use. - temperature_response_co2_total: - title: Change in global mean surface temperature caused by CO2 emissions - unit: °C - short_unit: °C - description: | - Change in global mean surface temperature (in °C) caused by CO2 emissions. This measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide. The warming effects of each gas are calculated based on cumulative CO2-equivalent emissions using the Global Warming Potential (GWP*) approach. - temperature_response_n2o_fossil: - title: Change in global mean surface temperature caused by nitrous oxide emissions from fossil fuels and industry - unit: °C - short_unit: °C - description: | - Change in global mean surface temperature (in °C) caused by nitrous oxide emissions from fossil fuels and industry. - temperature_response_n2o_land: - title: Change in global mean surface temperature caused by nitrous oxide emissions from agriculture and land use - unit: °C - short_unit: °C - description: | - Change in global mean surface temperature (in °C) caused by nitrous oxide emissions from agriculture and land use. - temperature_response_n2o_total: - title: Change in global mean surface temperature caused by nitrous oxide emissions - unit: °C - short_unit: °C - description: | - Change in global mean surface temperature (in °C) caused by nitrous oxide emissions. This measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative nitrous oxide emissions. The warming effects of each gas are calculated based on cumulative CO2-equivalent emissions using the Global Warming Potential (GWP*) approach. - annual_emissions_ch4_fossil_co2eq: - title: Annual methane emissions from fossil fuels and industry in CO2 equivalents - unit: tonnes - short_unit: t - description: | - Annual methane emissions from fossil fuels and industry in CO2 equivalents. Methane emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC conversion factors. Jones et al. (2023) give methane emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 29.8 for fossil sources, and 27.2 for agricultural and land use sources (as per the IPCC AR6 report). 
- annual_emissions_ch4_land_co2eq: - title: Annual methane emissions from agriculture and land use in CO2 equivalents - unit: tonnes - short_unit: t - description: | - Annual methane emissions from agriculture and land use in CO2 equivalents. Methane emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC conversion factors. Jones et al. (2023) give methane emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 29.8 for fossil sources, and 27.2 for agricultural and land use sources (as per the IPCC AR6 report). - annual_emissions_ch4_total_co2eq: - title: Annual methane emissions in CO2 equivalents - unit: tonnes - short_unit: t - description: | - Annual methane emissions in CO2 equivalents. Methane emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC conversion factors. Jones et al. (2023) give methane emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 29.8 for fossil sources, and 27.2 for agricultural and land use sources (as per the IPCC AR6 report). - annual_emissions_n2o_fossil_co2eq: - title: Annual nitrous oxide emissions from fossil fuels and industry in CO2 equivalents - unit: tonnes - short_unit: t - description: | - Annual nitrous oxide emissions from fossil fuels and industry in CO2 equivalents. Nitrous oxide emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 (as per the IPCC AR6 report). - annual_emissions_n2o_land_co2eq: - title: Annual nitrous oxide emissions from agriculture and land use in CO2 equivalents - unit: tonnes - short_unit: t - description: | - Annual nitrous oxide emissions from agriculture and land use in CO2 equivalents. Nitrous oxide emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 (as per the IPCC AR6 report). - annual_emissions_n2o_total_co2eq: - title: Annual nitrous oxide emissions in CO2 equivalents - unit: tonnes - short_unit: t - description: | - Annual nitrous oxide emissions in CO2 equivalents. Nitrous oxide emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 (as per the IPCC AR6 report). - annual_emissions_ghg_fossil_co2eq: - title: Annual greenhouse gas emissions from fossil fuels and industry in CO2 equivalents - unit: tonnes - short_unit: t - description: | - Annual greenhouse gas emissions from fossil fuels and industry in CO2 equivalents. Greenhouse gas emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. 
Jones et al. (2023) give methane and nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 for nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane from agricultural and land use sources (as per the IPCC AR6 report). - annual_emissions_ghg_land_co2eq: - title: Annual greenhouse gas emissions from agriculture and land use in CO2 equivalents - unit: tonnes - short_unit: t - description: | - Annual greenhouse gas emissions from agriculture and land use in CO2 equivalents. Greenhouse gas emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give methane and nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 for nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane from agricultural and land use sources (as per the IPCC AR6 report). - annual_emissions_ghg_total_co2eq: - title: Annual greenhouse gas emissions in CO2 equivalents - unit: tonnes - short_unit: t - description: | - Annual greenhouse gas emissions in CO2 equivalents. Greenhouse gas emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give methane and nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 for nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane from agricultural and land use sources (as per the IPCC AR6 report). - share_of_annual_emissions_ch4_total: - title: Share of global methane emissions - unit: "%" - short_unit: "%" - description: | - Share of global methane emissions. Methane emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC conversion factors. Jones et al. (2023) give methane emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 29.8 for fossil sources, and 27.2 for agricultural and land use sources (as per the IPCC AR6 report). - share_of_annual_emissions_co2_total: - title: Share of global CO2 emissions - unit: "%" - short_unit: "%" - description: | - Share of global CO2 emissions. - share_of_annual_emissions_ghg_total_co2eq: - title: Share of global greenhouse gas emissions - unit: "%" - short_unit: "%" - description: | - Share of global greenhouse gas emissions. Greenhouse gas emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give methane and nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 for nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane from agricultural and land use sources (as per the IPCC AR6 report). - share_of_annual_emissions_n2o_total: - title: Share of global nitrous oxide emissions - unit: "%" - short_unit: "%" - description: | - Share of global nitrous oxide emissions. 
Nitrous oxide emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 (as per the IPCC AR6 report). - share_of_temperature_response_ghg_total: - title: Share of contribution to global warming - unit: "%" - short_unit: "%" - description: | - Share of contribution to global warming (as a percentage). This measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide, methane and nitrous oxide. The warming effects of each gas are calculated based on cumulative CO2-equivalent emissions using the Global Warming Potential (GWP*) approach. - annual_emissions_ch4_total_co2eq_per_capita: - title: Per-capita methane emissions in CO2 equivalents - unit: tonnes - short_unit: t - description: | - Per-capita methane emissions in CO2 equivalents. Methane emissions per person are calculated by Our World in Data based on emissions data from Jones et al. (2023) and population data from HYDE and the UN World Population Prospects. Jones et al. (2023) give methane emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 29.8 for fossil sources, and 27.2 for agricultural and land use sources (as per the IPCC AR6 report). - annual_emissions_n2o_total_co2eq_per_capita: - title: Per-capita nitrous oxide emissions in CO2 equivalents - unit: tonnes - short_unit: t - description: | - Per-capita nitrous oxide emissions in CO2 equivalents. Nitrous oxide emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 (as per the IPCC AR6 report). - annual_emissions_ghg_total_co2eq_per_capita: - title: Per-capita greenhouse gas emissions in CO2 equivalents - unit: tonnes - short_unit: t - description: | - Per-capita greenhouse gas emissions in CO2 equivalents. Greenhouse gas emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give methane and nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 for nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane from agricultural and land use sources (as per the IPCC AR6 report). 
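The CO2-equivalent conversions quoted throughout these descriptions are plain multiplications by the IPCC AR6 factors (29.8 for fossil methane, 27.2 for agricultural and land-use methane, 273 for nitrous oxide), and the temperature-response variables then scale cumulative CO2-equivalent emissions by a TCRE coefficient. A minimal sketch of the emissions conversion, assuming inputs in tonnes of gas per year (the helper name is illustrative; the deleted step below defines the same factors as module constants):

# Illustrative sketch (not part of the deleted step): convert annual CH4 and N2O emissions
# (in tonnes of gas) into tonnes of CO2-equivalents, using the IPCC AR6 factors quoted above.
CH4_FOSSIL_TO_CO2EQ = 29.8  # Methane from fossil sources.
CH4_LAND_TO_CO2EQ = 27.2  # Methane from agricultural and land-use sources.
N2O_TO_CO2EQ = 273  # Nitrous oxide.


def to_co2eq(ch4_fossil: float, ch4_land: float, n2o: float) -> float:
    """Return total emissions in tonnes of CO2-eq over a 100-year timescale."""
    return ch4_fossil * CH4_FOSSIL_TO_CO2EQ + ch4_land * CH4_LAND_TO_CO2EQ + n2o * N2O_TO_CO2EQ


# For example, 1000 tonnes of fossil methane alone correspond to 29,800 tonnes of CO2-eq.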
diff --git a/etl/steps/archive/garden/emissions/2023-05-02/national_contributions.py b/etl/steps/archive/garden/emissions/2023-05-02/national_contributions.py deleted file mode 100644 index 6bf1477b751..00000000000 --- a/etl/steps/archive/garden/emissions/2023-05-02/national_contributions.py +++ /dev/null @@ -1,344 +0,0 @@ -"""Load a meadow dataset and create a garden dataset.""" - -import pandas as pd -from owid.catalog import Dataset, Table -from owid.datautils.dataframes import map_series -from structlog import get_logger - -from etl.data_helpers import geo -from etl.helpers import PathFinder, create_dataset - -log = get_logger() - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Minimum year to consider. -# There is data from 1830 for some variables and from 1850 for others. -# However, when inspecting data between 1830 and 1850 (e.g. total CO2 emissions) it seems there is an abrupt jump -# between 1849 and 1850, which happens for many countries. -# This jump seems to be spurious, and therefore we start all time series from 1850. -YEAR_MIN = 1850 - -# Conversion factor to change from teragrams to tonnes. -TERAGRAMS_TO_TONNES = 1e6 -# Conversion factor to change from petagrams to tonnes. -PETAGRAMS_TO_TONNES = 1e9 - -# Conversion factors to change from tonnes of gases emitted to tonnes of CO2 equivalents (taken from IPCC AR6). -CH4_FOSSIL_EMISSIONS_TO_CO2_EQUIVALENTS = 29.8 -CH4_LAND_EMISSIONS_TO_CO2_EQUIVALENTS = 27.2 -N2O_EMISSIONS_TO_CO2_EQUIVALENTS = 273 - -# Gases and components expected to be in the data, and how to rename them. -GASES_RENAMING = { - "3-GHG": "ghg", - "CH[4]": "ch4", - "CO[2]": "co2", - "N[2]*O": "n2o", -} -COMPONENTS_RENAMING = { - "Fossil": "fossil", - "LULUCF": "land", - "Total": "total", -} - -# Columns for which we will create "share" variables (percentage with respect to global). -SHARE_VARIABLES = [ - "annual_emissions_ch4_total", - "annual_emissions_co2_total", - "annual_emissions_n2o_total", - "annual_emissions_ghg_total_co2eq", - "temperature_response_ghg_total", -] - -# Columns for which a per-capita variable will be created. -PER_CAPITA_VARIABLES = [ - "annual_emissions_ch4_total_co2eq", - "annual_emissions_n2o_total_co2eq", - "annual_emissions_ghg_total_co2eq", -] - -# Regions to be added by aggregating data from their member countries. -REGIONS = { - # Default continents. - "Africa": {}, - "Asia": {}, - "Europe": {}, - "North America": {}, - "Oceania": {}, - "South America": {}, - # Income groups. - "Low-income countries": {}, - "Upper-middle-income countries": {}, - "Lower-middle-income countries": {}, - "High-income countries": {}, - # Additional composite regions. - "Asia (excl. China and India)": { - "additional_regions": ["Asia"], - "excluded_members": ["China", "India"], - }, - "Europe (excl. EU-27)": {"additional_regions": ["Europe"], "excluded_regions": ["European Union (27)"]}, - "Europe (excl. EU-28)": { - "additional_regions": ["Europe"], - "excluded_regions": ["European Union (27)"], - "excluded_members": ["United Kingdom"], - }, - "European Union (28)": { - "additional_regions": ["European Union (27)"], - "additional_members": ["United Kingdom"], - }, - "North America (excl. USA)": { - "additional_regions": ["North America"], - "excluded_members": ["United States"], - }, - # EU27 is already included in the original data. - # "European Union (27)": {}, -} - - -def run_sanity_checks_on_inputs(df): - # Sanity checks. - error = "Names of gases have changed." 
- assert set(df["gas"]) == set(GASES_RENAMING), error - error = "Names of components have changed." - assert set(df["component"]) == set(COMPONENTS_RENAMING), error - error = "Units have changed." - assert set(df["unit"]) == set( - ["Tg~CH[4]~year^-1", "Pg~CO[2]~year^-1", "Tg~N[2]*O~year^-1", "Pg~CO[2]*-e[100]", "°C"] - ), error - - -def add_kuwaiti_oil_fires_to_kuwait(df: pd.DataFrame) -> pd.DataFrame: - df = df.copy() - - # NOTE: Use this function before harmonizing country names. Otherwise adapt the following definitions. - kuwait = "Kuwait" - oil_fires = "Kuwaiti Oil Fires" - - # Sanity check. - error = f"'{kuwait}' or '{oil_fires}' not found in the data." - assert kuwait in set(df["country"]), error - assert oil_fires in set(df["country"]), error - - # Add the emissions from the Kuwaiti oil fires (in 1991) to Kuwait. - df_kuwait = df[df["country"] == kuwait].drop(columns="country").set_index("year") - df_oil_fires = df[df["country"] == oil_fires].drop(columns="country").fillna(0).set_index(["year"]) - df_combined = (df_kuwait + df_oil_fires).reset_index().assign(**{"country": kuwait}) - - # Replace the origina data for Kuwait by the combined data. - df_updated = pd.concat([df[df["country"] != kuwait].reset_index(drop=True), df_combined], ignore_index=True) - - # Sort conveniently. - df_updated = df_updated.sort_values(["country", "year"]).reset_index(drop=True) - - return df_updated - - -def add_emissions_in_co2_equivalents(df: pd.DataFrame) -> pd.DataFrame: - # Add columns for fossil/land/total emissions of CH4 in terms of CO2 equivalents. - df["annual_emissions_ch4_fossil_co2eq"] = ( - df["annual_emissions_ch4_fossil"] * CH4_FOSSIL_EMISSIONS_TO_CO2_EQUIVALENTS - ) - df["annual_emissions_ch4_land_co2eq"] = df["annual_emissions_ch4_land"] * CH4_LAND_EMISSIONS_TO_CO2_EQUIVALENTS - df["annual_emissions_ch4_total_co2eq"] = ( - df["annual_emissions_ch4_fossil_co2eq"] + df["annual_emissions_ch4_land_co2eq"] - ) - - # Add columns for fossil/land/total emissions of N2O in terms of CO2 equivalents. - for component in ["fossil", "land", "total"]: - df[f"annual_emissions_n2o_{component}_co2eq"] = ( - df[f"annual_emissions_n2o_{component}"] * N2O_EMISSIONS_TO_CO2_EQUIVALENTS - ) - - # Add columns for fossil/land/total emissions of all GHG in terms of CO2 equivalents. - for component in ["fossil", "land", "total"]: - df[f"annual_emissions_ghg_{component}_co2eq"] = ( - df[f"annual_emissions_co2_{component}"] - + df[f"annual_emissions_ch4_{component}_co2eq"] - + df[f"annual_emissions_n2o_{component}_co2eq"] - ) - - return df - - -def add_region_aggregates(df: pd.DataFrame, ds_regions: Dataset, ds_income_groups: Dataset) -> pd.DataFrame: - for region in REGIONS: - # List members in this region. - members = geo.list_members_of_region( - region=region, - ds_regions=ds_regions, - ds_income_groups=ds_income_groups, - additional_regions=REGIONS[region].get("additional_regions", None), - excluded_regions=REGIONS[region].get("excluded_regions", None), - additional_members=REGIONS[region].get("additional_members", None), - excluded_members=REGIONS[region].get("excluded_members", None), - ) - df = geo.add_region_aggregates( - df=df, - region=region, - countries_in_region=members, - countries_that_must_have_data=[], - # Here we allow aggregating even when there are few countries informed. - # However, if absolutely all countries have nan, we want the aggregate to be nan, not zero. 
- frac_allowed_nans_per_year=0.999, - ) - - return df - - -def add_share_variables(df: pd.DataFrame) -> pd.DataFrame: - df = df.copy() - - # Create "share" variables (percentages with respect to global). - # To do that, first create a separate dataframe for global data, and add it to the main dataframe. - df_global = df[df["country"] == "World"][["year"] + SHARE_VARIABLES].reset_index(drop=True) - - df = pd.merge(df, df_global, on=["year"], how="left", suffixes=("", "_global")) - # For a list of variables, add the percentage with respect to global. - for variable in SHARE_VARIABLES: - df[f"share_of_{variable}"] = 100 * df[variable] / df[f"{variable}_global"] - - # Drop unnecessary columns for global data. - df = df.drop(columns=[column for column in df.columns if column.endswith("_global")]) - - return df - - -def add_per_capita_variables(df: pd.DataFrame, ds_population: Dataset) -> pd.DataFrame: - df = df.copy() - - # Add population to data. - df = geo.add_population_to_dataframe( - df=df, - ds_population=ds_population, - warn_on_missing_countries=False, - ) - - # Add per-capita variables. - for variable in PER_CAPITA_VARIABLES: - df[f"{variable}_per_capita"] = df[variable] / df["population"] - - # Drop population column. - df = df.drop(columns="population") - - return df - - -def run_sanity_checks_on_outputs(df: pd.DataFrame) -> None: - error = "Share of global emissions cannot be larger than 101%" - assert (df[[column for column in df.columns if "share" in column]].max() < 101).all(), error - error = "Share of global emissions was not expected to be smaller than -1%" - # Some countries did contribute negatively to CO2 emissions; however, overall the negative contribution is always - # smaller than 1% in absolute value. - assert (df[[column for column in df.columns if "share" in column]].min() > -1).all(), error - - # Ensure that no country contributes to emissions more than the entire world. - columns_that_should_be_smaller_than_global = [ - column for column in df.drop(columns=["country", "year"]).columns if "capita" not in column - ] - df_global = df[df["country"] == "World"].drop(columns="country") - check = pd.merge( - df[df["country"] != "World"].reset_index(drop=True), df_global, on="year", how="left", suffixes=("", "_global") - ) - for column in columns_that_should_be_smaller_than_global: - # It is in principle possible that some region would emit more than the world, if the rest of the regions - # were contributing negative CO2 emissions (e.g. High-income countries in 1854). - # However, the difference should be very small. - error = f"Region contributed to {column} more than the entire world." - assert check[(check[column] - check[f"{column}_global"]) / check[f"{column}_global"] > 0.00001].empty, error - - -def run(dest_dir: str) -> None: - log.info("national_contributions.start") - - # - # Load inputs. - # - # Load meadow dataset and read its main table. - ds_meadow: Dataset = paths.load_dependency("national_contributions") - tb_meadow = ds_meadow["national_contributions"] - - # Load regions dataset. - ds_regions: Dataset = paths.load_dependency("regions") - - # Load income groups dataset. - ds_income_groups: Dataset = paths.load_dependency("wb_income") - - # Load population dataset. - ds_population = paths.load_dependency("population") - - # Create a dataframe with data from the table. - df = pd.DataFrame(tb_meadow).reset_index() - - # - # Process data. - # - # Sanity checks. - run_sanity_checks_on_inputs(df=df) - - # Rename gases and components.
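- # NOTE: map_series (from owid.datautils.dataframes) replaces each value of a series according to the given
- # mapping; the warn_* flags in the calls below report values that are missing from, or unused in, the mapping.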
- df["gas"] = map_series( - series=df["gas"], mapping=GASES_RENAMING, warn_on_missing_mappings=True, warn_on_unused_mappings=True - ) - df["component"] = map_series( - series=df["component"], mapping=COMPONENTS_RENAMING, warn_on_missing_mappings=True, warn_on_unused_mappings=True - ) - - # Convert units from teragrams and petagrams to tonnes. - df.loc[df["unit"].str.startswith("Tg"), "data"] *= TERAGRAMS_TO_TONNES - df.loc[df["unit"].str.startswith("Pg"), "data"] *= PETAGRAMS_TO_TONNES - - # Transpose data. - df = df.pivot(index=["country", "year"], columns=["file", "gas", "component"], values="data") - df.columns = ["_".join(column) for column in df.columns] - df = df.reset_index() - - # We add the emissions from the Kuwaiti oil fires in 1991 (which are also included as a separate country) as part - # of the emissions of Kuwait. - # This ensures that these emissions will be included in aggregates of regions that include Kuwait. - df = add_kuwaiti_oil_fires_to_kuwait(df=df) - - # Harmonize country names. - df = geo.harmonize_countries( - df, - countries_file=paths.country_mapping_path, - excluded_countries_file=paths.excluded_countries_path, - warn_on_missing_countries=True, - warn_on_unused_countries=True, - ) - - # Add region aggregates. - df = add_region_aggregates(df=df, ds_regions=ds_regions, ds_income_groups=ds_income_groups) - - # Add columns for emissions in terms of CO2 equivalents. - df = add_emissions_in_co2_equivalents(df=df) - - # Add "share" variables (percentages with respect to global emissions). - df = add_share_variables(df=df) - - # Add per-capita variables. - df = add_per_capita_variables(df=df, ds_population=ds_population) - - # Ensure data starts from a certain fixed year (see notes above). - df = df[df["year"] >= YEAR_MIN].reset_index(drop=True) - - # Sanity checks. - run_sanity_checks_on_outputs(df=df) - - # Create a new table with the processed data. - tb_garden = Table(df, short_name=paths.short_name, underscore=True) - - # Set an appropriate index and sort conveniently. - tb_garden = tb_garden.set_index(["country", "year"], verify_integrity=True).sort_index() - - # - # Save outputs. - # - # Create a new garden dataset with the same metadata as the meadow dataset. - ds_garden = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_meadow.metadata) - - # Save changes in the new garden dataset. - ds_garden.save() - - log.info("national_contributions.end") diff --git a/etl/steps/archive/garden/emissions/2023-05-03/owid_co2.meta.yml b/etl/steps/archive/garden/emissions/2023-05-03/owid_co2.meta.yml deleted file mode 100644 index 34dd2c1b65f..00000000000 --- a/etl/steps/archive/garden/emissions/2023-05-03/owid_co2.meta.yml +++ /dev/null @@ -1,12 +0,0 @@ -dataset: - title: OWID CO2 dataset (2023) - description: | - OWID CO2 dataset. - - This dataset will be loaded by [the co2-data repository](https://github.com/owid/co2-data), to create a csv file of the dataset that can be downloaded in one click. - -# Dataset sources will be created in the step by combining all component datasets' sources. -# Also, table metadata will be built from the tables' original metadata. 
- -tables: - {} diff --git a/etl/steps/archive/garden/emissions/2023-05-03/owid_co2.py b/etl/steps/archive/garden/emissions/2023-05-03/owid_co2.py deleted file mode 100644 index e497bcde0cd..00000000000 --- a/etl/steps/archive/garden/emissions/2023-05-03/owid_co2.py +++ /dev/null @@ -1,419 +0,0 @@ -"""Garden step that combines various datasets related to greenhouse emissions and produces the OWID CO2 dataset (2022). - -Datasets combined: -* Global Carbon Budget (Global Carbon Project, 2022). -* National contributions to climate change (Jones et al. (2023), 2023). -* Greenhouse gas emissions by sector (CAIT, 2022). -* Primary energy consumption (BP & EIA, 2022) - -Additionally, OWID's regions dataset, population dataset and Maddison Project Database (Bolt and van Zanden, 2020) on -GDP are included. - -""" - -from typing import List - -import numpy as np -import pandas as pd -from owid import catalog -from owid.datautils import dataframes - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Details for dataset to export. -DATASET_SHORT_NAME = "owid_co2" -DATASET_TITLE = "CO2 dataset (OWID, 2022)" - -# Conversion factor from tonnes to million tonnes. -TONNES_TO_MILLION_TONNES = 1e-6 - -# Select columns to use from each dataset, and how to rename them. -GCP_COLUMNS = { - "country": "country", - "year": "year", - "emissions_total": "co2", - "emissions_total_per_capita": "co2_per_capita", - "traded_emissions": "trade_co2", - "emissions_from_cement": "cement_co2", - "emissions_from_cement_per_capita": "cement_co2_per_capita", - "emissions_from_coal": "coal_co2", - "emissions_from_coal_per_capita": "coal_co2_per_capita", - "emissions_from_flaring": "flaring_co2", - "emissions_from_flaring_per_capita": "flaring_co2_per_capita", - "emissions_from_gas": "gas_co2", - "emissions_from_gas_per_capita": "gas_co2_per_capita", - "emissions_from_oil": "oil_co2", - "emissions_from_oil_per_capita": "oil_co2_per_capita", - "emissions_from_other_industry": "other_industry_co2", - "emissions_from_other_industry_per_capita": "other_co2_per_capita", - "pct_growth_emissions_total": "co2_growth_prct", - "growth_emissions_total": "co2_growth_abs", - "emissions_total_per_gdp": "co2_per_gdp", - "emissions_total_per_unit_energy": "co2_per_unit_energy", - "consumption_emissions": "consumption_co2", - "consumption_emissions_per_capita": "consumption_co2_per_capita", - "consumption_emissions_per_gdp": "consumption_co2_per_gdp", - "cumulative_emissions_total": "cumulative_co2", - "cumulative_emissions_from_cement": "cumulative_cement_co2", - "cumulative_emissions_from_coal": "cumulative_coal_co2", - "cumulative_emissions_from_flaring": "cumulative_flaring_co2", - "cumulative_emissions_from_gas": "cumulative_gas_co2", - "cumulative_emissions_from_oil": "cumulative_oil_co2", - "cumulative_emissions_from_other_industry": "cumulative_other_co2", - "pct_traded_emissions": "trade_co2_share", - "emissions_total_as_share_of_global": "share_global_co2", - "emissions_from_cement_as_share_of_global": "share_global_cement_co2", - "emissions_from_coal_as_share_of_global": "share_global_coal_co2", - "emissions_from_flaring_as_share_of_global": "share_global_flaring_co2", - "emissions_from_gas_as_share_of_global": "share_global_gas_co2", - "emissions_from_oil_as_share_of_global": "share_global_oil_co2", - "emissions_from_other_industry_as_share_of_global": "share_global_other_co2", - "cumulative_emissions_total_as_share_of_global": 
"share_global_cumulative_co2", - "cumulative_emissions_from_cement_as_share_of_global": "share_global_cumulative_cement_co2", - "cumulative_emissions_from_coal_as_share_of_global": "share_global_cumulative_coal_co2", - "cumulative_emissions_from_flaring_as_share_of_global": "share_global_cumulative_flaring_co2", - "cumulative_emissions_from_gas_as_share_of_global": "share_global_cumulative_gas_co2", - "cumulative_emissions_from_oil_as_share_of_global": "share_global_cumulative_oil_co2", - "cumulative_emissions_from_other_industry_as_share_of_global": "share_global_cumulative_other_co2", - # New variables, related to land-use change emissions. - "cumulative_emissions_from_land_use_change": "cumulative_luc_co2", - "cumulative_emissions_from_land_use_change_as_share_of_global": "share_global_cumulative_luc_co2", - "cumulative_emissions_total_including_land_use_change": "cumulative_co2_including_luc", - "cumulative_emissions_total_including_land_use_change_as_share_of_global": "share_global_cumulative_co2_including_luc", - "emissions_from_land_use_change": "land_use_change_co2", - "emissions_from_land_use_change_as_share_of_global": "share_global_luc_co2", - "emissions_from_land_use_change_per_capita": "land_use_change_co2_per_capita", - "emissions_total_including_land_use_change": "co2_including_luc", - "emissions_total_including_land_use_change_as_share_of_global": "share_global_co2_including_luc", - "emissions_total_including_land_use_change_per_capita": "co2_including_luc_per_capita", - "emissions_total_including_land_use_change_per_gdp": "co2_including_luc_per_gdp", - "emissions_total_including_land_use_change_per_unit_energy": "co2_including_luc_per_unit_energy", - "growth_emissions_total_including_land_use_change": "co2_including_luc_growth_abs", - "pct_growth_emissions_total_including_land_use_change": "co2_including_luc_growth_prct", -} -JONES_COLUMNS = { - "country": "country", - "year": "year", - "temperature_response_co2_total": "temperature_change_from_co2", - "temperature_response_ghg_total": "temperature_change_from_ghg", - "temperature_response_ch4_total": "temperature_change_from_ch4", - "temperature_response_n2o_total": "temperature_change_from_n2o", - "share_of_temperature_response_ghg_total": "share_of_temperature_change_from_ghg", -} -CAIT_GHG_COLUMNS = { - "country": "country", - "year": "year", - "total_excluding_lucf": "total_ghg_excluding_lucf", - "total_excluding_lucf__per_capita": "ghg_excluding_lucf_per_capita", - "total_including_lucf": "total_ghg", - "total_including_lucf__per_capita": "ghg_per_capita", -} -CAIT_CH4_COLUMNS = { - "country": "country", - "year": "year", - "total_including_lucf": "methane", - "total_including_lucf__per_capita": "methane_per_capita", -} -CAIT_N2O_COLUMNS = { - "country": "country", - "year": "year", - "total_including_lucf": "nitrous_oxide", - "total_including_lucf__per_capita": "nitrous_oxide_per_capita", -} -PRIMARY_ENERGY_COLUMNS = { - "country": "country", - "year": "year", - "primary_energy_consumption__twh": "primary_energy_consumption", - "primary_energy_consumption_per_capita__kwh": "energy_per_capita", - "primary_energy_consumption_per_gdp__kwh_per_dollar": "energy_per_gdp", -} -REGIONS_COLUMNS = { - "name": "country", - "iso_alpha3": "iso_code", -} -POPULATION_COLUMNS = { - "country": "country", - "year": "year", - "population": "population", -} -GDP_COLUMNS = { - "country": "country", - "year": "year", - "gdp": "gdp", -} - -UNITS = {"tonnes": {"conversion": TONNES_TO_MILLION_TONNES, "new_unit": "million tonnes"}} - - -def 
unique_sources_from_datasets( - datasets: List[catalog.Dataset], -) -> List[catalog.meta.Source]: - """Gather unique sources from datasets. - - Note: To check if a source is already listed, only the name of the source is considered (not the description or any - other field in the source). - - Parameters - ---------- - datasets : list - List of datasets with metadata. - - Returns - ------- - known_sources : list - List of unique sources from all datasets. - - """ - # Initialise list that will gather all unique metadata sources from the tables. - known_sources: List[catalog.meta.Source] = [] - for ds in datasets: - # Get list of sources of the dataset of current table. - table_sources = ds.metadata.sources - # Go source by source of current table, and check if its name is not already in the list of known_sources. - for source in table_sources: - # Check if this source's name is different to all known_sources. - if all([source.name != known_source.name for known_source in known_sources]): - # Add the new source to the list. - known_sources.append(source) - - return known_sources - - -def convert_units(table: catalog.Table) -> catalog.Table: - """Convert units of table. - - Parameters - ---------- - table : catalog.Table - Data with its original units. - - Returns - ------- - catalog.Table - Data after converting units of specific columns. - - """ - table = table.copy() - # Check units and convert to more convenient ones. - for column in table.columns: - unit = table[column].metadata.unit - short_unit = table[column].metadata.short_unit - title = table[column].metadata.title - description = table[column].metadata.description - if unit in list(UNITS): - table[column] *= UNITS[unit]["conversion"] - table[column].metadata.unit = unit - table[column].metadata.short_unit = short_unit - table[column].metadata.title = title - table[column].metadata.description = description.replace(unit, UNITS[unit]["new_unit"]) - - return table - - -def combine_tables( - tb_gcp: catalog.Table, - tb_jones: catalog.Table, - tb_cait_ghg: catalog.Table, - tb_cait_ch4: catalog.Table, - tb_cait_n2o: catalog.Table, - tb_energy: catalog.Table, - tb_gdp: catalog.Table, - tb_population: catalog.Table, - tb_regions: catalog.Table, -) -> catalog.Table: - """Combine tables. - - Parameters - ---------- - tb_gcp : catalog.Table - Global Carbon Budget table (from Global Carbon Project). - tb_jones : catalog.Table - National contributions to climate change (from Jones et al. (2023)). - tb_cait_ghg : catalog.Table - Greenhouse gas emissions table (from CAIT). - tb_cait_ch4 : catalog.Table - CH4 emissions table (from CAIT). - tb_cait_n2o : catalog.Table - N2O emissions table (from CAIT). - tb_energy : catalog.Table - Primary energy consumption table (from BP & EIA). - tb_gdp : catalog.Table - Maddison GDP table (from GGDC). - tb_population : catalog.Table - OWID population table (from various sources). - tb_regions : catalog.Table - OWID regions table. - - Returns - ------- - combined : catalog.Table - Combined table with metadata and variables metadata. - - """ - # Gather all variables' metadata from all tables. - tables = [tb_gcp, tb_jones, tb_cait_ghg, tb_cait_ch4, tb_cait_n2o, tb_energy, tb_gdp, tb_population, tb_regions] - variables_metadata = {} - for table in tables: - for variable in table.columns: - # If variable does not have sources metadata, take them from the dataset metadata. 
- if len(table[variable].metadata.sources) == 0: - if table.metadata.dataset is None: - table[variable].metadata.sources = [] - else: - table[variable].metadata.sources = table.metadata.dataset.sources - variables_metadata[variable] = table[variable].metadata - - # Combine main tables (with an outer join, to gather all entities from all tables). - tables = [tb_gcp, tb_jones, tb_cait_ghg, tb_cait_ch4, tb_cait_n2o] - combined = dataframes.multi_merge(dfs=tables, on=["country", "year"], how="outer") - - # Add secondary tables (with a left join, to keep only entities for which we have emissions data). - tables = [combined, tb_energy, tb_gdp, tb_population] - combined = dataframes.multi_merge(dfs=tables, on=["country", "year"], how="left") - - # Countries-regions dataset does not have a year column, so it has to be merged on country. - combined = pd.merge(combined, tb_regions, on="country", how="left") - - # Assign variables metadata back to combined dataframe. - for variable in variables_metadata: - combined[variable].metadata = variables_metadata[variable] - - # Check that there was no repetition in column names. - error = "Repeated columns in combined data." - assert len([column for column in set(combined.columns) if "_x" in column]) == 0, error - - # Adjust units. - combined = convert_units(combined) - - # Adjust metadata. - combined.metadata.short_name = "owid_co2" - - return combined - - -def prepare_outputs(combined: catalog.Table) -> catalog.Table: - """Clean and prepare output table. - - Parameters - ---------- - combined : catalog.Table - Combined table. - - Returns - ------- - combined: catalog.Table - Cleaned combined table. - - """ - # Remove rows that only have nan (ignoring whether country, year, iso_code, population and gdp have data). - columns_that_must_have_data = [ - column for column in combined.columns if column not in ["country", "year", "iso_code", "population", "gdp"] - ] - combined = combined.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) - - # Sanity check. - columns_with_inf = [column for column in combined.columns if len(combined[combined[column] == np.inf]) > 0] - assert len(columns_with_inf) == 0, f"Infinity values detected in columns: {columns_with_inf}" - - # Set index and sort conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index() - - return combined - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load the global carbon budget dataset from the Global Carbon Project (GCP). - ds_gcp: catalog.Dataset = paths.load_dependency("global_carbon_budget") - - # Load the Jones et al. (2023) dataset on national contributions to climate change. - ds_jones: catalog.Dataset = paths.load_dependency("national_contributions") - - # Load the greenhouse gas emissions by sector dataset by CAIT. - ds_cait: catalog.Dataset = paths.load_dependency("ghg_emissions_by_sector") - - # Load the GDP dataset by GGDC Maddison. - ds_gdp: catalog.Dataset = paths.load_dependency("ggdc_maddison") - - # Load primary energy consumption dataset (by different sources in our 'energy' namespace). - ds_energy: catalog.Dataset = paths.load_dependency("primary_energy_consumption") - - # Load population dataset. - ds_population: catalog.Dataset = paths.load_dependency("population") - - # Load countries-regions dataset (required to get ISO codes). - ds_regions: catalog.Dataset = paths.load_dependency("regions") -
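# A minimal sketch (toy frames, illustrative only) of the merge strategy used in
# `combine_tables` above: emissions tables are combined with an outer join, so every
# (country, year) pair present in any of them is kept; secondary tables are then attached
# with a left join, so they never add rows of their own.
import pandas as pd

co2 = pd.DataFrame({"country": ["A", "B"], "year": [2000, 2000], "co2": [1.0, 2.0]})
ch4 = pd.DataFrame({"country": ["B", "C"], "year": [2000, 2000], "ch4": [3.0, 4.0]})
gdp = pd.DataFrame({"country": ["A", "B", "C", "D"], "year": [2000] * 4, "gdp": [10, 20, 30, 40]})

combined = pd.merge(co2, ch4, on=["country", "year"], how="outer")  # keeps A, B and C
combined = pd.merge(combined, gdp, on=["country", "year"], how="left")  # adds gdp without adding D

- # Gather all required tables from all datasets.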
- tb_gcp = ds_gcp["global_carbon_budget"] - tb_jones = ds_jones["national_contributions"] - tb_cait_ghg = ds_cait["greenhouse_gas_emissions_by_sector"] - tb_cait_ch4 = ds_cait["methane_emissions_by_sector"] - tb_cait_n2o = ds_cait["nitrous_oxide_emissions_by_sector"] - tb_energy = ds_energy["primary_energy_consumption"] - tb_gdp = ds_gdp["maddison_gdp"] - tb_population = ds_population["population"] - tb_regions = ds_regions["regions"] - - # - # Process data. - # - # Choose required columns and rename them. - tb_gcp = tb_gcp.reset_index()[list(GCP_COLUMNS)].rename(columns=GCP_COLUMNS) - tb_jones = tb_jones.reset_index()[list(JONES_COLUMNS)].rename(columns=JONES_COLUMNS) - tb_cait_ghg = tb_cait_ghg.reset_index()[list(CAIT_GHG_COLUMNS)].rename(columns=CAIT_GHG_COLUMNS) - tb_cait_ch4 = tb_cait_ch4.reset_index()[list(CAIT_CH4_COLUMNS)].rename(columns=CAIT_CH4_COLUMNS) - tb_cait_n2o = tb_cait_n2o.reset_index()[list(CAIT_N2O_COLUMNS)].rename(columns=CAIT_N2O_COLUMNS) - tb_energy = tb_energy.reset_index()[list(PRIMARY_ENERGY_COLUMNS)].rename(columns=PRIMARY_ENERGY_COLUMNS) - tb_gdp = tb_gdp.reset_index()[list(GDP_COLUMNS)].rename(columns=GDP_COLUMNS) - tb_population = tb_population.reset_index()[list(POPULATION_COLUMNS)].rename(columns=POPULATION_COLUMNS) - tb_regions = tb_regions.reset_index()[list(REGIONS_COLUMNS)].rename(columns=REGIONS_COLUMNS) - - # Combine tables. - combined = combine_tables( - tb_gcp=tb_gcp, - tb_jones=tb_jones, - tb_cait_ghg=tb_cait_ghg, - tb_cait_ch4=tb_cait_ch4, - tb_cait_n2o=tb_cait_n2o, - tb_energy=tb_energy, - tb_gdp=tb_gdp, - tb_population=tb_population, - tb_regions=tb_regions, - ) - - # Prepare outputs. - combined = prepare_outputs(combined=combined) - - # - # Save outputs. - # - ds_garden = create_dataset(dest_dir, tables=[combined]) - - # Gather metadata sources from all tables' original dataset sources. - datasets = [ - ds_gcp, - ds_jones, - ds_cait, - ds_gdp, - ds_energy, - ds_regions, - ] - sources = unique_sources_from_datasets(datasets=datasets) - - # OWID population dataset does not have sources metadata. - sources.append( - catalog.meta.Source( - name="Our World in Data based on different sources (https://ourworldindata.org/population-sources)." - ) - ) - - ds_garden.metadata.sources = sources - - # Create dataset. - ds_garden.save() diff --git a/etl/steps/archive/garden/emissions/2023-07-10/owid_co2.meta.yml b/etl/steps/archive/garden/emissions/2023-07-10/owid_co2.meta.yml deleted file mode 100644 index 5d894bf3eb1..00000000000 --- a/etl/steps/archive/garden/emissions/2023-07-10/owid_co2.meta.yml +++ /dev/null @@ -1,9 +0,0 @@ -dataset: - title: OWID CO2 dataset (2023b) - description: | - OWID CO2 dataset. - - This dataset will be loaded by [the co2-data repository](https://github.com/owid/co2-data), to create a csv file of the dataset that can be downloaded in one click. - -# Dataset sources will be created in the step by combining all component datasets' sources. -# Also, table metadata will be built from the tables' original metadata. diff --git a/etl/steps/archive/garden/emissions/2023-07-10/owid_co2.py b/etl/steps/archive/garden/emissions/2023-07-10/owid_co2.py deleted file mode 100644 index 683132c785a..00000000000 --- a/etl/steps/archive/garden/emissions/2023-07-10/owid_co2.py +++ /dev/null @@ -1,415 +0,0 @@ -"""Garden step that combines various datasets related to greenhouse emissions and produces the OWID CO2 dataset (2022). - -Datasets combined: -* Global Carbon Budget (Global Carbon Project, 2022). 
-* National contributions to climate change (Jones et al. (2023), 2023). -* Greenhouse gas emissions by sector (CAIT, 2022). -* Primary energy consumption (BP & EIA, 2022) - -Additionally, OWID's regions dataset, population dataset and Maddison Project Database (Bolt and van Zanden, 2020) on -GDP are included. - -""" - -from typing import List - -import numpy as np -import pandas as pd -from owid import catalog -from owid.datautils import dataframes - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Conversion factor from tonnes to million tonnes. -TONNES_TO_MILLION_TONNES = 1e-6 - -# Select columns to use from each dataset, and how to rename them. -GCP_COLUMNS = { - "country": "country", - "year": "year", - "emissions_total": "co2", - "emissions_total_per_capita": "co2_per_capita", - "traded_emissions": "trade_co2", - "emissions_from_cement": "cement_co2", - "emissions_from_cement_per_capita": "cement_co2_per_capita", - "emissions_from_coal": "coal_co2", - "emissions_from_coal_per_capita": "coal_co2_per_capita", - "emissions_from_flaring": "flaring_co2", - "emissions_from_flaring_per_capita": "flaring_co2_per_capita", - "emissions_from_gas": "gas_co2", - "emissions_from_gas_per_capita": "gas_co2_per_capita", - "emissions_from_oil": "oil_co2", - "emissions_from_oil_per_capita": "oil_co2_per_capita", - "emissions_from_other_industry": "other_industry_co2", - "emissions_from_other_industry_per_capita": "other_co2_per_capita", - "pct_growth_emissions_total": "co2_growth_prct", - "growth_emissions_total": "co2_growth_abs", - "emissions_total_per_gdp": "co2_per_gdp", - "emissions_total_per_unit_energy": "co2_per_unit_energy", - "consumption_emissions": "consumption_co2", - "consumption_emissions_per_capita": "consumption_co2_per_capita", - "consumption_emissions_per_gdp": "consumption_co2_per_gdp", - "cumulative_emissions_total": "cumulative_co2", - "cumulative_emissions_from_cement": "cumulative_cement_co2", - "cumulative_emissions_from_coal": "cumulative_coal_co2", - "cumulative_emissions_from_flaring": "cumulative_flaring_co2", - "cumulative_emissions_from_gas": "cumulative_gas_co2", - "cumulative_emissions_from_oil": "cumulative_oil_co2", - "cumulative_emissions_from_other_industry": "cumulative_other_co2", - "pct_traded_emissions": "trade_co2_share", - "emissions_total_as_share_of_global": "share_global_co2", - "emissions_from_cement_as_share_of_global": "share_global_cement_co2", - "emissions_from_coal_as_share_of_global": "share_global_coal_co2", - "emissions_from_flaring_as_share_of_global": "share_global_flaring_co2", - "emissions_from_gas_as_share_of_global": "share_global_gas_co2", - "emissions_from_oil_as_share_of_global": "share_global_oil_co2", - "emissions_from_other_industry_as_share_of_global": "share_global_other_co2", - "cumulative_emissions_total_as_share_of_global": "share_global_cumulative_co2", - "cumulative_emissions_from_cement_as_share_of_global": "share_global_cumulative_cement_co2", - "cumulative_emissions_from_coal_as_share_of_global": "share_global_cumulative_coal_co2", - "cumulative_emissions_from_flaring_as_share_of_global": "share_global_cumulative_flaring_co2", - "cumulative_emissions_from_gas_as_share_of_global": "share_global_cumulative_gas_co2", - "cumulative_emissions_from_oil_as_share_of_global": "share_global_cumulative_oil_co2", - "cumulative_emissions_from_other_industry_as_share_of_global": "share_global_cumulative_other_co2", - # New variables, related to 
land-use change emissions. - "cumulative_emissions_from_land_use_change": "cumulative_luc_co2", - "cumulative_emissions_from_land_use_change_as_share_of_global": "share_global_cumulative_luc_co2", - "cumulative_emissions_total_including_land_use_change": "cumulative_co2_including_luc", - "cumulative_emissions_total_including_land_use_change_as_share_of_global": "share_global_cumulative_co2_including_luc", - "emissions_from_land_use_change": "land_use_change_co2", - "emissions_from_land_use_change_as_share_of_global": "share_global_luc_co2", - "emissions_from_land_use_change_per_capita": "land_use_change_co2_per_capita", - "emissions_total_including_land_use_change": "co2_including_luc", - "emissions_total_including_land_use_change_as_share_of_global": "share_global_co2_including_luc", - "emissions_total_including_land_use_change_per_capita": "co2_including_luc_per_capita", - "emissions_total_including_land_use_change_per_gdp": "co2_including_luc_per_gdp", - "emissions_total_including_land_use_change_per_unit_energy": "co2_including_luc_per_unit_energy", - "growth_emissions_total_including_land_use_change": "co2_including_luc_growth_abs", - "pct_growth_emissions_total_including_land_use_change": "co2_including_luc_growth_prct", -} -JONES_COLUMNS = { - "country": "country", - "year": "year", - "temperature_response_co2_total": "temperature_change_from_co2", - "temperature_response_ghg_total": "temperature_change_from_ghg", - "temperature_response_ch4_total": "temperature_change_from_ch4", - "temperature_response_n2o_total": "temperature_change_from_n2o", - "share_of_temperature_response_ghg_total": "share_of_temperature_change_from_ghg", -} -CAIT_GHG_COLUMNS = { - "country": "country", - "year": "year", - "total_excluding_lucf": "total_ghg_excluding_lucf", - "total_excluding_lucf__per_capita": "ghg_excluding_lucf_per_capita", - "total_including_lucf": "total_ghg", - "total_including_lucf__per_capita": "ghg_per_capita", -} -CAIT_CH4_COLUMNS = { - "country": "country", - "year": "year", - "total_including_lucf": "methane", - "total_including_lucf__per_capita": "methane_per_capita", -} -CAIT_N2O_COLUMNS = { - "country": "country", - "year": "year", - "total_including_lucf": "nitrous_oxide", - "total_including_lucf__per_capita": "nitrous_oxide_per_capita", -} -PRIMARY_ENERGY_COLUMNS = { - "country": "country", - "year": "year", - "primary_energy_consumption__twh": "primary_energy_consumption", - "primary_energy_consumption_per_capita__kwh": "energy_per_capita", - "primary_energy_consumption_per_gdp__kwh_per_dollar": "energy_per_gdp", -} -REGIONS_COLUMNS = { - "name": "country", - "iso_alpha3": "iso_code", -} -POPULATION_COLUMNS = { - "country": "country", - "year": "year", - "population": "population", -} -GDP_COLUMNS = { - "country": "country", - "year": "year", - "gdp": "gdp", -} - -UNITS = {"tonnes": {"conversion": TONNES_TO_MILLION_TONNES, "new_unit": "million tonnes"}} - - -def unique_sources_from_datasets( - datasets: List[catalog.Dataset], -) -> List[catalog.meta.Source]: - """Gather unique sources from datasets. - - Note: To check if a source is already listed, only the name of the source is considered (not the description or any - other field in the source). - - Parameters - ---------- - datasets : list - List of datasets with metadata. - - Returns - ------- - known_sources : list - List of unique sources from all datasets. - - """ - # Initialise list that will gather all unique metadata sources from the tables. 
- known_sources: List[catalog.meta.Source] = [] - for ds in datasets: - # Get list of sources of the dataset of current table. - table_sources = ds.metadata.sources - # Go source by source of current table, and check if its name is not already in the list of known_sources. - for source in table_sources: - # Check if this source's name is different to all known_sources. - if all([source.name != known_source.name for known_source in known_sources]): - # Add the new source to the list. - known_sources.append(source) - - return known_sources - - -def convert_units(table: catalog.Table) -> catalog.Table: - """Convert units of table. - - Parameters - ---------- - table : catalog.Table - Data with its original units. - - Returns - ------- - catalog.Table - Data after converting units of specific columns. - - """ - table = table.copy() - # Check units and convert to more convenient ones. - for column in table.columns: - unit = table[column].metadata.unit - short_unit = table[column].metadata.short_unit - title = table[column].metadata.title - description = table[column].metadata.description - if unit in list(UNITS): - table[column] *= UNITS[unit]["conversion"] - table[column].metadata.unit = unit - table[column].metadata.short_unit = short_unit - table[column].metadata.title = title - table[column].metadata.description = description.replace(unit, UNITS[unit]["new_unit"]) - - return table - - -def combine_tables( - tb_gcp: catalog.Table, - tb_jones: catalog.Table, - tb_cait_ghg: catalog.Table, - tb_cait_ch4: catalog.Table, - tb_cait_n2o: catalog.Table, - tb_energy: catalog.Table, - tb_gdp: catalog.Table, - tb_population: catalog.Table, - tb_regions: catalog.Table, -) -> catalog.Table: - """Combine tables. - - Parameters - ---------- - tb_gcp : catalog.Table - Global Carbon Budget table (from Global Carbon Project). - tb_jones : catalog.Table - National contributions to climate change (from Jones et al. (2023)). - tb_cait_ghg : catalog.Table - Greenhouse gas emissions table (from CAIT). - tb_cait_ch4 : catalog.Table - CH4 emissions table (from CAIT). - tb_cait_n2o : catalog.Table - N2O emissions table (from CAIT). - tb_energy : catalog.Table - Primary energy consumption table (from BP & EIA). - tb_gdp : catalog.Table - Maddison GDP table (from GGDC). - tb_population : catalog.Table - OWID population table (from various sources). - tb_regions : catalog.Table - OWID regions table. - - Returns - ------- - combined : catalog.Table - Combined table with metadata and variables metadata. - - """ - # Gather all variables' metadata from all tables. - tables = [tb_gcp, tb_jones, tb_cait_ghg, tb_cait_ch4, tb_cait_n2o, tb_energy, tb_gdp, tb_population, tb_regions] - variables_metadata = {} - for table in tables: - for variable in table.columns: - # If variable does not have sources metadata, take them from the dataset metadata. - if len(table[variable].metadata.sources) == 0: - if table.metadata.dataset is None: - table[variable].metadata.sources = [] - else: - table[variable].metadata.sources = table.metadata.dataset.sources - variables_metadata[variable] = table[variable].metadata - - # Combine main tables (with an outer join, to gather all entities from all tables). - tables = [tb_gcp, tb_jones, tb_cait_ghg, tb_cait_ch4, tb_cait_n2o] - combined = dataframes.multi_merge(dfs=tables, on=["country", "year"], how="outer") - - # Add secondary tables (with a left join, to keep only entities for which we have emissions data). 
- tables = [combined, tb_energy, tb_gdp, tb_population] - combined = dataframes.multi_merge(dfs=tables, on=["country", "year"], how="left") - - # Countries-regions dataset does not have a year column, so it has to be merged on country. - combined = pd.merge(combined, tb_regions, on="country", how="left") - - # Assign variables metadata back to combined dataframe. - for variable in variables_metadata: - combined[variable].metadata = variables_metadata[variable] - - # Check that there was no repetition in column names. - error = "Repeated columns in combined data." - assert len([column for column in set(combined.columns) if "_x" in column]) == 0, error - - # Adjust units. - combined = convert_units(combined) - - # Adjust metadata. - combined.metadata.short_name = "owid_co2" - - return combined - - -def prepare_outputs(combined: catalog.Table) -> catalog.Table: - """Clean and prepare output table. - - Parameters - ---------- - combined : catalog.Table - Combined table. - - Returns - ------- - combined: catalog.Table - Cleaned combined table. - - """ - # Remove rows that only have nan (ignoring whether country, year, iso_code, population and gdp have data). - columns_that_must_have_data = [ - column for column in combined.columns if column not in ["country", "year", "iso_code", "population", "gdp"] - ] - combined = combined.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) - - # Sanity check. - columns_with_inf = [column for column in combined.columns if len(combined[combined[column] == np.inf]) > 0] - assert len(columns_with_inf) == 0, f"Infinity values detected in columns: {columns_with_inf}" - - # Set index and sort conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index() - - return combined - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load the global carbon budget dataset from the Global Carbon Project (GCP). - ds_gcp: catalog.Dataset = paths.load_dependency("global_carbon_budget") - - # Load the Jones et al. (2023) dataset on national contributions to climate change. - ds_jones: catalog.Dataset = paths.load_dependency("national_contributions") - - # Load the greenhouse gas emissions by sector dataset by CAIT. - ds_cait: catalog.Dataset = paths.load_dependency("ghg_emissions_by_sector") - - # Load the GDP dataset by GGDC Maddison. - ds_gdp: catalog.Dataset = paths.load_dependency("ggdc_maddison") - - # Load primary energy consumption dataset (by different sources in our 'energy' namespace). - ds_energy: catalog.Dataset = paths.load_dependency("primary_energy_consumption") - - # Load population dataset. - ds_population: catalog.Dataset = paths.load_dependency("population") - - # Load countries-regions dataset (required to get ISO codes). - ds_regions: catalog.Dataset = paths.load_dependency("regions") - - # Gather all required tables from all datasets. - tb_gcp = ds_gcp["global_carbon_budget"] - tb_jones = ds_jones["national_contributions"] - tb_cait_ghg = ds_cait["greenhouse_gas_emissions_by_sector"] - tb_cait_ch4 = ds_cait["methane_emissions_by_sector"] - tb_cait_n2o = ds_cait["nitrous_oxide_emissions_by_sector"] - tb_energy = ds_energy["primary_energy_consumption"] - tb_gdp = ds_gdp["maddison_gdp"] - tb_population = ds_population["population"] - tb_regions = ds_regions["regions"] - - # - # Process data. - # - # Choose required columns and rename them. 
- tb_gcp = tb_gcp.reset_index()[list(GCP_COLUMNS)].rename(columns=GCP_COLUMNS) - tb_jones = tb_jones.reset_index()[list(JONES_COLUMNS)].rename(columns=JONES_COLUMNS) - tb_cait_ghg = tb_cait_ghg.reset_index()[list(CAIT_GHG_COLUMNS)].rename(columns=CAIT_GHG_COLUMNS) - tb_cait_ch4 = tb_cait_ch4.reset_index()[list(CAIT_CH4_COLUMNS)].rename(columns=CAIT_CH4_COLUMNS) - tb_cait_n2o = tb_cait_n2o.reset_index()[list(CAIT_N2O_COLUMNS)].rename(columns=CAIT_N2O_COLUMNS) - tb_energy = tb_energy.reset_index()[list(PRIMARY_ENERGY_COLUMNS)].rename(columns=PRIMARY_ENERGY_COLUMNS) - tb_gdp = tb_gdp.reset_index()[list(GDP_COLUMNS)].rename(columns=GDP_COLUMNS) - tb_population = tb_population.reset_index()[list(POPULATION_COLUMNS)].rename(columns=POPULATION_COLUMNS) - tb_regions = tb_regions.reset_index()[list(REGIONS_COLUMNS)].rename(columns=REGIONS_COLUMNS) - - # Combine tables. - combined = combine_tables( - tb_gcp=tb_gcp, - tb_jones=tb_jones, - tb_cait_ghg=tb_cait_ghg, - tb_cait_ch4=tb_cait_ch4, - tb_cait_n2o=tb_cait_n2o, - tb_energy=tb_energy, - tb_gdp=tb_gdp, - tb_population=tb_population, - tb_regions=tb_regions, - ) - - # Prepare outputs. - combined = prepare_outputs(combined=combined) - - # - # Save outputs. - # - ds_garden = create_dataset(dest_dir, tables=[combined]) - - # Gather metadata sources from all tables' original dataset sources. - datasets = [ - ds_gcp, - ds_jones, - ds_cait, - ds_gdp, - ds_energy, - ds_regions, - ] - sources = unique_sources_from_datasets(datasets=datasets) - - # OWID population dataset does not have sources metadata. - sources.append( - catalog.meta.Source( - name="Our World in Data based on different sources (https://ourworldindata.org/population-sources)." - ) - ) - - ds_garden.metadata.sources = sources - - # Create dataset. - ds_garden.save() diff --git a/etl/steps/archive/garden/emissions/2023-09-28/owid_co2.meta.yml b/etl/steps/archive/garden/emissions/2023-09-28/owid_co2.meta.yml deleted file mode 100644 index 5d894bf3eb1..00000000000 --- a/etl/steps/archive/garden/emissions/2023-09-28/owid_co2.meta.yml +++ /dev/null @@ -1,9 +0,0 @@ -dataset: - title: OWID CO2 dataset (2023b) - description: | - OWID CO2 dataset. - - This dataset will be loaded by [the co2-data repository](https://github.com/owid/co2-data), to create a csv file of the dataset that can be downloaded in one click. - -# Dataset sources will be created in the step by combining all component datasets' sources. -# Also, table metadata will be built from the tables' original metadata. diff --git a/etl/steps/archive/garden/emissions/2023-09-28/owid_co2.py b/etl/steps/archive/garden/emissions/2023-09-28/owid_co2.py deleted file mode 100644 index 338c6c11129..00000000000 --- a/etl/steps/archive/garden/emissions/2023-09-28/owid_co2.py +++ /dev/null @@ -1,397 +0,0 @@ -"""Garden step that combines various datasets related to greenhouse emissions and produces the OWID CO2 dataset (2022). - -Datasets combined: -* Global Carbon Budget (Global Carbon Project, 2022). -* National contributions to climate change (Jones et al. (2023), 2023). -* Greenhouse gas emissions by sector (CAIT, 2022). -* Primary energy consumption (BP & EIA, 2022) - -Additionally, OWID's regions dataset, population dataset and Maddison Project Database (Bolt and van Zanden, 2020) on -GDP are included. - -""" - -from typing import List - -import numpy as np -from owid.catalog import Dataset, Source, Table - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. 
-paths = PathFinder(__file__) - -# Conversion factor from tonnes to million tonnes. -TONNES_TO_MILLION_TONNES = 1e-6 - -# Select columns to use from each dataset, and how to rename them. -GCP_COLUMNS = { - "country": "country", - "year": "year", - "emissions_total": "co2", - "emissions_total_per_capita": "co2_per_capita", - "traded_emissions": "trade_co2", - "emissions_from_cement": "cement_co2", - "emissions_from_cement_per_capita": "cement_co2_per_capita", - "emissions_from_coal": "coal_co2", - "emissions_from_coal_per_capita": "coal_co2_per_capita", - "emissions_from_flaring": "flaring_co2", - "emissions_from_flaring_per_capita": "flaring_co2_per_capita", - "emissions_from_gas": "gas_co2", - "emissions_from_gas_per_capita": "gas_co2_per_capita", - "emissions_from_oil": "oil_co2", - "emissions_from_oil_per_capita": "oil_co2_per_capita", - "emissions_from_other_industry": "other_industry_co2", - "emissions_from_other_industry_per_capita": "other_co2_per_capita", - "pct_growth_emissions_total": "co2_growth_prct", - "growth_emissions_total": "co2_growth_abs", - "emissions_total_per_gdp": "co2_per_gdp", - "emissions_total_per_unit_energy": "co2_per_unit_energy", - "consumption_emissions": "consumption_co2", - "consumption_emissions_per_capita": "consumption_co2_per_capita", - "consumption_emissions_per_gdp": "consumption_co2_per_gdp", - "cumulative_emissions_total": "cumulative_co2", - "cumulative_emissions_from_cement": "cumulative_cement_co2", - "cumulative_emissions_from_coal": "cumulative_coal_co2", - "cumulative_emissions_from_flaring": "cumulative_flaring_co2", - "cumulative_emissions_from_gas": "cumulative_gas_co2", - "cumulative_emissions_from_oil": "cumulative_oil_co2", - "cumulative_emissions_from_other_industry": "cumulative_other_co2", - "pct_traded_emissions": "trade_co2_share", - "emissions_total_as_share_of_global": "share_global_co2", - "emissions_from_cement_as_share_of_global": "share_global_cement_co2", - "emissions_from_coal_as_share_of_global": "share_global_coal_co2", - "emissions_from_flaring_as_share_of_global": "share_global_flaring_co2", - "emissions_from_gas_as_share_of_global": "share_global_gas_co2", - "emissions_from_oil_as_share_of_global": "share_global_oil_co2", - "emissions_from_other_industry_as_share_of_global": "share_global_other_co2", - "cumulative_emissions_total_as_share_of_global": "share_global_cumulative_co2", - "cumulative_emissions_from_cement_as_share_of_global": "share_global_cumulative_cement_co2", - "cumulative_emissions_from_coal_as_share_of_global": "share_global_cumulative_coal_co2", - "cumulative_emissions_from_flaring_as_share_of_global": "share_global_cumulative_flaring_co2", - "cumulative_emissions_from_gas_as_share_of_global": "share_global_cumulative_gas_co2", - "cumulative_emissions_from_oil_as_share_of_global": "share_global_cumulative_oil_co2", - "cumulative_emissions_from_other_industry_as_share_of_global": "share_global_cumulative_other_co2", - # New variables, related to land-use change emissions. 
- "cumulative_emissions_from_land_use_change": "cumulative_luc_co2", - "cumulative_emissions_from_land_use_change_as_share_of_global": "share_global_cumulative_luc_co2", - "cumulative_emissions_total_including_land_use_change": "cumulative_co2_including_luc", - "cumulative_emissions_total_including_land_use_change_as_share_of_global": "share_global_cumulative_co2_including_luc", - "emissions_from_land_use_change": "land_use_change_co2", - "emissions_from_land_use_change_as_share_of_global": "share_global_luc_co2", - "emissions_from_land_use_change_per_capita": "land_use_change_co2_per_capita", - "emissions_total_including_land_use_change": "co2_including_luc", - "emissions_total_including_land_use_change_as_share_of_global": "share_global_co2_including_luc", - "emissions_total_including_land_use_change_per_capita": "co2_including_luc_per_capita", - "emissions_total_including_land_use_change_per_gdp": "co2_including_luc_per_gdp", - "emissions_total_including_land_use_change_per_unit_energy": "co2_including_luc_per_unit_energy", - "growth_emissions_total_including_land_use_change": "co2_including_luc_growth_abs", - "pct_growth_emissions_total_including_land_use_change": "co2_including_luc_growth_prct", -} -JONES_COLUMNS = { - "country": "country", - "year": "year", - "temperature_response_co2_total": "temperature_change_from_co2", - "temperature_response_ghg_total": "temperature_change_from_ghg", - "temperature_response_ch4_total": "temperature_change_from_ch4", - "temperature_response_n2o_total": "temperature_change_from_n2o", - "share_of_temperature_response_ghg_total": "share_of_temperature_change_from_ghg", -} -CAIT_GHG_COLUMNS = { - "country": "country", - "year": "year", - "total_excluding_lucf": "total_ghg_excluding_lucf", - "total_excluding_lucf__per_capita": "ghg_excluding_lucf_per_capita", - "total_including_lucf": "total_ghg", - "total_including_lucf__per_capita": "ghg_per_capita", -} -CAIT_CH4_COLUMNS = { - "country": "country", - "year": "year", - "total_including_lucf": "methane", - "total_including_lucf__per_capita": "methane_per_capita", -} -CAIT_N2O_COLUMNS = { - "country": "country", - "year": "year", - "total_including_lucf": "nitrous_oxide", - "total_including_lucf__per_capita": "nitrous_oxide_per_capita", -} -PRIMARY_ENERGY_COLUMNS = { - "country": "country", - "year": "year", - "primary_energy_consumption__twh": "primary_energy_consumption", - "primary_energy_consumption_per_capita__kwh": "energy_per_capita", - "primary_energy_consumption_per_gdp__kwh_per_dollar": "energy_per_gdp", -} -REGIONS_COLUMNS = { - "name": "country", - "iso_alpha3": "iso_code", -} -POPULATION_COLUMNS = { - "country": "country", - "year": "year", - "population": "population", -} -GDP_COLUMNS = { - "country": "country", - "year": "year", - "gdp": "gdp", -} - -UNITS = {"tonnes": {"conversion": TONNES_TO_MILLION_TONNES, "new_unit": "million tonnes"}} - - -def unique_sources_from_datasets( - datasets: List[Dataset], -) -> List[Source]: - """Gather unique sources from datasets. - - Note: To check if a source is already listed, only the name of the source is considered (not the description or any - other field in the source). - - Parameters - ---------- - datasets : list - List of datasets with metadata. - - Returns - ------- - known_sources : list - List of unique sources from all datasets. - - """ - # Initialise list that will gather all unique metadata sources from the tables. - known_sources: List[Source] = [] - for ds in datasets: - # Get list of sources of the dataset of current table. 
- table_sources = ds.metadata.sources - # Go source by source of current table, and check if its name is not already in the list of known_sources. - for source in table_sources: - # Check if this source's name is different to all known_sources. - if all([source.name != known_source.name for known_source in known_sources]): - # Add the new source to the list. - known_sources.append(source) - - return known_sources - - -def convert_units(table: Table) -> Table: - """Convert units of table. - - Parameters - ---------- - table : Table - Data with its original units. - - Returns - ------- - Table - Data after converting units of specific columns. - - """ - table = table.copy() - # Check units and convert to more convenient ones. - for column in table.columns: - unit = table[column].metadata.unit - short_unit = table[column].metadata.short_unit - title = table[column].metadata.title - description = table[column].metadata.description or table[column].metadata.description_short - if unit in list(UNITS): - table[column] *= UNITS[unit]["conversion"] - table[column].metadata.unit = unit - table[column].metadata.short_unit = short_unit - table[column].metadata.title = title - table[column].metadata.description = description.replace(unit, UNITS[unit]["new_unit"]) - - return table - - -def combine_tables( - tb_gcp: Table, - tb_jones: Table, - tb_cait_ghg: Table, - tb_cait_ch4: Table, - tb_cait_n2o: Table, - tb_energy: Table, - tb_gdp: Table, - tb_population: Table, - tb_regions: Table, -) -> Table: - """Combine tables. - - Parameters - ---------- - tb_gcp : Table - Global Carbon Budget table (from Global Carbon Project). - tb_jones : Table - National contributions to climate change (from Jones et al. (2023)). - tb_cait_ghg : Table - Greenhouse gas emissions table (from CAIT). - tb_cait_ch4 : Table - CH4 emissions table (from CAIT). - tb_cait_n2o : Table - N2O emissions table (from CAIT). - tb_energy : Table - Primary energy consumption table (from BP & EIA). - tb_gdp : Table - Maddison GDP table (from GGDC). - tb_population : Table - OWID population table (from various sources). - tb_regions : Table - OWID regions table. - - Returns - ------- - combined : Table - Combined table with metadata and variables metadata. - - """ - # Combine main tables (with an outer join, to gather all entities from all tables). - combined = tb_gcp.copy() - for table in [tb_jones, tb_cait_ghg, tb_cait_ch4, tb_cait_n2o]: - combined = combined.merge(table, on=["country", "year"], how="outer", short_name=paths.short_name) - - # Add secondary tables (with a left join, to keep only entities for which we have emissions data). - for table in [tb_energy, tb_gdp, tb_population]: - combined = combined.merge(table, on=["country", "year"], how="left") - - # Countries-regions dataset does not have a year column, so it has to be merged on country. - combined = combined.merge(tb_regions, on="country", how="left") - - # Check that there was no repetition in column names. - error = "Repeated columns in combined data." - assert len([column for column in set(combined.columns) if "_x" in column]) == 0, error - - # Adjust units. - combined = convert_units(combined) - - return combined - - -def prepare_outputs(combined: Table) -> Table: - """Clean and prepare output table. - - Parameters - ---------- - combined : Table - Combined table. - - Returns - ------- - combined: Table - Cleaned combined table. - - """ -
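# A small illustration (toy column names, not the step's own) of the row-dropping rule used
# just below: with a `subset` and `how="all"`, a row is dropped only when all of the subset
# columns are NaN, so rows that carry population or gdp but no actual data are removed.
import numpy as np
import pandas as pd

toy = pd.DataFrame({"country": ["A", "B"], "year": [2000, 2000], "co2": [1.0, np.nan], "population": [5, 6]})
toy = toy.dropna(subset=["co2"], how="all")  # drops row B: its only data column is NaN

- # Remove rows that only have nan (ignoring whether country, year, iso_code, population and gdp have data).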
- columns_that_must_have_data = [ - column for column in combined.columns if column not in ["country", "year", "iso_code", "population", "gdp"] - ] - combined = combined.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) - - # Sanity check. - columns_with_inf = [column for column in combined.columns if len(combined[combined[column] == np.inf]) > 0] - assert len(columns_with_inf) == 0, f"Infinity values detected in columns: {columns_with_inf}" - - # Set index and sort conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index() - - return combined - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load the global carbon budget dataset from the Global Carbon Project (GCP). - ds_gcp = paths.load_dataset("global_carbon_budget") - - # Load the Jones et al. (2023) dataset on national contributions to climate change. - ds_jones = paths.load_dataset("national_contributions") - - # Load the greenhouse gas emissions by sector dataset by CAIT. - ds_cait = paths.load_dataset("ghg_emissions_by_sector") - - # Load the GDP dataset by GGDC Maddison. - ds_gdp = paths.load_dataset("ggdc_maddison") - - # Load primary energy consumption dataset (by different sources in our 'energy' namespace). - ds_energy = paths.load_dataset("primary_energy_consumption") - - # Load population dataset. - ds_population = paths.load_dataset("population") - - # Load countries-regions dataset (required to get ISO codes). - ds_regions = paths.load_dataset("regions") - - # Gather all required tables from all datasets. - tb_gcp = ds_gcp["global_carbon_budget"] - tb_jones = ds_jones["national_contributions"] - tb_cait_ghg = ds_cait["greenhouse_gas_emissions_by_sector"] - tb_cait_ch4 = ds_cait["methane_emissions_by_sector"] - tb_cait_n2o = ds_cait["nitrous_oxide_emissions_by_sector"] - tb_energy = ds_energy["primary_energy_consumption"] - tb_gdp = ds_gdp["maddison_gdp"] - tb_population = ds_population["population"] - tb_regions = ds_regions["regions"] - - #################################################################################################################### - # TODO: Remove this temporary solution once all indicators of all tables of all dataset have metadata. - def propagate_sources_to_all_indicators(table: Table, ds: Dataset) -> Table: - table = table.copy() - for column in table.columns: - error = f"Column {column} of table {table.metadata.short_name} already has sources or origins. Remove temporary solution for this table." - assert (len(table[column].metadata.sources) == 0) and (len(table[column].metadata.origins) == 0), error - # if (len(table[column].metadata.sources) == 0) and (len(table[column].metadata.origins) == 0): - table[column].metadata.sources = ds.metadata.sources - - return table - - tb_jones = propagate_sources_to_all_indicators(table=tb_jones, ds=ds_jones) - tb_cait_ghg = propagate_sources_to_all_indicators(table=tb_cait_ghg, ds=ds_cait) - tb_cait_ch4 = propagate_sources_to_all_indicators(table=tb_cait_ch4, ds=ds_cait) - tb_cait_n2o = propagate_sources_to_all_indicators(table=tb_cait_n2o, ds=ds_cait) - tb_regions = propagate_sources_to_all_indicators(table=tb_regions, ds=ds_regions) - #################################################################################################################### - - # - # Process data. - # - # Choose required columns and rename them. 
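# Unlike earlier versions of this step, the renames below pass `errors="raise"`, so a stale
# column mapping fails fast instead of being silently skipped. A minimal sketch of this
# pandas behaviour (toy names):
import pandas as pd

df = pd.DataFrame({"a": [1]})
df = df.rename(columns={"a": "b"}, errors="raise")  # fine: "a" exists
try:
    df.rename(columns={"missing": "c"}, errors="raise")
except KeyError:
    pass  # raised because "missing" is not a column; the default errors="ignore" would skip it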
- tb_gcp = tb_gcp.reset_index()[list(GCP_COLUMNS)].rename(columns=GCP_COLUMNS, errors="raise") - tb_jones = tb_jones.reset_index()[list(JONES_COLUMNS)].rename(columns=JONES_COLUMNS, errors="raise") - tb_cait_ghg = tb_cait_ghg.reset_index()[list(CAIT_GHG_COLUMNS)].rename(columns=CAIT_GHG_COLUMNS, errors="raise") - tb_cait_ch4 = tb_cait_ch4.reset_index()[list(CAIT_CH4_COLUMNS)].rename(columns=CAIT_CH4_COLUMNS, errors="raise") - tb_cait_n2o = tb_cait_n2o.reset_index()[list(CAIT_N2O_COLUMNS)].rename(columns=CAIT_N2O_COLUMNS, errors="raise") - tb_energy = tb_energy.reset_index()[list(PRIMARY_ENERGY_COLUMNS)].rename( - columns=PRIMARY_ENERGY_COLUMNS, errors="raise" - ) - tb_gdp = tb_gdp.reset_index()[list(GDP_COLUMNS)].rename(columns=GDP_COLUMNS, errors="raise") - tb_population = tb_population.reset_index()[list(POPULATION_COLUMNS)].rename( - columns=POPULATION_COLUMNS, errors="raise" - ) - tb_regions = tb_regions.reset_index()[list(REGIONS_COLUMNS)].rename(columns=REGIONS_COLUMNS, errors="raise") - - # Combine tables. - combined = combine_tables( - tb_gcp=tb_gcp, - tb_jones=tb_jones, - tb_cait_ghg=tb_cait_ghg, - tb_cait_ch4=tb_cait_ch4, - tb_cait_n2o=tb_cait_n2o, - tb_energy=tb_energy, - tb_gdp=tb_gdp, - tb_population=tb_population, - tb_regions=tb_regions, - ) - - # Prepare outputs. - combined = prepare_outputs(combined=combined) - - # - # Save outputs. - # - # TODO: Change check_variables_metadata to True once all indicators have origins (for now, they have a mixture of - # origins and sources). - ds_garden = create_dataset(dest_dir, tables=[combined], check_variables_metadata=False) - ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2022-07-20/fossil_fuel_production.meta.yml b/etl/steps/archive/garden/energy/2022-07-20/fossil_fuel_production.meta.yml deleted file mode 100644 index e552be31878..00000000000 --- a/etl/steps/archive/garden/energy/2022-07-20/fossil_fuel_production.meta.yml +++ /dev/null @@ -1,174 +0,0 @@ -dataset: - namespace: energy - version: 2022-07-20 - title: Fossil fuel production (BP & Shift, 2022) - short_name: fossil_fuel_production - description: >- - This dataset on fossil fuel production is generated by combining the latest data from [the BP Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html) - and [The Shift Dataportal](https://www.theshiftdataportal.org/energy). - - - BP provide fossil fuel production data from 1965 onwards (and crude prices from 1861 onwards). The Shift Dataportal provides long-term data from 1900, but only extends to 2016. - - - To maintain consistency with the energy datasets on Our World in Data, we give preference to BP data – meaning that if BP provides data for a given country and year, it is used. Where data is not available from BP for a given country, or for years before 1965, we rely on data from Shift. - - - We have converted primary production in exajoules to terawatt-hours using the conversion factor: 1,000,000 / 3,600 ~ 278. - - Production per capita has been calculated using a population dataset that is built and maintained by Our World in Data, based on [different sources](https://ourworldindata.org/population-sources). 
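(As a check on the stated factor: 1 EJ = 10^18 J and 1 TWh = 3.6 × 10^15 J, so 1 EJ = 10^18 / (3.6 × 10^15) TWh = 1,000,000 / 3,600 TWh ≈ 277.8 TWh.)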
- sources: - - name: Our World in Data based on BP Statistical Review of World Energy (2022) - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - description: >- - BP's region definitions sometimes differ from Our World in Data's definitions. For example, BP's North America includes only Canada, Mexico and United States, whereas Our World in Data's North America - includes countries in Central America (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions - like "North America (BP)" to refer to BP's original data using their definition of the region, as well as "North America", which is data aggregated by Our World in Data using our definition. These - aggregates are constructed by adding up (when possible) the contributions from the countries in the region. - - - [BP's region definitions](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy/using-the-review/definitions-and-explanatory-notes.html#accordion_Regional%20definitions), - denoted with "(BP)", are: - - * "Asia Pacific (BP)": Brunei, Cambodia, China (Mainland), China Hong Kong SAR (Special Administrative Region), China Macau SAR (Special Administrative Region), Indonesia, Japan, Laos, Malaysia, Mongolia, North Korea, - Philippines, Singapore, South Asia (Afghanistan, Bangladesh, India, Myanmar, Nepal, Pakistan and Sri Lanka), South Korea, Taiwan, Thailand, Vietnam, Australia, New Zealand, Papua New Guinea and Oceania. - - * "Australasia (BP)": Australia, New Zealand. - - * "CIS (BP)" - Commonwealth of Independent States: Armenia, Azerbaijan, Belarus, Kazakhstan, Kyrgyzstan, Moldova, Russian Federation, Tajikistan, Turkmenistan, Uzbekistan. - - * "Caribbean (BP)": Atlantic islands between the US Gulf Coast and South America, including Puerto Rico, US Virgin Islands and Bermuda. - - * "Central America (BP)": Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama - - * "Eastern Africa (BP)": Territories on the east coast of Africa from Sudan to Mozambique. Also Madagascar, Malawi, Uganda, Zambia, Zimbabwe. - - * "Europe (BP)": European members of the OECD plus Albania, Bosnia-Herzegovina, Bulgaria, Croatia, Cyprus, Georgia, Gibraltar, Latvia, Lithuania, Malta, Montenegro, North Macedonia, Romania, Serbia and Ukraine. - - * "Middle Africa (BP)": Angola, Cameroon, Central African Republic, Chad, Democratic Republic of Congo, Republic of Congo, Equatorial Guinea, Gabon, Sao Tome & Principe. - - * "Middle East (BP)": Arabian Peninsula, Iran, Iraq, Israel, Jordan, Lebanon, Syria. - - * "Non-OECD (BP)" - Organization for Economic Co-operation and Development: All countries that are not members of the OECD. - - * "North America (BP)": US (excluding US territories), Canada, Mexico - - * "Northern Africa (BP)": Territories on the north coast of Africa from Egypt to Western Sahara. - - * "OECD (BP)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, - Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, UK, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, US. 
- - * "OPEC (BP)" - Organization of the Petroleum Exporting Countries: Iran, Iraq, Kuwait, Saudi Arabia, United Arab Emirates, Algeria, Libya, Angola, Equatorial Guinea, Gabon, Nigeria, Republic of Congo, Venezuela. - - * "South and Central America (BP)": Caribbean (including Puerto Rico and US Virgin Islands), Bermuda, Central and South America. - - * "Southern Africa (BP)": Botswana, Lesotho, Namibia, South Africa, Swaziland. - - * "Western Africa (BP)": Territories on the west coast of Africa from Mauritania to Nigeria, including Burkina Faso, Cape Verde, Mali and Niger. - - - Additionally, BP includes some regions that are not explicitly defined (e.g. "Other Europe (BP)", or "Other CIS (BP)"). We define our regions in the following way: - - * "Africa" - All African countries + "Other Africa (BP)". - - * "Asia" - All Asian countries + "Other Middle East (BP)" + "Other CIS (BP)" + "Other Asia Pacific (BP)". - - * "Europe" - All European countries + "Other Europe (BP)". - - * "North America" - All North American countries + "Other Caribbean (BP)" + "Other North America (BP)". - - * "Oceania" - All Oceanian countries. - - * "South America" - All South American countries + "Other South America (BP)". - - Where the individual countries in each region are defined [in this map](https://ourworldindata.org/world-region-map-definitions). Additional BP regions are ignored, since they belong to other regions already included - (e.g. the data for "Other Western Africa (BP)" is included in "Other Africa (BP)"). Finally, income groups are constructed following the definitions - [in this map](https://ourworldindata.org/grapher/world-banks-income-groups). - - name: Our World in Data based on The Shift Dataportal (2022) - published_by: The Shift Dataportal - date_accessed: 2022-07-18 - url: https://www.theshiftdataportal.org/energy -tables: - fossil_fuel_production: - variables: - annual_change_in_coal_production__pct: - title: Annual change in coal production (%) - short_unit: '%' - unit: '%' - display: - name: Annual change in coal production - annual_change_in_coal_production__twh: - title: Annual change in coal production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Annual change in coal production - annual_change_in_gas_production__pct: - title: Annual change in gas production (%) - short_unit: '%' - unit: '%' - display: - name: Annual change in gas production - annual_change_in_gas_production__twh: - title: Annual change in gas production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Annual change in gas production - annual_change_in_oil_production__pct: - title: Annual change in oil production (%) - short_unit: '%' - unit: '%' - display: - name: Annual change in oil production - annual_change_in_oil_production__twh: - title: Annual change in oil production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Annual change in oil production - coal_production__twh: - title: Coal production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal production - numDecimalPlaces: 0 - coal_production_per_capita__kwh: - title: Coal production per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Coal production per capita - numDecimalPlaces: 0 - gas_production__twh: - title: Gas production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas production - numDecimalPlaces: 0 - gas_production_per_capita__kwh: - title: Gas production per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: 
- name: Gas production per capita - numDecimalPlaces: 0 - oil_production__twh: - title: Oil production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Oil production - numDecimalPlaces: 0 - oil_production_per_capita__kwh: - title: Oil production per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Oil production per capita - numDecimalPlaces: 0 diff --git a/etl/steps/archive/garden/energy/2022-07-20/fossil_fuel_production.py b/etl/steps/archive/garden/energy/2022-07-20/fossil_fuel_production.py deleted file mode 100644 index 8802c144abd..00000000000 --- a/etl/steps/archive/garden/energy/2022-07-20/fossil_fuel_production.py +++ /dev/null @@ -1,260 +0,0 @@ -"""Garden step for Fossil fuel production dataset (part of the OWID Energy dataset), based on a combination of BP's -Statistical Review dataset and Shift data on fossil fuel production. - -""" - -import numpy as np -import pandas as pd -from owid import catalog -from owid.catalog.utils import underscore_table -from shared import CURRENT_DIR, add_population -from structlog import get_logger - -from etl.paths import DATA_DIR - -log = get_logger() - -# Namespace and dataset short name for output dataset. -NAMESPACE = "energy" -DATASET_SHORT_NAME = "fossil_fuel_production" -# Metadata file. -METADATA_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.meta.yml" -# Namespace, dataset short name and version for required Shift dataset. -SHIFT_NAMESPACE = "shift" -SHIFT_DATASET_NAME = "fossil_fuel_production" -SHIFT_VERSION = "2022-07-18" -# Namespace, dataset short name and version for required BP dataset (processed Statistical Review from garden). -BP_NAMESPACE = "bp" -BP_DATASET_NAME = "statistical_review" -BP_VERSION = "2022-07-14" - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 - - -def load_bp_data() -> catalog.Table: - """Load BP data from the local catalog, and rename columns conveniently. - - Returns - ------- - bp_table : catalog.Table - BP data as a table with metadata. - - """ - # Load BP Statistical Review dataset. - bp_dataset = catalog.Dataset(DATA_DIR / "garden" / BP_NAMESPACE / BP_VERSION / BP_DATASET_NAME) - - # Get table. - bp_table = bp_dataset[bp_dataset.table_names[0]].reset_index() - bp_columns = { - "country": "country", - "year": "year", - "coal_production__twh": "Coal production (TWh)", - "gas_production__twh": "Gas production (TWh)", - "oil_production__twh": "Oil production (TWh)", - } - bp_table = bp_table[list(bp_columns)].rename(columns=bp_columns) - - return bp_table - - -def load_shift_data() -> catalog.Table: - """Load Shift data from the local catalog, and rename columns conveniently. - - Returns - ------- - shift_table : catalog.Table - Shift data as a table with metadata. - - """ - shift_columns = { - "country": "country", - "year": "year", - "coal": "Coal production (TWh)", - "gas": "Gas production (TWh)", - "oil": "Oil production (TWh)", - } - shift_dataset = catalog.Dataset(DATA_DIR / "garden" / SHIFT_NAMESPACE / SHIFT_VERSION / SHIFT_DATASET_NAME) - shift_table = shift_dataset[shift_dataset.table_names[0]].reset_index() - shift_table = shift_table[list(shift_columns)].rename(columns=shift_columns) - - return shift_table - - -def combine_bp_and_shift_data(bp_table: catalog.Table, shift_table: catalog.Table) -> pd.DataFrame: - """Combine BP and Shift data. - - Parameters - ---------- - bp_table : catalog.Table - Table from BP Statistical Review dataset. - shift_table : catalog.Table - Table from Shift fossil fuel production dataset. 
- - Returns - ------- - combined : pd.DataFrame - Combined data. - - """ - # Check that there are no duplicated rows in any of the two datasets. - assert bp_table[bp_table.duplicated(subset=["country", "year"])].empty, "Duplicated rows in BP data." - assert shift_table[shift_table.duplicated(subset=["country", "year"])].empty, "Duplicated rows in Shift data." - - # Combine Shift data (which goes further back in the past) with BP data (which is more up-to-date). - # On coincident rows, prioritise BP data. - index_columns = ["country", "year"] - data_columns = [col for col in bp_table.columns if col not in index_columns] - # We should not concatenate bp and shift data directly, since there are nans in different places. - # Instead, go column by column, concatenate, remove nans, and then keep the BP version on duplicated rows. - - combined = pd.DataFrame({column: [] for column in index_columns}) - for variable in data_columns: - _shift_data = shift_table[index_columns + [variable]].dropna(subset=variable) - _bp_data = bp_table[index_columns + [variable]].dropna(subset=variable) - _combined = pd.concat([_shift_data, _bp_data], ignore_index=True) # type: ignore - # On rows where both datasets overlap, give priority to BP data. - _combined = _combined.drop_duplicates(subset=index_columns, keep="last") # type: ignore - # Combine data for different variables. - combined = pd.merge(combined, _combined, on=index_columns, how="outer") - - # Sort data appropriately. - combined = combined.sort_values(index_columns).reset_index(drop=True) - - return combined - - -def add_annual_change(df: pd.DataFrame) -> pd.DataFrame: - """Add annual change variables to combined BP & Shift dataset. - - Parameters - ---------- - df : pd.DataFrame - Combined BP & Shift dataset. - - Returns - ------- - combined : pd.DataFrame - Combined BP & Shift dataset after adding annual change variables. - - """ - combined = df.copy() - - # Calculate annual change. - combined = combined.sort_values(["country", "year"]).reset_index(drop=True) - for cat in ("Coal", "Oil", "Gas"): - combined[f"Annual change in {cat.lower()} production (%)"] = ( - combined.groupby("country")[f"{cat} production (TWh)"].pct_change() * 100 - ) - combined[f"Annual change in {cat.lower()} production (TWh)"] = combined.groupby("country")[ - f"{cat} production (TWh)" - ].diff() - - return combined - - -def add_per_capita_variables(df: pd.DataFrame) -> pd.DataFrame: - """Add per-capita variables to combined BP & Shift dataset. - - Parameters - ---------- - df : pd.DataFrame - Combined BP & Shift dataset. - - Returns - ------- - combined : pd.DataFrame - Combined BP & Shift dataset after adding per-capita variables. - - """ - df = df.copy() - - # Add population to data. - combined = add_population( - df=df, - country_col="country", - year_col="year", - population_col="population", - warn_on_missing_countries=False, - ) - - # Calculate production per capita. - for cat in ("Coal", "Oil", "Gas"): - combined[f"{cat} production per capita (kWh)"] = ( - combined[f"{cat} production (TWh)"] / combined["population"] * TWH_TO_KWH - ) - combined = combined.drop(errors="raise", columns=["population"]) - - return combined - - -def remove_spurious_values(df: pd.DataFrame) -> pd.DataFrame: - """Remove spurious infinity values. - - These values are generated when calculating the annual change of a variable that is zero or nan the previous year. - - Parameters - ---------- - df : pd.DataFrame - Data that may contain infinity values. 
- - Returns - ------- - df : pd.DataFrame - Corrected data. - - """ - for column in df.columns: - issues_mask = df[column] == np.inf - issues = df[issues_mask] - if len(issues) > 0: - df.loc[issues_mask, column] = np.nan - - return df - - -def run(dest_dir: str) -> None: - log.info(f"{DATASET_SHORT_NAME}.start") - - # - # Load data. - # - # Load BP statistical review dataset. - bp_table = load_bp_data() - - # Load Shift data on fossil fuel production. - shift_table = load_shift_data() - - # - # Process data. - # - # Combine BP and Shift data. - df = combine_bp_and_shift_data(bp_table=bp_table, shift_table=shift_table) - - # Add annual change. - df = add_annual_change(df=df) - - # Add per-capita variables. - df = add_per_capita_variables(df=df) - - # Remove spurious values. - df = remove_spurious_values(df=df) - - # - # Save outputs. - # - # Initialize new garden dataset. - dataset = catalog.Dataset.create_empty(dest_dir) - # Add metadata to dataset. - dataset.metadata.update_from_yaml(METADATA_PATH, if_source_exists="replace") - # Create new dataset in garden. - dataset.save() - - # Create new table and add it to new dataset. - tb_garden = underscore_table(catalog.Table(df)) - tb_garden = tb_garden.set_index(["country", "year"]) - tb_garden.update_metadata_from_yaml(METADATA_PATH, DATASET_SHORT_NAME) - dataset.add(tb_garden) - - log.info(f"{DATASET_SHORT_NAME}.end") diff --git a/etl/steps/archive/garden/energy/2022-07-20/shared.py b/etl/steps/archive/garden/energy/2022-07-20/shared.py deleted file mode 100644 index 76a75ad8a7e..00000000000 --- a/etl/steps/archive/garden/energy/2022-07-20/shared.py +++ /dev/null @@ -1,178 +0,0 @@ -from pathlib import Path -from typing import cast - -import pandas as pd -from owid import catalog - -from etl.data_helpers import geo -from etl.paths import DATA_DIR - -CURRENT_DIR = Path(__file__).parent -VERSION = CURRENT_DIR.name - - -# When creating region aggregates, decide how to distribute historical regions. -# The following decisions are based on the current location of the countries that succeeded the region, and their income -# group. Continent and income group assigned corresponds to the continent and income group of the majority of the -# population in the member countries. -HISTORIC_TO_CURRENT_REGION = { - "Czechoslovakia": { - "continent": "Europe", - "income_group": "High-income countries", - "members": [ - # Europe - High-income countries. - "Czechia", - "Slovakia", - ], - }, - "Netherlands Antilles": { - "continent": "North America", - "income_group": "High-income countries", - "members": [ - # North America - High-income countries. - "Aruba", - "Curacao", - "Sint Maarten (Dutch part)", - ], - }, - "USSR": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "members": [ - # Europe - High-income countries. - "Lithuania", - "Estonia", - "Latvia", - # Europe - Upper-middle-income countries. - "Moldova", - "Belarus", - "Russia", - # Europe - Lower-middle-income countries. - "Ukraine", - # Asia - Upper-middle-income countries. - "Georgia", - "Armenia", - "Azerbaijan", - "Turkmenistan", - "Kazakhstan", - # Asia - Lower-middle-income countries. - "Kyrgyzstan", - "Uzbekistan", - "Tajikistan", - ], - }, - "Yugoslavia": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "members": [ - # Europe - High-income countries. - "Croatia", - "Slovenia", - # Europe - Upper-middle-income countries. 
- "North Macedonia", - "Bosnia and Herzegovina", - "Serbia", - "Montenegro", - ], - }, -} - - -def load_population() -> pd.DataFrame: - """Load OWID population dataset, and add historical regions to it. - - Returns - ------- - population : pd.DataFrame - Population dataset. - - """ - # Load population dataset. - population = catalog.Dataset(DATA_DIR / "garden/owid/latest/key_indicators/")["population"].reset_index()[ - ["country", "year", "population"] - ] - - # Add data for historical regions (if not in population) by adding the population of its current successors. - countries_with_population = population["country"].unique() - missing_countries = [country for country in HISTORIC_TO_CURRENT_REGION if country not in countries_with_population] - for country in missing_countries: - members = HISTORIC_TO_CURRENT_REGION[country]["members"] - _population = ( - population[population["country"].isin(members)] - .groupby("year") - .agg({"population": "sum", "country": "nunique"}) - .reset_index() - ) - # Select only years for which we have data for all member countries. - _population = _population[_population["country"] == len(members)].reset_index(drop=True) - _population["country"] = country - population = pd.concat([population, _population], ignore_index=True).reset_index(drop=True) - - error = "Duplicate country-years found in population. Check if historical regions changed." - assert population[population.duplicated(subset=["country", "year"])].empty, error - - return cast(pd.DataFrame, population) - - -def add_population( - df: pd.DataFrame, - country_col: str = "country", - year_col: str = "year", - population_col: str = "population", - warn_on_missing_countries: bool = True, - show_full_warning: bool = True, -) -> pd.DataFrame: - """Add a column of OWID population to the countries in the data, including population of historical regions. - - This function has been adapted from datautils.geo, because population currently does not include historic regions. - We include them in this function. - - Parameters - ---------- - df : pd.DataFrame - Data without a column for population (after harmonizing elements, items and country names). - country_col : str - Name of country column in data. - year_col : str - Name of year column in data. - population_col : str - Name for new population column in data. - warn_on_missing_countries : bool - True to warn if population is not found for any of the countries in the data. - show_full_warning : bool - True to show affected countries if the previous warning is raised. - - Returns - ------- - df_with_population : pd.DataFrame - Data after adding a column for population for all countries in the data. - - """ - - # Load population dataset. - population = load_population().rename( - columns={ - "country": country_col, - "year": year_col, - "population": population_col, - } - )[[country_col, year_col, population_col]] - - # Check if there is any missing country. - missing_countries = set(df[country_col]) - set(population[country_col]) - if len(missing_countries) > 0: - if warn_on_missing_countries: - geo.warn_on_list_of_entities( - list_of_entities=missing_countries, - warning_message=( - f"{len(missing_countries)} countries not found in population" - " dataset. They will remain in the dataset, but have nan" - " population." - ), - show_list=show_full_warning, - ) - - # Add population to original dataframe. 
- df_with_population = pd.merge(df, population, on=[country_col, year_col], how="left") - - return df_with_population diff --git a/etl/steps/archive/garden/energy/2022-07-29/primary_energy_consumption.meta.yml b/etl/steps/archive/garden/energy/2022-07-29/primary_energy_consumption.meta.yml deleted file mode 100644 index cdee70489d7..00000000000 --- a/etl/steps/archive/garden/energy/2022-07-29/primary_energy_consumption.meta.yml +++ /dev/null @@ -1,163 +0,0 @@ -dataset: - namespace: energy - version: 2022-07-29 - title: Primary energy consumption (BP & EIA, 2022) - short_name: primary_energy_consumption - description: >- - Primary energy consumption data was compiled by Our World in Data based on two key data sources: - - 1. [BP Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html). - - 2. [International energy data from the U.S. Energy Information Administration (EIA)](https://www.eia.gov/international/data/world/total-energy/more-total-energy-data). - - - BP provides the longest and most up-to-date time-series of primary energy. However, it does not provide data for all countries. We have therefore supplemented this dataset - with energy data from the EIA. Where BP provides data for a given country, this data is adopted; for countries where this data is missing, we rely on EIA energy figures. - - - Per capita figures have been calculated using a population dataset that is built and maintained by Our World in Data, based on [different sources](https://ourworldindata.org/population-sources). - - - To calculate energy per unit of GDP, we use total real GDP figures from [the Maddison Project Database, version 2020](https://www.rug.nl/ggdc/historicaldevelopment/maddison/releases/maddison-project-database-2020). - - This dataset is based on Bolt, Jutta and Jan Luiten van Zanden (2020), “Maddison style estimates of the evolution of the world economy. A new 2020 update”. GDP is measured in 2011$, which are PPP-adjusted. - sources: - - name: Our World in Data based on BP Statistical Review of World Energy (2022) - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - description: >- - BP's region definitions sometimes differ from Our World in Data's definitions. For example, BP's North America includes only Canada, Mexico and United States, - whereas Our World in Data's North America includes countries in Central America (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). - For this reason, we include in the dataset regions like "North America (BP)" to refer to BP's original data using their definition of the region, as well as "North America", which is data - aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the contributions from the countries in the region.
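As a rough illustration of how such a region aggregate can be computed, here is a minimal sketch (not the ETL's actual implementation; the member list and data values below are hypothetical):

import pandas as pd

# Hypothetical tidy frame with one row per country-year.
df = pd.DataFrame(
    {
        "country": ["France", "Germany", "Other Europe (BP)"],
        "year": [2020, 2020, 2020],
        "primary_energy_consumption__twh": [2300.0, 3600.0, 500.0],
    }
)

# Hypothetical member list; the real definitions live in OWID's region mapping.
members = ["France", "Germany", "Other Europe (BP)"]

aggregate = (
    df[df["country"].isin(members)]
    .groupby("year", as_index=False)
    # min_count=1 keeps the aggregate as NaN when no member reports data.
    .sum(numeric_only=True, min_count=1)
    .assign(country="Europe")
)
df = pd.concat([df, aggregate], ignore_index=True)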
- - - [BP's region definitions](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy/using-the-review/definitions-and-explanatory-notes.html#accordion_Regional%20definitions), - denoted with "(BP)", are: - - * "Asia Pacific (BP)": Brunei, Cambodia, China (Mainland), China Hong Kong SAR (Special Administrative Region), China Macau SAR (Special Administrative Region), Indonesia, Japan, Laos, Malaysia, - Mongolia, North Korea, Philippines, Singapore, South Asia (Afghanistan, Bangladesh, India, Myanmar, Nepal, Pakistan and Sri Lanka), South Korea, Taiwan, Thailand, Vietnam, Australia, New Zealand, - Papua New Guinea and Oceania. - - * "Australasia (BP)": Australia, New Zealand. - - * "CIS (BP)" - Commonwealth of Independent States: Armenia, Azerbaijan, Belarus, Kazakhstan, Kyrgyzstan, Moldova, Russian Federation, Tajikistan, Turkmenistan, Uzbekistan. - - * "Caribbean (BP)": Atlantic islands between the US Gulf Coast and South America, including Puerto Rico, US Virgin Islands and Bermuda. - - * "Central America (BP)": Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama. - - * "Eastern Africa (BP)": Territories on the east coast of Africa from Sudan to Mozambique. Also Madagascar, Malawi, Uganda, Zambia, Zimbabwe. - - * "Europe (BP)": European members of the OECD plus Albania, Bosnia-Herzegovina, Bulgaria, Croatia, Cyprus, Georgia, Gibraltar, Latvia, Lithuania, Malta, Montenegro, North Macedonia, Romania, Serbia and Ukraine. - - * "Middle Africa (BP)": Angola, Cameroon, Central African Republic, Chad, Democratic Republic of Congo, Republic of Congo, Equatorial Guinea, Gabon, Sao Tome & Principe. - - * "Middle East (BP)": Arabian Peninsula, Iran, Iraq, Israel, Jordan, Lebanon, Syria. - - * "Non-OECD (BP)" - Organization for Economic Co-operation and Development: All countries that are not members of the OECD. - - * "North America (BP)": US (excluding US territories), Canada, Mexico. - - * "Northern Africa (BP)": Territories on the north coast of Africa from Egypt to Western Sahara. - - * "OECD (BP)" - Organization for Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, - Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, UK, Australia, Canada, Chile, Colombia, Israel, - Japan, Mexico, New Zealand, South Korea, US. - - * "OPEC (BP)" - Organization of the Petroleum Exporting Countries: Iran, Iraq, Kuwait, Saudi Arabia, United Arab Emirates, Algeria, Libya, Angola, Equatorial Guinea, Gabon, Nigeria, Republic of Congo, Venezuela. - - * "South and Central America (BP)": Caribbean (including Puerto Rico and US Virgin Islands), Bermuda, Central and South America. - - * "Southern Africa (BP)": Botswana, Lesotho, Namibia, South Africa, Swaziland. - - * "Western Africa (BP)": Territories on the west coast of Africa from Mauritania to Nigeria, including Burkina Faso, Cape Verde, Mali and Niger. - - - Additionally, BP includes some regions that are not explicitly defined (e.g. "Other Europe (BP)", or "Other CIS (BP)"). We define our regions in the following way: - - * "Africa" - All African countries + "Other Africa (BP)". - - * "Asia" - All Asian countries + "Other Middle East (BP)" + "Other CIS (BP)" + "Other Asia Pacific (BP)". - - * "Europe" - All European countries + "Other Europe (BP)".
- - * "North America" - All North American countries + "Other Caribbean (BP)" + "Other North America (BP)". - - * "Oceania" - All Oceanian countries. - - * "South America" - All South American countries + "Other South America (BP)". - - Where the individual countries in each region are defined [in this map](https://ourworldindata.org/world-region-map-definitions). - Additional BP regions are ignored, since they belong to other regions already included (e.g. the data for "Other Western Africa (BP)" is included - in "Other Africa (BP)"). Finally, income groups are constructed following the definitions [in this map](https://ourworldindata.org/grapher/world-banks-income-groups). - - name: Our World in Data based on EIA International energy data (2022) - published_by: U.S. Energy Information Administration (EIA) - date_accessed: 2022-07-27 - url: https://www.eia.gov/opendata/bulkfiles.php - description: | - Total energy consumption, extracted from EIA's international energy data from the EIA, downloaded using their [Bulk Download Facility](https://www.eia.gov/opendata/bulkfiles.php). - - EIA's region definitions sometimes differ from Our World in Data's definitions. For example, in EIA's data, Russia is not included in Europe, whereas Our World in Data includes Russia in Europe (see a map with - [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like "Europe (EIA)" to refer to EIA's original data - using their definition of the region, as well as "Europe", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the - contributions from the countries in the region. - - name: Maddison Project Database 2020 (Bolt and van Zanden, 2020) - published_by: "Bolt, Jutta and Jan Luiten van Zanden (2020), 'Maddison style estimates of the evolution of the world economy. A new 2020 update'." - date_accessed: 2022-04-12 - url: https://www.rug.nl/ggdc/historicaldevelopment/maddison/releases/maddison-project-database-2020 -tables: - primary_energy_consumption: - variables: - annual_change_in_primary_energy_consumption__pct: - title: Annual change in primary energy consumption (%) - short_unit: '%' - unit: '%' - display: - name: Annual change in primary energy consumption - annual_change_in_primary_energy_consumption__twh: - title: Annual change in primary energy consumption (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Annual change in primary energy consumption - gdp: - title: GDP - short_unit: $ - unit: 2011 int-$ - description: >- - Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over - time (inflation) and price differences between countries. Calculated by multiplying GDP per capita with population. - display: - numDecimalPlaces: 0 - population: - title: Population - unit: people - primary_energy_consumption__twh: - title: Primary energy consumption (TWh) - short_unit: TWh - unit: terawatt-hours - description: Primary energy consumption, measured in terawatt-hours per year. - display: - name: Primary energy consumption - numDecimalPlaces: 0 - primary_energy_consumption_per_gdp__kwh_per_dollar: - title: Primary energy consumption per GDP (kWh/$) - short_unit: kWh - unit: kilowatt-hours per $ - description: Primary energy consumption per unit of gross domestic product, measured in kilowatt-hours per international-$. 
- display: - name: Energy consumption per dollar - primary_energy_consumption_per_capita__kwh: - title: Primary energy consumption per capita (kWh/person) - short_unit: kWh - unit: kilowatt-hours per capita - description: Primary energy consumption per capita, measured in kilowatt-hours per person per year. - display: - name: Per capita energy consumption - numDecimalPlaces: 0 - source: - title: Source of data - short_unit: source - unit: source diff --git a/etl/steps/archive/garden/energy/2022-07-29/primary_energy_consumption.py b/etl/steps/archive/garden/energy/2022-07-29/primary_energy_consumption.py deleted file mode 100644 index 0b212de8f31..00000000000 --- a/etl/steps/archive/garden/energy/2022-07-29/primary_energy_consumption.py +++ /dev/null @@ -1,320 +0,0 @@ -"""Garden step for Primary energy consumption dataset (part of the OWID Energy dataset), based on a combination of BP's -Statistical Review dataset and EIA data on energy consumption. - -""" - -from typing import cast - -import numpy as np -import pandas as pd -from owid import catalog -from owid.catalog.utils import underscore_table -from shared import CURRENT_DIR, add_population -from structlog import get_logger - -from etl.paths import DATA_DIR - -log = get_logger() - -# Namespace and dataset short name for output dataset. -NAMESPACE = "energy" -DATASET_SHORT_NAME = "primary_energy_consumption" -# Metadata file. -METADATA_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.meta.yml" -# Path to EIA energy consumption dataset. -EIA_DATASET_PATH = DATA_DIR / "garden" / "eia" / "2022-07-27" / "energy_consumption" -# Path to BP statistical review dataset. -BP_DATASET_PATH = DATA_DIR / "garden" / "bp" / "2022-07-14" / "statistical_review" -# Path to GGDC Maddison 2020 GDP dataset. -GGDC_DATASET_PATH = DATA_DIR / "garden" / "ggdc" / "2020-10-01" / "ggdc_maddison" - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 - -# Countries whose data have to be removed since they were identified as outliers. -OUTLIERS = ["Gibraltar"] - - -def load_bp_data() -> catalog.Table: - """Load BP data from the local catalog, and rename columns conveniently. - - Returns - ------- - bp_table : catalog.Table - BP data as a table with metadata. - - """ - # Load BP Statistical Review dataset. - bp_dataset = catalog.Dataset(BP_DATASET_PATH) - - # Get table. - bp_table = bp_dataset[bp_dataset.table_names[0]].reset_index() - bp_columns = { - "country": "country", - "year": "year", - "primary_energy_consumption__twh": "Primary energy consumption (TWh)", - } - bp_table = bp_table[list(bp_columns)].rename(columns=bp_columns) - - # Drop rows with missing values. - bp_table = bp_table.dropna(how="any").reset_index(drop=True) - - return cast(catalog.Table, bp_table) - - -def load_eia_data() -> catalog.Table: - """Load EIA data from the local catalog, and rename columns conveniently. - - Returns - ------- - eia_table : catalog.Table - EIA data as a table with metadata. - - """ - # Load EIA energy consumption dataset. - eia_dataset = catalog.Dataset(EIA_DATASET_PATH) - - # Get table. - eia_table = eia_dataset[eia_dataset.table_names[0]].reset_index() - eia_columns = { - "country": "country", - "year": "year", - "energy_consumption": "Primary energy consumption (TWh)", - } - eia_table = eia_table[list(eia_columns)].rename(columns=eia_columns) - - # Drop rows with missing values. 
- eia_table = eia_table.dropna(how="any").reset_index(drop=True) - - return cast(catalog.Table, eia_table) - - -def load_ggdc_data() -> catalog.Table: - """Load GGDC data on GDP from the local catalog, and rename columns conveniently. - - Returns - ------- - ggdc_table : catalog.Table - GGDC data as a table with metadata. - - """ - # Load GGDC Maddison 2020 dataset on GDP. - ggdc_dataset = catalog.Dataset(GGDC_DATASET_PATH) - - # Get table. - ggdc_table = ggdc_dataset[ggdc_dataset.table_names[0]].reset_index() - ggdc_columns = { - "country": "country", - "year": "year", - "gdp": "GDP", - } - ggdc_table = ggdc_table[list(ggdc_columns)].rename(columns=ggdc_columns) - - # Drop rows with missing values. - ggdc_table = ggdc_table.dropna(how="any").reset_index(drop=True) - - return cast(catalog.Table, ggdc_table) - - -def combine_bp_and_eia_data(bp_table: catalog.Table, eia_table: catalog.Table) -> pd.DataFrame: - """Combine BP and EIA data. - - Parameters - ---------- - bp_table : catalog.Table - Table from BP Statistical Review dataset. - eia_table : catalog.Table - Table from EIA energy consumption dataset. - - Returns - ------- - combined : pd.DataFrame - Combined data. - - """ - # Check that there are no duplicated rows in any of the two datasets. - assert bp_table[bp_table.duplicated(subset=["country", "year"])].empty, "Duplicated rows in BP data." - assert eia_table[eia_table.duplicated(subset=["country", "year"])].empty, "Duplicated rows in EIA data." - - bp_table["source"] = "bp" - eia_table["source"] = "eia" - # Combine EIA data (which goes further back in the past) with BP data (which is more up-to-date). - # On coincident rows, prioritise BP data. - index_columns = ["country", "year"] - combined = cast(pd.DataFrame, pd.concat([eia_table, bp_table], ignore_index=True)).drop_duplicates( - subset=index_columns, keep="last" - ) - - # Convert to conventional dataframe, and sort conveniently. - combined = pd.DataFrame(combined).sort_values(index_columns).reset_index(drop=True) - - return cast(pd.DataFrame, combined) - - -def add_annual_change(df: pd.DataFrame) -> pd.DataFrame: - """Add annual change variables to combined BP & EIA dataset. - - Parameters - ---------- - df : pd.DataFrame - Combined BP & EIA dataset. - - Returns - ------- - combined : pd.DataFrame - Combined BP & EIA dataset after adding annual change variables. - - """ - combined = df.copy() - - # Calculate annual change. - combined = combined.sort_values(["country", "year"]).reset_index(drop=True) - combined["Annual change in primary energy consumption (%)"] = ( - combined.groupby("country")["Primary energy consumption (TWh)"].pct_change() * 100 - ) - combined["Annual change in primary energy consumption (TWh)"] = combined.groupby("country")[ - "Primary energy consumption (TWh)" - ].diff() - - return combined - - -def add_per_capita_variables(df: pd.DataFrame) -> pd.DataFrame: - """Add a population column and add per-capita variables. - - Parameters - ---------- - df : pd.DataFrame - Data. - - Returns - ------- - df : pd.DataFrame - Data after adding population and per-capita variables. - - """ - df = df.copy() - - # Add population to data. - df = add_population( - df=df, - country_col="country", - year_col="year", - population_col="Population", - warn_on_missing_countries=False, - ) - - # Calculate consumption per capita. 
- df["Primary energy consumption per capita (kWh)"] = ( - df["Primary energy consumption (TWh)"] / df["Population"] * TWH_TO_KWH - ) - - return df - - -def add_per_gdp_variables(df: pd.DataFrame, ggdc_table: catalog.Table) -> pd.DataFrame: - """Add a GDP column and add per-gdp variables. - - Parameters - ---------- - df : pd.DataFrame - Data. - ggdc_table : catalog.Table - GDP data from the GGDC Maddison dataset. - - Returns - ------- - df : pd.DataFrame - Data after adding GDP and per-gdp variables. - - """ - df = df.copy() - - # Add population to data. - df = pd.merge(df, ggdc_table, on=["country", "year"], how="left") - - # Calculate consumption per GDP. - df["Primary energy consumption per GDP (kWh per $)"] = ( - df["Primary energy consumption (TWh)"] / df["GDP"] * TWH_TO_KWH - ) - - return df - - -def remove_outliers(df: pd.DataFrame) -> pd.DataFrame: - """Remove infinity values and data that has been identified as spurious outliers. - - Parameters - ---------- - df : pd.DataFrame - Data. - - Returns - ------- - df : pd.DataFrame - Data after removing spurious data. - - """ - df = df.copy() - - # Remove spurious values. - df = df.replace(np.inf, np.nan) - - # Remove indexes of outliers from data. - df = df[~df["country"].isin(OUTLIERS)].reset_index(drop=True) - - return df - - -def run(dest_dir: str) -> None: - log.info(f"{DATASET_SHORT_NAME}.start") - - # - # Load data. - # - # Load BP statistical review dataset. - bp_table = load_bp_data() - - # Load EIA data on energy_consumption. - eia_table = load_eia_data() - - # Load GGDC Maddison data on GDP. - ggdc_table = load_ggdc_data() - - # - # Process data. - # - # Combine BP and EIA data. - df = combine_bp_and_eia_data(bp_table=bp_table, eia_table=eia_table) - - # Add annual change. - df = add_annual_change(df=df) - - # Add per-capita variables. - df = add_per_capita_variables(df=df) - - # Add per-GDP variables. - df = add_per_gdp_variables(df=df, ggdc_table=ggdc_table) - - # Remove outliers. - df = remove_outliers(df=df) - - # - # Save outputs. - # - # Initialize new garden dataset. - dataset = catalog.Dataset.create_empty(dest_dir) - # Add metadata to dataset. - dataset.metadata.update_from_yaml(METADATA_PATH, if_source_exists="replace") - # Create new dataset in garden. - dataset.save() - - # Create new table and add it to new dataset. - tb_garden = underscore_table(catalog.Table(df)) - - tb_garden = tb_garden.set_index(["country", "year"]) - tb_garden.update_metadata_from_yaml(METADATA_PATH, DATASET_SHORT_NAME) - dataset.add(tb_garden) - - log.info(f"{DATASET_SHORT_NAME}.end") diff --git a/etl/steps/archive/garden/energy/2022-07-29/shared.py b/etl/steps/archive/garden/energy/2022-07-29/shared.py deleted file mode 100644 index 76a75ad8a7e..00000000000 --- a/etl/steps/archive/garden/energy/2022-07-29/shared.py +++ /dev/null @@ -1,178 +0,0 @@ -from pathlib import Path -from typing import cast - -import pandas as pd -from owid import catalog - -from etl.data_helpers import geo -from etl.paths import DATA_DIR - -CURRENT_DIR = Path(__file__).parent -VERSION = CURRENT_DIR.name - - -# When creating region aggregates, decide how to distribute historical regions. -# The following decisions are based on the current location of the countries that succeeded the region, and their income -# group. Continent and income group assigned corresponds to the continent and income group of the majority of the -# population in the member countries. 
-HISTORIC_TO_CURRENT_REGION = { - "Czechoslovakia": { - "continent": "Europe", - "income_group": "High-income countries", - "members": [ - # Europe - High-income countries. - "Czechia", - "Slovakia", - ], - }, - "Netherlands Antilles": { - "continent": "North America", - "income_group": "High-income countries", - "members": [ - # North America - High-income countries. - "Aruba", - "Curacao", - "Sint Maarten (Dutch part)", - ], - }, - "USSR": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "members": [ - # Europe - High-income countries. - "Lithuania", - "Estonia", - "Latvia", - # Europe - Upper-middle-income countries. - "Moldova", - "Belarus", - "Russia", - # Europe - Lower-middle-income countries. - "Ukraine", - # Asia - Upper-middle-income countries. - "Georgia", - "Armenia", - "Azerbaijan", - "Turkmenistan", - "Kazakhstan", - # Asia - Lower-middle-income countries. - "Kyrgyzstan", - "Uzbekistan", - "Tajikistan", - ], - }, - "Yugoslavia": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "members": [ - # Europe - High-income countries. - "Croatia", - "Slovenia", - # Europe - Upper-middle-income countries. - "North Macedonia", - "Bosnia and Herzegovina", - "Serbia", - "Montenegro", - ], - }, -} - - -def load_population() -> pd.DataFrame: - """Load OWID population dataset, and add historical regions to it. - - Returns - ------- - population : pd.DataFrame - Population dataset. - - """ - # Load population dataset. - population = catalog.Dataset(DATA_DIR / "garden/owid/latest/key_indicators/")["population"].reset_index()[ - ["country", "year", "population"] - ] - - # Add data for historical regions (if not in population) by adding the population of its current successors. - countries_with_population = population["country"].unique() - missing_countries = [country for country in HISTORIC_TO_CURRENT_REGION if country not in countries_with_population] - for country in missing_countries: - members = HISTORIC_TO_CURRENT_REGION[country]["members"] - _population = ( - population[population["country"].isin(members)] - .groupby("year") - .agg({"population": "sum", "country": "nunique"}) - .reset_index() - ) - # Select only years for which we have data for all member countries. - _population = _population[_population["country"] == len(members)].reset_index(drop=True) - _population["country"] = country - population = pd.concat([population, _population], ignore_index=True).reset_index(drop=True) - - error = "Duplicate country-years found in population. Check if historical regions changed." - assert population[population.duplicated(subset=["country", "year"])].empty, error - - return cast(pd.DataFrame, population) - - -def add_population( - df: pd.DataFrame, - country_col: str = "country", - year_col: str = "year", - population_col: str = "population", - warn_on_missing_countries: bool = True, - show_full_warning: bool = True, -) -> pd.DataFrame: - """Add a column of OWID population to the countries in the data, including population of historical regions. - - This function has been adapted from datautils.geo, because population currently does not include historic regions. - We include them in this function. - - Parameters - ---------- - df : pd.DataFrame - Data without a column for population (after harmonizing elements, items and country names). - country_col : str - Name of country column in data. - year_col : str - Name of year column in data. - population_col : str - Name for new population column in data. 
- warn_on_missing_countries : bool - True to warn if population is not found for any of the countries in the data. - show_full_warning : bool - True to show affected countries if the previous warning is raised. - - Returns - ------- - df_with_population : pd.DataFrame - Data after adding a column for population for all countries in the data. - - """ - - # Load population dataset. - population = load_population().rename( - columns={ - "country": country_col, - "year": year_col, - "population": population_col, - } - )[[country_col, year_col, population_col]] - - # Check if there is any missing country. - missing_countries = set(df[country_col]) - set(population[country_col]) - if len(missing_countries) > 0: - if warn_on_missing_countries: - geo.warn_on_list_of_entities( - list_of_entities=missing_countries, - warning_message=( - f"{len(missing_countries)} countries not found in population" - " dataset. They will remain in the dataset, but have nan" - " population." - ), - show_list=show_full_warning, - ) - - # Add population to original dataframe. - df_with_population = pd.merge(df, population, on=[country_col, year_col], how="left") - - return df_with_population diff --git a/etl/steps/archive/garden/energy/2022-09-09/global_primary_energy.meta.yml b/etl/steps/archive/garden/energy/2022-09-09/global_primary_energy.meta.yml deleted file mode 100644 index d7df099c963..00000000000 --- a/etl/steps/archive/garden/energy/2022-09-09/global_primary_energy.meta.yml +++ /dev/null @@ -1,272 +0,0 @@ -dataset: - namespace: energy - version: 2022-09-09 - title: Global Primary Energy (Smil & BP, 2022) - short_name: global_primary_energy - description: | - This dataset comprises a combination of data from Appendix A of Vaclav Smil's Updated and Revised Edition of his book, 'Energy Transitions: Global and National Perspectives' (2017) and BP's Statistical Review of World Energy (2022). - - All data prior to the year 1965 is sourced from Smil (2017). All data from 1965 onwards, with the exception of traditional biomass, is sourced from BP Statistical Review. Smil's estimates of traditional biomass are only available until 2015. For the years 2016 onwards, we have assumed a similar level of traditional biomass consumption. This is approximately in line with recent trends in traditional biomass from Smil's data. - - Our World in Data has normalized all BP fossil fuels data to terawatt-hours (TWh) using a conversion factor of 1,000,000 / 3,600 (~277.778) to convert from exajoules (EJ) to TWh. - - This dataset includes primary energy data using two methodologies: - (1) 'direct' primary energy, which does not take account of the inefficiencies in fossil fuel production. Fossil fuel data is compared to electricity generation (not in input equivalents) of nuclear and renewables. - (2) 'substitution' primary energy, which does take account of inefficiencies in fossil fuel production. This converts non-fossil energy sources to their 'input equivalents': the amount of primary energy that would be needed if they had the same inefficiencies as fossil fuels. This is the methodology adopted by BP when all data is compared in exajoules.
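To make the two methodologies concrete, here is a small worked example (a sketch with illustrative numbers only; the two constants mirror EJ_TO_TWH and EFFICIENCY_FACTOR as defined in the garden step further below):

# Conversion factor quoted in the description above: exajoules to terawatt-hours.
EJ_TO_TWH = 1e6 / 3600  # ~277.778 TWh per EJ

# Assumed efficiency of a typical thermal power plant (the factor used for Smil's data).
EFFICIENCY_FACTOR = 0.36

# Direct primary energy: non-fossil electricity counted at its generated amount.
hydro_direct_twh = 10.0  # hypothetical hydropower generation

# Substitution method: scale up by the thermal-plant inefficiency, i.e. the primary
# energy a fossil plant would have to burn to generate the same electricity.
hydro_substituted_twh = hydro_direct_twh / EFFICIENCY_FACTOR  # ~27.8 TWh

# Unit conversion for BP data reported in exajoules.
coal_ej = 2.0  # hypothetical
coal_twh = coal_ej * EJ_TO_TWH  # ~555.6 TWh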
- -tables: - global_primary_energy: - variables: - biofuels__twh_direct_energy: - title: Biofuels (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Modern biofuels - biofuels__twh_substituted_energy: - title: Biofuels (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Modern biofuels - coal__twh_direct_energy: - title: Coal (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - coal__twh_substituted_energy: - title: Coal (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - # data_source - gas__twh_direct_energy: - title: Gas (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Natural gas - gas__twh_substituted_energy: - title: Gas (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Natural gas - hydropower__twh_direct_energy: - title: Hydropower (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydropower - hydropower__twh_substituted_energy: - title: Hydropower (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydropower - nuclear__twh_direct_energy: - title: Nuclear (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - nuclear__twh_substituted_energy: - title: Nuclear (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - oil__twh_direct_energy: - title: Oil (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Oil - oil__twh_substituted_energy: - title: Oil (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Oil - other_renewables__twh_direct_energy: - title: Other renewables (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables - other_renewables__twh_substituted_energy: - title: Other renewables (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables - solar__twh_direct_energy: - title: Solar (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - solar__twh_substituted_energy: - title: Solar (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - traditional_biomass__twh_direct_energy: - title: Traditional biomass (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Traditional biomass - traditional_biomass__twh_substituted_energy: - title: Traditional biomass (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Traditional biomass - wind__twh_direct_energy: - title: Wind (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - wind__twh_substituted_energy: - title: Wind (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - total_consumption__twh_direct_energy: - title: Total consumption (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Total consumption - total_consumption__twh_substituted_energy: - title: Total consumption (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Total consumption - biofuels__pct_of_direct_energy: - title: Biofuels (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Modern biofuels - biofuels__pct_of_substituted_energy: - title: Biofuels (%, substituted energy) - short_unit: "%" - unit: 
"%" - display: - name: Modern biofuels - coal__pct_of_direct_energy: - title: Coal (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Coal - coal__pct_of_substituted_energy: - title: Coal (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Coal - gas__pct_of_direct_energy: - title: Gas (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Natural gas - gas__pct_of_substituted_energy: - title: Gas (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Natural gas - hydropower__pct_of_direct_energy: - title: Hydropower (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Hydropower - hydropower__pct_of_substituted_energy: - title: Hydropower (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Hydropower - nuclear__pct_of_direct_energy: - title: Nuclear (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Nuclear - nuclear__pct_of_substituted_energy: - title: Nuclear (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Nuclear - oil__pct_of_direct_energy: - title: Oil (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Oil - oil__pct_of_substituted_energy: - title: Oil (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Oil - other_renewables__pct_of_direct_energy: - title: Other renewables (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Other renewables - other_renewables__pct_of_substituted_energy: - title: Other renewables (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Other renewables - solar__pct_of_direct_energy: - title: Solar (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Solar - solar__pct_of_substituted_energy: - title: Solar (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Solar - traditional_biomass__pct_of_direct_energy: - title: Traditional biomass (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Traditional biomass - traditional_biomass__pct_of_substituted_energy: - title: Traditional biomass (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Traditional biomass - wind__pct_of_direct_energy: - title: Wind (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Wind - wind__pct_of_substituted_energy: - title: Wind (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Wind diff --git a/etl/steps/archive/garden/energy/2022-09-09/global_primary_energy.py b/etl/steps/archive/garden/energy/2022-09-09/global_primary_energy.py deleted file mode 100644 index 3547c14c1be..00000000000 --- a/etl/steps/archive/garden/energy/2022-09-09/global_primary_energy.py +++ /dev/null @@ -1,222 +0,0 @@ -"""Garden step that combines Vaclav Smil's Global Primary Energy with BP's Statistical Review of World Energy. - -""" - -import pandas as pd -from owid import catalog -from shared import ( - CURRENT_DIR, - combine_two_overlapping_dataframes, - gather_sources_from_tables, -) - -from etl.paths import DATA_DIR - -# Details for dataset to export. -DATASET_SHORT_NAME = "global_primary_energy" -DATASET_TITLE = "Global Primary Energy (Smil & BP, 2022)" -METADATA_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.meta.yml" -# Details for datasets to import. -BP_DATASET_PATH = DATA_DIR / "garden/bp/2022-07-14/statistical_review" -SMIL_DATASET_PATH = DATA_DIR / "garden/smil/2017-01-01/global_primary_energy" - -# Exajoules to terawatt-hours. 
-EJ_TO_TWH = 1e6 / 3600 - -# Average efficiency factor assumed to convert direct energy to input-equivalent energy in Smil's data. -# This factor will be used for hydropower, nuclear, other renewables, solar and wind -# (for which there is data until 1960). -# In practice, it only affects hydropower, since all other non-fossil sources are zero prior to 1960. -# All other energy sources in Smil's data will not be affected by this factor. -EFFICIENCY_FACTOR = 0.36 - - -def prepare_bp_data(tb_bp: catalog.Table) -> pd.DataFrame: - df_bp = pd.DataFrame(tb_bp).reset_index() - - # BP gives generation of direct energy in TWh, and, for non-fossil sources of electricity, - # consumption of input-equivalent energy in EJ. - # The input-equivalent energy is the amount of energy that would be required to generate a given amount of (direct) - # electricity if non-fossil sources were as inefficient as a standard thermal power plant. - # Therefore, direct and substituted energies for Biofuels, Coal, Gas and Oil are identical. - # On the other hand, direct and substituted energy are different for non-fossil electricity sources, namely - # Hydropower, Nuclear, Solar, Other renewables, and Wind. - # The difference is a factor of ~38%, which is roughly the efficiency of a standard power plant. - # More specifically, BP assumes (for Biofuels, Coal, Gas and Oil) an efficiency factor that grows from 36% - # (until year 2000) to 40.6% (in 2021), to better reflect changes in efficiency over time. - # In the case of biomass used in electricity (included in 'Other renewables'), - # BP assumes a constant factor of 32% for all years. - # For more details: - # https://www.bp.com/content/dam/bp/business-sites/en/global/corporate/pdfs/energy-economics/statistical-review/bp-stats-review-2022-methodology.pdf - bp_columns = { - "country": "country", - "year": "year", - # Fossil sources (direct energy). - "biofuels_consumption__twh__total": "biofuels__twh_direct_energy", - "coal_consumption__twh": "coal__twh_direct_energy", - "gas_consumption__twh": "gas__twh_direct_energy", - "oil_consumption__twh": "oil__twh_direct_energy", - # Non-fossil electricity sources (direct energy). - "geo_biomass_other__twh": "other_renewables__twh_direct_energy", - "hydro_generation__twh": "hydropower__twh_direct_energy", - "nuclear_generation__twh": "nuclear__twh_direct_energy", - "solar_generation__twh": "solar__twh_direct_energy", - "wind_generation__twh": "wind__twh_direct_energy", - # Non-fossil electricity sources (substituted energy). - "geo_biomass_other__ej": "other_renewables__ej_substituted_energy", - "hydro_consumption__ej": "hydropower__ej_substituted_energy", - "nuclear_consumption__ej": "nuclear__ej_substituted_energy", - "solar_consumption__ej": "solar__ej_substituted_energy", - "wind_consumption__ej": "wind__ej_substituted_energy", - } - df_bp = df_bp[list(bp_columns)].rename(columns=bp_columns) - # Convert all units to TWh. - for column in df_bp.columns: - if "_ej_" in column: - # Create a new column in TWh instead of EJ. - df_bp[column.replace("_ej_", "_twh_")] = df_bp[column] * EJ_TO_TWH - # Remove the column in EJ. - df_bp = df_bp.drop(columns=column) - # For completeness, create columns of substituted energy for fossil sources (even if they would coincide with - # direct energy). - for fossil_source in ["biofuels", "coal", "gas", "oil"]: - df_bp[f"{fossil_source}__twh_substituted_energy"] = df_bp[f"{fossil_source}__twh_direct_energy"] - - # Select only data for the World (which is the only region reported in Smil's data).
- df_bp = df_bp[df_bp["country"] == "World"].reset_index(drop=True) - - return df_bp - - -def prepare_smil_data(tb_smil: catalog.Table) -> pd.DataFrame: - df_smil = pd.DataFrame(tb_smil).reset_index() - - # Create columns for input-equivalent energy. - # To do this, we follow a similar approach to BP: - # We create input-equivalent energy by dividing direct energy consumption of non-fossil electricity sources - # (hydropower, nuclear, other renewables, solar and wind) by a factor of 36% - # (called EFFICIENCY_FACTOR, defined above). - # This is the efficiency factor of a typical thermal plant assumed by BP between 1965 and 2000, and we assume this - # factor also applies for the period 1800 to 1965. - # For biomass power (included in other renewables), BP assumed a constant factor of 32%. - # However, since we cannot separate biomass from the rest of sources in 'other renewables', - # we use the same 36% factor as all other non-fossil sources. - for source in ["hydropower", "nuclear", "other_renewables", "solar", "wind"]: - df_smil[f"{source}__twh_substituted_energy"] = df_smil[f"{source}__twh_direct_energy"] / EFFICIENCY_FACTOR - # For fossil sources (including biofuels and traditional biomass), direct and substituted energy are the same. - for source in ["biofuels", "coal", "gas", "oil", "traditional_biomass"]: - df_smil[f"{source}__twh_substituted_energy"] = df_smil[f"{source}__twh_direct_energy"] - - return df_smil - - -def combine_bp_and_smil_data(df_bp: pd.DataFrame, df_smil: pd.DataFrame) -> pd.DataFrame: - df_bp = df_bp.copy() - df_smil = df_smil.copy() - - # Add a new column that informs of the source of the data. - df_bp["data_source"] = "BP" - df_smil["data_source"] = "Smil" - # Combine both dataframes, prioritizing BP's data on overlapping rows. - combined = combine_two_overlapping_dataframes( - df1=df_bp, df2=df_smil, index_columns=["country", "year"] - ).sort_values(["year"]) - # We do not have data for traditional biomass after 2015 (BP does not provide it). - # So, to be able to visualize the complete mix of global energy consumption, - # we extrapolate Smil's data for traditional biomass from 2015 onwards, by repeating its last value. - missing_years_mask = combined["year"] >= df_smil["year"].max() - combined.loc[missing_years_mask, "traditional_biomass__twh_direct_energy"] = combined[missing_years_mask][ - "traditional_biomass__twh_direct_energy" - ].ffill() - combined.loc[missing_years_mask, "traditional_biomass__twh_substituted_energy"] = combined[missing_years_mask][ - "traditional_biomass__twh_substituted_energy" - ].ffill() - - # Create an index and sort conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - return combined - - -def add_total_consumption_and_percentages(combined: pd.DataFrame) -> pd.DataFrame: - # Create a column with the total direct energy (ensuring there is at least one non-nan value). - combined["total_consumption__twh_direct_energy"] = combined[ - [column for column in combined.columns if "direct_energy" in column] - ].sum(axis=1, min_count=1) - # Create a column with the total substituted energy (ensuring there is at least one non-nan value). - combined["total_consumption__twh_substituted_energy"] = combined[ - [column for column in combined.columns if "substituted_energy" in column] - ].sum(axis=1, min_count=1) - # Add share variables. 
- sources = [ - "biofuels", - "coal", - "gas", - "hydropower", - "nuclear", - "oil", - "other_renewables", - "solar", - "traditional_biomass", - "wind", - ] - for source in sources: - # Add percentage of each source with respect to the total direct energy. - combined[f"{source}__pct_of_direct_energy"] = ( - 100 * combined[f"{source}__twh_direct_energy"] / combined["total_consumption__twh_direct_energy"] - ) - # Add percentage of each source with respect to the total substituted energy. - combined[f"{source}__pct_of_substituted_energy"] = ( - 100 * combined[f"{source}__twh_substituted_energy"] / combined["total_consumption__twh_substituted_energy"] - ) - - return combined - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read all required datasets. - ds_bp = catalog.Dataset(BP_DATASET_PATH) - ds_smil = catalog.Dataset(SMIL_DATASET_PATH) - - # Gather all required tables from all datasets. - tb_bp = ds_bp[ds_bp.table_names[0]] - tb_smil = ds_smil[ds_smil.table_names[0]] - - # - # Process data. - # - # Prepare BP data. - df_bp = prepare_bp_data(tb_bp=tb_bp) - # Prepare Smil data. - df_smil = prepare_smil_data(tb_smil=tb_smil) - - # Combine BP and Smil data. - combined = combine_bp_and_smil_data(df_bp=df_bp, df_smil=df_smil) - - # Add variables for total consumption and variables of % share of each source. - combined = add_total_consumption_and_percentages(combined=combined) - - # Create a new table with combined data (and no metadata). - tb_combined = catalog.Table(combined) - - # - # Save outputs. - # - ds_garden = catalog.Dataset.create_empty(dest_dir) - # Gather metadata sources from all tables' original dataset sources. - ds_garden.metadata.sources = gather_sources_from_tables(tables=[tb_bp, tb_smil]) - # Get the rest of the metadata from the yaml file. - ds_garden.metadata.update_from_yaml(METADATA_PATH) - # Create dataset. - ds_garden.save() - - # Add other metadata fields to table. - tb_combined.metadata.short_name = DATASET_SHORT_NAME - tb_combined.metadata.title = DATASET_TITLE - tb_combined.update_metadata_from_yaml(METADATA_PATH, "global_primary_energy") - - # Add combined tables to the new dataset. - ds_garden.add(tb_combined) diff --git a/etl/steps/archive/garden/energy/2022-09-09/shared.py b/etl/steps/archive/garden/energy/2022-09-09/shared.py deleted file mode 100644 index 6beed24b51e..00000000000 --- a/etl/steps/archive/garden/energy/2022-09-09/shared.py +++ /dev/null @@ -1,98 +0,0 @@ -from pathlib import Path -from typing import List - -import pandas as pd -from owid import catalog - -CURRENT_DIR = Path(__file__).parent -VERSION = CURRENT_DIR.name - - -def gather_sources_from_tables( - tables: List[catalog.Table], -) -> List[catalog.meta.Source]: - """Gather unique sources from the metadata.dataset of each table in a list of tables. - - Note: To check if a source is already listed, only the name of the source is considered (not the description or any - other field in the source). - - Parameters - ---------- - tables : list - List of tables with metadata. - - Returns - ------- - known_sources : list - List of unique sources from all tables. - - """ - # Initialise list that will gather all unique metadata sources from the tables. - known_sources: List[catalog.meta.Source] = [] - for table in tables: - # Get list of sources of the dataset of current table. - table_sources = table.metadata.dataset.sources - # Go source by source of current table, and check if its name is not already in the list of known_sources. 
- for source in table_sources: - # Check if this source's name is different to all known_sources. - if all([source.name != known_source.name for known_source in known_sources]): - # Add the new source to the list. - known_sources.append(source) - - return known_sources - - -def combine_two_overlapping_dataframes(df1: pd.DataFrame, df2: pd.DataFrame, index_columns: List[str]) -> pd.DataFrame: - """Combine two dataframes that may have identical columns, prioritizing the first one. - - Both dataframes must have a dummy index (if not, use reset_index() on both of them). - The columns to be considered as index should be declared in index_columns. - - Suppose we have two dataframes, df1 and df2, both having columns "col_a" and "col_b", and we want to create a - combined dataframe with the union of rows and columns, and, on the overlapping elements, prioritize df1 values. - To do this, you could: - * Merge the dataframes. But then the result would have columns "col_a_x", "col_a_y", "col_b_x", and "col_b_y". - * Concatenate them and then drop duplicates (keeping the first occurrence, so that df1 is prioritized). This works, - but if df1 has nans then we would keep those nans instead of using the values available in df2. - To solve these problems, this function will not create new columns, and will prioritize df1 **only if it has data**, - and otherwise use values from df2. - - Parameters - ---------- - df1 : pd.DataFrame - First dataframe (the one that has priority). - df2 : pd.DataFrame - Second dataframe. - index_columns : list - Columns (that must be present in both dataframes) that should be treated as index (e.g. ["country", "year"]). - - Returns - ------- - combined : pd.DataFrame - Combination of the two dataframes. - - """ - # Find columns of data (those that are not index columns). - df1_columns = df1.columns.tolist() - df2_columns = df2.columns.tolist() - common_columns = [column for column in df1_columns if column not in index_columns] + [ - column for column in df2_columns if column not in df1_columns - ] - - # Go column by column, concatenate, remove nans, and then keep df1 version on duplicated rows. - # Note: There may be a faster, simpler way to achieve this. - combined = pd.DataFrame({column: [] for column in index_columns}) - for variable in common_columns: - _df1 = pd.DataFrame() - _df2 = pd.DataFrame() - if variable in df1.columns: - _df1 = df1[index_columns + [variable]].dropna(subset=variable) - if variable in df2.columns: - _df2 = df2[index_columns + [variable]].dropna(subset=variable) - _combined = pd.concat([_df1, _df2], ignore_index=True) - # On rows where both datasets overlap, give priority to df1. - _combined = _combined.drop_duplicates(subset=index_columns, keep="first") - # Add the current variable to the combined dataframe. - combined = pd.merge(combined, _combined, on=index_columns, how="outer") - - return combined diff --git a/etl/steps/archive/garden/energy/2022-12-13/electricity_mix.meta.yml b/etl/steps/archive/garden/energy/2022-12-13/electricity_mix.meta.yml deleted file mode 100644 index 1d5e478c1b7..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-13/electricity_mix.meta.yml +++ /dev/null @@ -1,390 +0,0 @@ -dataset: - namespace: energy - version: 2022-12-13 - title: Electricity mix (BP & Ember, 2022c) - short_name: electricity_mix - description: | - Data is compiled by Our World in Data based on three main sources: - - [BP Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html).
- - [Ember Yearly Electricity Data (2022)](https://ember-climate.org/data-catalogue/yearly-electricity-data/). - - [Ember European Electricity Review (2022)](https://ember-climate.org/insights/research/european-electricity-review-2022/). - - Ember compile their global dataset from various sources including: - - Eurostat: Annual European generation and import data, and monthly data in some cases where better sources are not available. - - ENTSO-E: Monthly European generation and import data. - - EIA: Annual global generation and import data. - - UN: Monthly global generation data in some cases. - - GEM: Annual global coal and gas capacity data. - - IRENA: Annual global capacity data for all non-fossil fuel types, and for Other Fossil where available. - - WRI: Annual global capacity data for Other Fossil where other sources are not available. - - European carbon intensities rely on data from the European Environment Agency (EEA). - - A complete list of data sources for each individual country in Ember's Yearly Electricity Data can be found [here](https://ember-climate.org/app/uploads/2022/07/Ember-Electricity-Data-Methodology.pdf). - - A complete list of data sources for each individual country in Ember's European Electricity Review can be found [here](https://ember-climate.org/app/uploads/2022/02/EER-Methodology.pdf). - - We rely on Ember as the primary source of electricity consumption data. While BP provides primary energy (not just electricity) consumption data and it provides a longer time-series (dating back to 1965) than Ember (which only dates back to 1990), BP does not provide data for all countries or for all sources of electricity (for example, only Ember provides data on electricity from bioenergy). So, where data from Ember is available for a given country and year, we rely on it as the primary source. We then supplement this with data from BP where data from Ember is not available. - - Our World in Data has converted absolute electricity production by source to the share in the mix by dividing each by total electricity production. - - BP's region definitions sometimes differ from Our World in Data's definitions. For example, BP's North America includes only Canada, Mexico and United States, whereas Our World in Data's North America includes countries in Central America (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like "North America (BP)" to refer to BP's original data using their definition of the region, as well as "North America", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the contributions from the countries in the region. - - [BP's region definitions](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy/using-the-review/definitions-and-explanatory-notes.html#accordion_Regional%20definitions), denoted with "(BP)", are: - * "Asia Pacific (BP)": Brunei, Cambodia, China (Mainland), China Hong Kong SAR (Special Administrative Region), China Macau SAR (Special Administrative Region), Indonesia, Japan, Laos, Malaysia, Mongolia, North Korea, Philippines, Singapore, South Asia (Afghanistan, Bangladesh, India, Myanmar, Nepal, Pakistan and Sri Lanka), South Korea, Taiwan, Thailand, Vietnam, Australia, New Zealand, Papua New Guinea and Oceania. - * "Australasia (BP)": Australia, New Zealand. 
- * "CIS (BP)" - Commonwealth of Independent States: Armenia, Azerbaijan, Belarus, Kazakhstan, Kyrgyzstan, Moldova, Russian Federation, Tajikistan, Turkmenistan, Uzbekistan. - * "Caribbean (BP)": Atlantic islands between the US Gulf Coast and South America, including Puerto Rico, US Virgin Islands and Bermuda. - * "Central America (BP)": Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama - * "Eastern Africa (BP)": Territories on the east coast of Africa from Sudan to Mozambique. Also Madagascar, Malawi, Uganda, Zambia, Zimbabwe. - * "Europe (BP)": European members of the OECD plus Albania, Bosnia-Herzegovina, Bulgaria, Croatia, Cyprus, Georgia, Gibraltar, Latvia, Lithuania, Malta, Montenegro, North Macedonia, Romania, Serbia and Ukraine. - * "Middle Africa (BP)": Angola, Cameroon, Central African Republic, Chad, Democratic Republic of Congo, Republic of Congo, Equatorial Guinea, Gabon, Sao Tome & Principe. - * "Middle East (BP)": Arabian Peninsula, Iran, Iraq, Israel, Jordan, Lebanon, Syria. - * "Non-OECD (BP)" - Organization for Economic Co-operation and Development: All countries that are not members of the OECD. - * "North America (BP)": US (excluding US territories), Canada, Mexico - * "Northern Africa (BP)": Territories on the north coast of Africa from Egypt to Western Sahara. - * "OECD (BP)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, UK, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, US. - * "OPEC (BP)" - Organization of the Petroleum Exporting Countries: Iran, Iraq, Kuwait, Saudi Arabia, United Arab Emirates, Algeria, Libya, Angola, Equatorial Guinea, Gabon, Nigeria, Republic of Congo, Venezuela. - * "South and Central America (BP)": Caribbean (including Puerto Rico and US Virgin Islands), Bermuda, Central and South America. - * "Southern Africa (BP)": Botswana, Lesotho, Namibia, South Africa, Swaziland. - * "Western Africa (BP)": Territories on the west coast of Africa from Mauritania to Nigeria, including Burkina Faso, Cape Verde, Mali and Niger. - - Additionally, BP includes some regions that are not explicitly defined (e.g. "Other Europe (BP)", or "Other CIS (BP)"). We define our regions in the following way: - * "Africa" - All African countries + "Other Africa (BP)". - * "Asia" - All Asian countries + "Other Middle East (BP)" + "Other CIS (BP)" + "Other Asia Pacific (BP)". - * "Europe" - All European countries + "Other Europe (BP)". - * "North America" - All North American countries + "Other Caribbean (BP)" + "Other North America (BP)". - * "Oceania" - All Oceanian countries. - * "South America" - All South American countries + "Other South America (BP)". - Where the individual countries in each region are defined [in this map](https://ourworldindata.org/world-region-map-definitions). Additional BP regions are ignored, since they belong to other regions already included (e.g. the data for "Other Western Africa (BP)" is included in "Other Africa (BP)"). Finally, income groups are constructed following the definitions [in this map](https://ourworldindata.org/grapher/world-banks-income-groups). 
- - [Ember's region definitions](https://ember-climate.org/countries-and-regions/), denoted with "(Ember)", are: - * "G20 (Ember)" - Group of Twenty: Argentina, Australia, Brazil, Canada, China, France, Germany, India, Indonesia, Italy, Japan, Mexico, Russia, Saudi Arabia, South Africa, South Korea, Turkey, United Kingdom, United States and the 27 members of the European Union. - * "G7 (Ember)" - Group of Seven: Canada, France, Germany, Italy, Japan, United Kingdom and United States. - * "Latin America and Caribbean (Ember)": Antigua and Barbuda, Argentina, Bahamas, Barbados, Belize, Bolivia, Brazil, Chile, Colombia, Costa Rica, Cuba, Dominica, Dominican Republic, Ecuador, El Salvador, Grenada, Guatemala, Guyana, Haiti, Honduras, Jamaica, Mexico, Nicaragua, Panama, Paraguay, Peru, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Suriname, Trinidad and Tobago, Uruguay, Venezuela, Aruba, British Virgin Islands, Cayman Islands, Falkland Islands, French Guiana, Guadeloupe, Martinique, Montserrat, Puerto Rico, Turks and Caicos Islands and United States Virgin Islands. - * "Middle East (Ember)": Bahrain, Iran, Iraq, Israel, Jordan, Kuwait, Lebanon, Oman, Palestine, Qatar, Saudi Arabia, Syria, United Arab Emirates and Yemen. - * "OECD (Ember)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, United Kingdom, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, and United States. - sources: - - name: Our World in Data based on BP Statistical Review of World Energy (2022) - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - - name: Our World in Data based on Ember's Yearly Electricity Data (2022) - published_by: Ember - publication_year: 2022 - date_accessed: 2022-12-13 - url: https://ember-climate.org/data-catalogue/yearly-electricity-data/ - - name: Our World in Data based on Ember's European Electricity Review (2022) - published_by: Ember - publication_year: 2022 - date_accessed: 2022-08-01 - url: https://ember-climate.org/insights/research/european-electricity-review-2022/ -tables: - electricity_mix: - variables: - bioenergy_generation__twh: - title: Electricity from bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Bioenergy - bioenergy_share_of_electricity__pct: - title: Bioenergy (% electricity) - short_unit: '%' - unit: '%' - display: - name: Bioenergy - co2_intensity__gco2_kwh: - title: Carbon intensity of electricity (gCO2/kWh) - short_unit: gCO₂ - unit: grams of CO₂ equivalent per kilowatt-hour - display: - name: Carbon intensity of electricity per kilowatt-hour - coal_generation__twh: - title: Electricity from coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - coal_share_of_electricity__pct: - title: Coal (% electricity) - short_unit: '%' - unit: '%' - display: - name: Coal - fossil_generation__twh: - title: Electricity from fossil fuels (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Fossil fuels - fossil_share_of_electricity__pct: - title: Fossil fuels (% electricity) - short_unit: '%' - unit: '%' - display: - name: Fossil fuels - gas_generation__twh: - title: Electricity from gas 
(TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas - gas_share_of_electricity__pct: - title: Gas (% electricity) - short_unit: '%' - unit: '%' - display: - name: Gas - hydro_generation__twh: - title: Electricity from hydro (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydropower - hydro_share_of_electricity__pct: - title: Hydro (% electricity) - short_unit: '%' - unit: '%' - display: - name: Hydropower - low_carbon_generation__twh: - title: Low-carbon electricity (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Low-carbon electricity - low_carbon_share_of_electricity__pct: - title: Low-carbon electricity (% electricity) - short_unit: '%' - unit: '%' - display: - name: Share of electricity from low-carbon sources - net_imports_share_of_demand__pct: - title: Net electricity imports as a share of demand (%) - short_unit: '%' - unit: '%' - display: - name: Net electricity imports as a share of demand - nuclear_generation__twh: - title: Electricity from nuclear (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - nuclear_share_of_electricity__pct: - title: Nuclear (% electricity) - short_unit: '%' - unit: '%' - display: - name: Nuclear - oil_generation__twh: - title: Electricity from oil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Oil - oil_share_of_electricity__pct: - title: Oil (% electricity) - short_unit: '%' - unit: '%' - display: - name: Oil - other_renewables_excluding_bioenergy_generation__twh: - title: Other renewables excluding bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables, excluding bioenergy - other_renewables_excluding_bioenergy_share_of_electricity__pct: - title: Other renewables excluding bioenergy (% electricity) - short_unit: '%' - unit: '%' - display: - name: Other renewables, excluding bioenergy - other_renewables_including_bioenergy_generation__twh: - title: Other renewables including bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables, including bioenergy - other_renewables_including_bioenergy_share_of_electricity__pct: - title: Other renewables including bioenergy (% electricity) - short_unit: '%' - unit: '%' - display: - name: Other renewables, including bioenergy - per_capita_bioenergy_generation__kwh: - title: Bioenergy electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Bioenergy electricity per capita - numDecimalPlaces: 0 - per_capita_coal_generation__kwh: - title: Coal electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Coal electricity per capita - numDecimalPlaces: 0 - per_capita_fossil_generation__kwh: - title: Fossil fuel electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Fossil fuel electricity per capita - numDecimalPlaces: 0 - per_capita_gas_generation__kwh: - title: Gas electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Gas electricity per capita - numDecimalPlaces: 0 - per_capita_hydro_generation__kwh: - title: Hydro electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Hydro electricity per capita - numDecimalPlaces: 0 - per_capita_low_carbon_generation__kwh: - title: Low-carbon electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Low-carbon electricity per capita - numDecimalPlaces: 0 - per_capita_nuclear_generation__kwh: - title: Nuclear electricity 
per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Nuclear electricity per capita - numDecimalPlaces: 0 - per_capita_oil_generation__kwh: - title: Oil electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Oil electricity per capita - numDecimalPlaces: 0 - per_capita_other_renewables_excluding_bioenergy_generation__kwh: - title: Other renewable electricity excluding bioenergy per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Other renewable electricity excluding bioenergy per capita - numDecimalPlaces: 0 - per_capita_other_renewables_including_bioenergy_generation__kwh: - title: Other renewable electricity including bioenergy per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Other renewable electricity including bioenergy per capita - numDecimalPlaces: 0 - per_capita_renewable_generation__kwh: - title: Renewable electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Renewable electricity per capita - numDecimalPlaces: 0 - per_capita_solar_generation__kwh: - title: Solar electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Solar electricity per capita - numDecimalPlaces: 0 - per_capita_total_generation__kwh: - title: Per capita electricity (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Per capita electricity use - numDecimalPlaces: 0 - per_capita_wind_generation__kwh: - title: Wind electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Wind electricity per capita - numDecimalPlaces: 0 - population: - title: Population - short_unit: people - unit: people - display: - name: Population - primary_energy_consumption__twh: - title: Electricity from primary energy consumption (twh) (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Primary energy consumption - renewable_generation__twh: - title: Electricity from renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Renewables - renewable_share_of_electricity__pct: - title: Renewables (% electricity) - short_unit: '%' - unit: '%' - display: - name: Renewables - numDecimalPlaces: 2 - solar_generation__twh: - title: Electricity from solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - solar_share_of_electricity__pct: - title: Solar (% electricity) - short_unit: '%' - unit: '%' - display: - name: Solar - total_demand__twh: - title: Electricity demand (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Electricity demand - total_electricity_share_of_primary_energy__pct: - title: Electricity as share of primary energy (%) - short_unit: '%' - unit: '%' - display: - name: Electricity as share of primary energy - total_emissions__mtco2: - title: Emissions (MtCO2) - short_unit: million t - unit: million tonnes CO2 equivalent - display: - name: Emissions - total_generation__twh: - title: Electricity generation (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Electricity generation - total_net_imports__twh: - title: Net imports (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Net imports - wind_generation__twh: - title: Electricity from wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - wind_share_of_electricity__pct: - title: Wind (% electricity) - short_unit: '%' - unit: '%' - display: - name: Wind diff --git a/etl/steps/archive/garden/energy/2022-12-13/electricity_mix.py 
b/etl/steps/archive/garden/energy/2022-12-13/electricity_mix.py deleted file mode 100644 index bf0c1a80fb9..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-13/electricity_mix.py +++ /dev/null @@ -1,324 +0,0 @@ -"""Garden step that combines BP's statistical review with Ember's combined electricity data (combination of the European -Electricity Review and the Yearly Electricity Data) to create the Electricity Mix (BP & Ember) dataset. - -""" - -from typing import Dict, List - -import pandas as pd -from owid import catalog -from owid.datautils.dataframes import combine_two_overlapping_dataframes -from shared import CURRENT_DIR, add_population - -from etl.helpers import PathFinder - -# Get relevant paths for current file. -paths = PathFinder(__file__) - -# Details for dataset to export. -DATASET_SHORT_NAME = "electricity_mix" -METADATA_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.meta.yml" - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 -# Megatonnes to grams. -MT_TO_G = 1e12 - - -def process_bp_data(table_bp: catalog.Table) -> pd.DataFrame: - """Load necessary columns from BP's Statistical Review dataset, and create some new variables (e.g. electricity - generation from fossil fuels). - - Parameters - ---------- - table_bp : catalog.Table - BP's Statistical Review (already processed, with harmonized countries and region aggregates). - - Returns - ------- - df_bp : pd.DataFrame - Processed BP data. - - """ - # Columns to load from BP dataset. - columns = { - "electricity_generation": "total_generation__twh", - "primary_energy_consumption__twh": "primary_energy_consumption__twh", - "hydro_generation__twh": "hydro_generation__twh", - "nuclear_generation__twh": "nuclear_generation__twh", - "solar_generation__twh": "solar_generation__twh", - "wind_generation__twh": "wind_generation__twh", - "geo_biomass_other__twh": "other_renewables_including_bioenergy_generation__twh", - "elec_gen_from_oil": "oil_generation__twh", - "elec_gen_from_coal": "coal_generation__twh", - "elec_gen_from_gas": "gas_generation__twh", - } - table_bp = table_bp[list(columns)].rename(columns=columns, errors="raise") - # New columns to be created by summing other columns. - aggregates: Dict[str, List[str]] = { - "fossil_generation__twh": [ - "oil_generation__twh", - "coal_generation__twh", - "gas_generation__twh", - ], - "renewable_generation__twh": [ - "hydro_generation__twh", - "solar_generation__twh", - "wind_generation__twh", - "other_renewables_including_bioenergy_generation__twh", - ], - "low_carbon_generation__twh": [ - "renewable_generation__twh", - "nuclear_generation__twh", - ], - } - - # Create a dataframe with a dummy index. - df_bp = pd.DataFrame(table_bp).reset_index() - - # Create new columns, by adding up other columns (and allowing for only one nan in each sum). - for new_column in aggregates: - df_bp[new_column] = df_bp[aggregates[new_column]].sum(axis=1, min_count=len(aggregates[new_column]) - 1) - - return df_bp - - -def process_ember_data(table_ember: catalog.Table) -> pd.DataFrame: - """Load necessary columns from the Combined Electricity dataset and prepare a dataframe with the required variables. - - Parameters - ---------- - table_ember : catalog.Table - Combined Electricity (combination of Ember's Yearly Electricity Data and European Electricity Review). - - Returns - ------- - df_ember : pd.DataFrame - Processed Combined Electricity data. - - """ - # Columns to load from Ember dataset. 
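A quick illustration of the `min_count=len(...) - 1` pattern used in `process_bp_data` above, which lets an aggregate tolerate at most one missing component (toy data, invented values):

```python
import pandas as pd

df = pd.DataFrame({"oil": [1.0, 2.0, None], "coal": [3.0, None, None], "gas": [5.0, 6.0, 7.0]})
components = ["oil", "coal", "gas"]

# The sum is NaN only if more than one component is missing in a row.
df["fossil"] = df[components].sum(axis=1, min_count=len(components) - 1)
print(df)  # rows 0 and 1 get a total (9.0 and 8.0); row 2, with two NaNs, stays NaN
```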
- columns = { - "generation__bioenergy__twh": "bioenergy_generation__twh", - "generation__gas__twh": "gas_generation__twh", - "generation__coal__twh": "coal_generation__twh", - "generation__other_fossil__twh": "oil_generation__twh", - "generation__renewables__twh": "renewable_generation__twh", - "generation__other_renewables__twh": "other_renewables_excluding_bioenergy_generation__twh", - "generation__clean__twh": "low_carbon_generation__twh", - "generation__hydro__twh": "hydro_generation__twh", - "generation__nuclear__twh": "nuclear_generation__twh", - "generation__solar__twh": "solar_generation__twh", - "generation__wind__twh": "wind_generation__twh", - "generation__fossil__twh": "fossil_generation__twh", - "generation__total_generation__twh": "total_generation__twh", - "demand__total_demand__twh": "total_demand__twh", - "emissions__total_emissions__mtco2": "total_emissions__mtco2", - "emissions__co2_intensity__gco2_kwh": "co2_intensity__gco2_kwh", - "imports__total_net_imports__twh": "total_net_imports__twh", - } - table_ember = table_ember[list(columns)].rename(columns=columns, errors="raise") - - # Create a dataframe with a dummy index. - df_ember = pd.DataFrame(table_ember).reset_index() - - # In BP data, there is a variable "Geo Biomass Other", which combines all other renewables. - # In Ember data, "other renewables" excludes bioenergy. - # To be able to combine both datasets, create a new variable for generation of other renewables including bioenergy. - df_ember["other_renewables_including_bioenergy_generation__twh"] = ( - df_ember["other_renewables_excluding_bioenergy_generation__twh"] + df_ember["bioenergy_generation__twh"] - ) - - return df_ember - - -def add_per_capita_variables(combined: pd.DataFrame) -> pd.DataFrame: - """Add per capita variables (in kWh per person) to the combined BP and Ember dataframe. - - The list of variables to make per capita is given in this function. The new variable names will be 'per_capita_' - followed by the original variable's name, with '__twh' replaced by '__kwh'. - - Parameters - ---------- - combined : pd.DataFrame - Combination of BP's Statistical Review and Ember's Combined Electricity. - - Returns - ------- - combined : pd.DataFrame - Input dataframe after adding per capita variables. - - """ - combined = combined.copy() - - # Variables to make per capita. - per_capita_variables = [ - "bioenergy_generation__twh", - "coal_generation__twh", - "fossil_generation__twh", - "gas_generation__twh", - "hydro_generation__twh", - "low_carbon_generation__twh", - "nuclear_generation__twh", - "oil_generation__twh", - "other_renewables_excluding_bioenergy_generation__twh", - "other_renewables_including_bioenergy_generation__twh", - "renewable_generation__twh", - "solar_generation__twh", - "total_generation__twh", - "wind_generation__twh", - ] - # Add a column for population (only for harmonized countries). - combined = add_population(df=combined, warn_on_missing_countries=False) - - for variable in per_capita_variables: - assert "twh" in variable, f"Variables are assumed to be in TWh, but {variable} is not." - new_column = "per_capita_" + variable.replace("__twh", "__kwh") - combined[new_column] = combined[variable] * TWH_TO_KWH / combined["population"] - - return combined - - -def add_share_variables(combined: pd.DataFrame) -> pd.DataFrame: - """Add variables for the electricity generation as a share of the total electricity generation (as a percentage). - - The following new variables will be created: - * For each source (e.g.
coal_generation__twh) in a list given in this function, a new variable will be created - (named, e.g. coal_share_of_electricity__pct). - * Total electricity generation as a share of primary energy consumption. - * Total net electricity imports as a share of total electricity demand. - - Parameters - ---------- - combined : pd.DataFrame - Combination of BP's Statistical Review and Ember's Combined Electricity. - - Returns - ------- - combined : pd.DataFrame - Input dataframe after adding share variables. - - """ - # Variables to express as a share of electricity (new variable names replace the original variable's - # '_generation__twh' suffix with '_share_of_electricity__pct'). - share_variables = [ - "bioenergy_generation__twh", - "coal_generation__twh", - "fossil_generation__twh", - "gas_generation__twh", - "hydro_generation__twh", - "low_carbon_generation__twh", - "nuclear_generation__twh", - "oil_generation__twh", - "other_renewables_excluding_bioenergy_generation__twh", - "other_renewables_including_bioenergy_generation__twh", - "renewable_generation__twh", - "solar_generation__twh", - "total_generation__twh", - "wind_generation__twh", - ] - for variable in share_variables: - new_column = variable.replace("_generation__twh", "_share_of_electricity__pct") - combined[new_column] = 100 * combined[variable] / combined["total_generation__twh"] - - # Calculate electricity generation as a share of primary energy consumption. - combined["total_electricity_share_of_primary_energy__pct"] = ( - 100 * combined["total_generation__twh"] / combined["primary_energy_consumption__twh"] - ) - - # Calculate the percentage of electricity demand that is imported. - combined["net_imports_share_of_demand__pct"] = ( - 100 * combined["total_net_imports__twh"] / combined["total_demand__twh"] - ) - - # Sanity check. - error = "Total electricity share does not add up to 100%." - assert all(abs(combined["total_share_of_electricity__pct"].dropna() - 100) < 0.01), error - - # Remove unnecessary columns. - combined = combined.drop(columns=["total_share_of_electricity__pct"]) - - return combined - - -def prepare_output_table(combined: pd.DataFrame) -> catalog.Table: - """Convert the combined (BP + Ember) dataframe into a table with the appropriate dataset and variable metadata. - - Parameters - ---------- - combined : pd.DataFrame - BP's Statistical Review combined with Ember's Combined Electricity, after adding per capita variables and - share variables. - - Returns - ------- - table : catalog.Table - Original data in a table format with metadata. - - """ - # Set an appropriate index and sort rows and columns conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Convert dataframe into a table (with no metadata). - table = catalog.Table(combined) - - # Load metadata from yaml file. - table.update_metadata_from_yaml(METADATA_PATH, DATASET_SHORT_NAME) - - return table - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load BP's statistical review dataset. - ds_bp: catalog.Dataset = paths.load_dependency("statistical_review") - # Select main table. - table_bp = ds_bp["statistical_review"] - # Create a convenient dataframe. - df_bp = pd.DataFrame(table_bp) - - # Do the same for Ember's combined electricity dataset. - ds_ember: catalog.Dataset = paths.load_dependency("combined_electricity") - table_ember = ds_ember["combined_electricity"] - df_ember = pd.DataFrame(table_ember) - - # - # Process data. - # - # Prepare BP and Ember data.
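As a compact illustration of the share logic and sanity check in `add_share_variables` above (column names follow the step; the data is invented):

```python
import pandas as pd

df = pd.DataFrame({"coal_generation__twh": [40.0], "wind_generation__twh": [60.0]})
df["total_generation__twh"] = df[["coal_generation__twh", "wind_generation__twh"]].sum(axis=1)

for variable in ["coal_generation__twh", "wind_generation__twh", "total_generation__twh"]:
    new_column = variable.replace("_generation__twh", "_share_of_electricity__pct")
    df[new_column] = 100 * df[variable] / df["total_generation__twh"]

# The total's own share must be 100% (up to numerical noise); it is then dropped.
assert (abs(df["total_share_of_electricity__pct"].dropna() - 100) < 0.01).all()
df = df.drop(columns=["total_share_of_electricity__pct"])
print(df)  # coal 40%, wind 60%
```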
- df_bp = process_bp_data(table_bp=table_bp) - df_ember = process_ember_data(table_ember=table_ember) - - # Combine both tables, giving priority to Ember data (on overlapping values). - combined = combine_two_overlapping_dataframes(df1=df_ember, df2=df_bp, index_columns=["country", "year"]) - - # Add carbon intensities. - # There is already a variable for this in the Ember dataset, but now that we have combined - # BP and Ember data, intensities should be recalculated for consistency. - combined["co2_intensity__gco2_kwh"] = (combined["total_emissions__mtco2"] * MT_TO_G) / ( - combined["total_generation__twh"] * TWH_TO_KWH - ) - - # Add per capita variables. - combined = add_per_capita_variables(combined=combined) - - # Add "share" variables. - combined = add_share_variables(combined=combined) - - # Prepare output table. - table = prepare_output_table(combined=combined) - - # - # Save outputs. - # - ds_garden = catalog.Dataset.create_empty(dest_dir) - # Import metadata from the metadata yaml file. - ds_garden.metadata.update_from_yaml(METADATA_PATH, if_source_exists="replace") - # Create dataset. - ds_garden.save() - - # Add combined tables to the new dataset. - ds_garden.add(table) diff --git a/etl/steps/archive/garden/energy/2022-12-13/owid_energy.meta.yml b/etl/steps/archive/garden/energy/2022-12-13/owid_energy.meta.yml deleted file mode 100644 index 71459dcc410..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-13/owid_energy.meta.yml +++ /dev/null @@ -1,12 +0,0 @@ -dataset: - namespace: energy - version: 2022-12-13 - title: OWID Energy dataset (2022c) - short_name: owid_energy - description: | - OWID Energy dataset. - - This dataset will be loaded by [the energy-data repository](https://github.com/owid/energy-data), to create a csv file of the dataset that can be downloaded in one click. - -# Dataset sources will be created in the step by combining all component datasets' sources. -# Also, table metadata will be built from the tables' metadata and the content of owid_energy_variable_mapping.csv. diff --git a/etl/steps/archive/garden/energy/2022-12-13/owid_energy.py b/etl/steps/archive/garden/energy/2022-12-13/owid_energy.py deleted file mode 100644 index c44601b350f..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-13/owid_energy.py +++ /dev/null @@ -1,214 +0,0 @@ -"""Garden step that combines various datasets related to energy and produces the OWID Energy dataset (2022). - -Datasets combined: -* Energy mix from BP. -* Fossil fuel production (BP & Shift, 2022). -* Primary energy consumption (BP & EIA, 2022). -* Electricity mix (BP & Ember, 2022). - -""" - -from typing import Dict, cast - -import numpy as np -import pandas as pd -from owid import catalog -from owid.datautils import dataframes - -from etl.helpers import PathFinder -from etl.paths import DATA_DIR - -from .shared import ( - BUILD_POPULATION_FOR_HISTORICAL_COUNTRIES, - CURRENT_DIR, - HISTORIC_TO_CURRENT_REGION, - add_population, - gather_sources_from_tables, -) - -paths = PathFinder(__file__) - -# Details for dataset to export. -DATASET_SHORT_NAME = "owid_energy" -DATASET_TITLE = "Energy dataset (OWID, 2022)" -METADATA_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.meta.yml" -# Details for datasets to import. 
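The carbon intensity recalculated above is just a unit conversion: gCO2/kWh = (Mt x 1e12 g) / (TWh x 1e9 kWh), i.e. 1000 x Mt/TWh. A one-line check with invented numbers:

```python
MT_TO_G = 1e12    # megatonnes to grams
TWH_TO_KWH = 1e9  # terawatt-hours to kilowatt-hours

total_emissions_mtco2 = 100.0  # hypothetical emissions
total_generation_twh = 500.0   # hypothetical generation
co2_intensity = (total_emissions_mtco2 * MT_TO_G) / (total_generation_twh * TWH_TO_KWH)
print(co2_intensity)  # 200.0 gCO2/kWh
```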
-ENERGY_MIX_DATASET_PATH = DATA_DIR / "garden/bp/2022-07-14/energy_mix" -ENERGY_MIX_TABLE_NAME = "energy_mix" -FOSSIL_FUEL_PRODUCTION_DATASET_PATH = DATA_DIR / "garden/energy/2022-07-20/fossil_fuel_production" -FOSSIL_FUEL_PRODUCTION_TABLE_NAME = "fossil_fuel_production" -PRIMARY_ENERGY_CONSUMPTION_DATASET_PATH = DATA_DIR / "garden/energy/2022-07-29/primary_energy_consumption" -PRIMARY_ENERGY_CONSUMPTION_TABLE_NAME = "primary_energy_consumption" -ELECTRICITY_MIX_DATASET_PATH = DATA_DIR / "garden/energy/2022-12-13/electricity_mix" -ELECTRICITY_MIX_TABLE_NAME = "electricity_mix" -# Population and GDP are only used to add the population and gdp columns (and no other derived variables). -POPULATION_DATASET_PATH = DATA_DIR / "garden/owid/latest/key_indicators/" -GDP_DATASET_PATH = DATA_DIR / "garden/ggdc/2020-10-01/ggdc_maddison" -# Path to file with the mapping of variable names from the component datasets to the final energy dataset. -VARIABLE_MAPPING_FILE = CURRENT_DIR / "owid_energy_variable_mapping.csv" - - -def combine_tables_data_and_metadata( - tables: Dict[str, catalog.Table], - countries_regions: catalog.Table, - gdp: pd.DataFrame, - variable_mapping: pd.DataFrame, -) -> catalog.Table: - """Combine data and metadata of a list of tables, map variable names and add variable metadata. - - Parameters - ---------- - tables : dict - Dictionary where the key is the short name of the table, and the value is the actual table, for all tables to be - combined. - countries_regions : catalog.Table - Main table from countries-regions dataset. - gdp: pd.DataFrame - GDP (from owid catalog, after converting into a dataframe, resetting index, and selecting country, year and gdp - columns). - variable_mapping : pd.DataFrame - Dataframe (with columns variable, source_variable, source_dataset, description, source) that specifies the names - of variables to take from each table, and their new name in the output table. It also gives a description of the - variable, and the sources of the table. - - Returns - ------- - tb_combined : catalog.Table - Combined table with metadata. - - """ - # Merge all tables as a dataframe (without metadata). - dfs = [pd.DataFrame(table) for table in tables.values()] - df_combined = dataframes.multi_merge(dfs, on=["country", "year"], how="outer") - - # Add ISO codes for countries (regions that are not in countries-regions dataset will have nan iso_code). - df_combined = pd.merge(df_combined, countries_regions, left_on="country", right_on="name", how="left") - - # Add population and gdp of countries (except for dataset-specific regions e.g. those ending in (BP) or (Shift)). - - historical_regions = { - region: HISTORIC_TO_CURRENT_REGION[region] - for region in HISTORIC_TO_CURRENT_REGION - if region in BUILD_POPULATION_FOR_HISTORICAL_COUNTRIES - } - df_combined = add_population(df=df_combined, regions=historical_regions, warn_on_missing_countries=False) - df_combined = pd.merge(df_combined, gdp, on=["country", "year"], how="left") - - # Check that there are no repeated column names. - error = "Repeated columns in combined data." - assert len([column for column in set(df_combined.columns) if "_x" in column]) == 0, error - - # Create a table with combined data and no metadata. - tb_combined = catalog.Table(df_combined) - - # List the names of the variables described in the variable mapping file. - source_variables = variable_mapping.index.get_level_values(0).tolist() - - # Gather original metadata for each variable, add the descriptions and sources from the variable mapping file.
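`dataframes.multi_merge` above outer-merges the component tables on country and year, and the later "_x" check guards against accidental column collisions. A sketch of the same behavior using plain pandas, with `functools.reduce` standing in for the `owid.datautils` helper:

```python
from functools import reduce

import pandas as pd

# Two hypothetical component tables sharing only the index columns.
tables = [
    pd.DataFrame({"country": ["Spain"], "year": [2020], "coal__twh": [5.0]}),
    pd.DataFrame({"country": ["Spain"], "year": [2020], "wind__twh": [20.0]}),
]
combined = reduce(lambda a, b: pd.merge(a, b, on=["country", "year"], how="outer"), tables)

# If two tables shared a data column, pandas would emit "_x"/"_y" suffixes; fail loudly in that case.
assert not [c for c in combined.columns if "_x" in c], "Repeated columns in combined data."
print(combined)
```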
- for source_variable in source_variables: - variable_metadata = variable_mapping.loc[source_variable] - source_dataset = variable_metadata["source_dataset"] - # Check that the variable indeed exists in the original dataset indicated by the variable mapping. - # Ignore columns "country", "year" (assigned to a dummy dataset 'various_datasets'), "population" (that comes - # from key_indicators) and "iso_alpha3" (that comes from countries_regions dataset). - if source_dataset not in [ - "various_datasets", - "countries_regions", - "key_indicators", - "maddison_gdp", - ]: - error = f"Variable {source_variable} not found in any of the original datasets." - assert source_variable in tables[source_dataset].columns, error - tb_combined[source_variable].metadata = tables[source_dataset][source_variable].metadata - - # Update metadata with the content of the variable mapping file. - tb_combined[source_variable].metadata.description = variable_metadata["description"] - tb_combined[source_variable].metadata.sources = [catalog.meta.Source(name=variable_metadata["source"])] - - # Select only variables in the mapping file, and rename variables according to the mapping. - tb_combined = tb_combined[source_variables].rename(columns=variable_mapping.to_dict()["variable"]) - - # Remove rows where all data columns are nan (regardless of whether country, year, iso_code, population and gdp - # have data). - columns_that_must_have_data = [ - column for column in tb_combined.columns if column not in ["country", "year", "iso_code", "population", "gdp"] - ] - tb_combined = tb_combined.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) - - # Sanity check. - columns_with_inf = [column for column in tb_combined.columns if len(tb_combined[tb_combined[column] == np.inf]) > 0] - assert len(columns_with_inf) == 0, f"Infinity values detected in columns: {columns_with_inf}" - - # Set index and sort conveniently. - tb_combined = tb_combined.set_index(["country", "year"], verify_integrity=True).sort_index() - - return cast(catalog.Table, tb_combined) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read all required datasets. - ds_energy_mix = catalog.Dataset(ENERGY_MIX_DATASET_PATH) - ds_fossil_fuels = catalog.Dataset(FOSSIL_FUEL_PRODUCTION_DATASET_PATH) - ds_primary_energy = catalog.Dataset(PRIMARY_ENERGY_CONSUMPTION_DATASET_PATH) - ds_electricity_mix = catalog.Dataset(ELECTRICITY_MIX_DATASET_PATH) - - # Gather all required tables from all datasets. - tb_energy_mix = ds_energy_mix[ENERGY_MIX_TABLE_NAME].reset_index() - tb_fossil_fuels = ds_fossil_fuels[FOSSIL_FUEL_PRODUCTION_TABLE_NAME].reset_index() - tb_primary_energy = ds_primary_energy[PRIMARY_ENERGY_CONSUMPTION_TABLE_NAME].reset_index() - tb_electricity_mix = ds_electricity_mix[ELECTRICITY_MIX_TABLE_NAME].reset_index() - - # Load countries-regions dataset (required to get ISO codes). - countries_regions = cast(catalog.Dataset, paths.load_dependency("regions"))["regions"] - - # Population data will also be loaded (used only to add a population column, and not to create any other derived - # variables). Historical regions will be added to the population. - - # Load gdp (used only to add the gdp column, and no other derived variables). - gdp = ( - pd.DataFrame(catalog.Dataset(GDP_DATASET_PATH)["maddison_gdp"]) - .reset_index()[["country", "year", "gdp"]] - .dropna() - ) - - # Load mapping from variable names in the component datasets to the final variable names in the output dataset.
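The select-and-rename driven by the variable-mapping CSV (whose deleted contents follow below) can be sketched in a few lines; the two mapping rows here are abridged, hypothetical stand-ins for the real file:

```python
import io

import pandas as pd

# Abridged, invented mapping rows; the header matches the deleted CSV's columns.
mapping_csv = io.StringIO(
    "variable,source_variable,source_dataset,description,source\n"
    "coal_consumption,coal__twh,energy_mix,Primary energy from coal (TWh),OWID based on BP\n"
    "gas_consumption,gas__twh,energy_mix,Primary energy from gas (TWh),OWID based on BP\n"
)
variable_mapping = pd.read_csv(mapping_csv).set_index("source_variable")

combined = pd.DataFrame({"coal__twh": [5.0], "gas__twh": [3.0], "unused": [1.0]})
# Keep only the mapped variables and rename them to their public names.
source_variables = variable_mapping.index.tolist()
out = combined[source_variables].rename(columns=variable_mapping.to_dict()["variable"])
print(out.columns.tolist())  # ['coal_consumption', 'gas_consumption']
```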
- variable_mapping = pd.read_csv(VARIABLE_MAPPING_FILE).set_index(["source_variable"]) - - # - # Process data. - # - # Combine all tables. - tables = { - "energy_mix": tb_energy_mix.drop(columns=["country_code"], errors="ignore"), - "fossil_fuel_production": tb_fossil_fuels, - "primary_energy_consumption": tb_primary_energy.drop(columns=["gdp", "population", "source"], errors="ignore"), - "electricity_mix": tb_electricity_mix.drop( - columns=["population", "primary_energy_consumption__twh"], errors="ignore" - ), - } - tb_combined = combine_tables_data_and_metadata( - tables=tables, - countries_regions=countries_regions, - gdp=gdp, - variable_mapping=variable_mapping, - ) - - # - # Save outputs. - # - ds_garden = catalog.Dataset.create_empty(dest_dir) - # Gather metadata sources from all tables' original dataset sources. - ds_garden.metadata.sources = gather_sources_from_tables(tables=list(tables.values())) - # Get the rest of the metadata from the yaml file. - ds_garden.metadata.update_from_yaml(METADATA_PATH) - # Create dataset. - ds_garden.save() - - # Add other metadata fields to table. - tb_combined.metadata.short_name = DATASET_SHORT_NAME - tb_combined.metadata.title = DATASET_TITLE - - # Add combined tables to the new dataset. - ds_garden.add(tb_combined) diff --git a/etl/steps/archive/garden/energy/2022-12-13/owid_energy_variable_mapping.csv b/etl/steps/archive/garden/energy/2022-12-13/owid_energy_variable_mapping.csv deleted file mode 100644 index 8c6c44b25af..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-13/owid_energy_variable_mapping.csv +++ /dev/null @@ -1,130 +0,0 @@ -variable,source_variable,source_dataset,description,source -country,country,various_datasets,Geographic location,Our World in Data -year,year,various_datasets,Year of observation,Our World in Data -iso_code,iso_alpha3,countries_regions,ISO 3166-1 alpha-3 three-letter country codes,International Organization for Standardization -population,population,key_indicators,"Population","Calculated by Our World in Data based on different sources (https://ourworldindata.org/population-sources)" -gdp,gdp,maddison_gdp,"Total real gross domestic product, inflation-adjusted",Maddison Project Database -biofuel_cons_change_pct,biofuels__pct_growth,energy_mix,Annual percentage change in biofuel consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -biofuel_cons_change_twh,biofuels__twh_growth,energy_mix,"Annual change in biofuel consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -biofuel_cons_per_capita,biofuels_per_capita__kwh,energy_mix,"Per capita primary energy consumption from biofuels, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -biofuel_consumption,biofuels__twh,energy_mix,"Primary energy consumption from biofuels, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -biofuel_elec_per_capita,per_capita_bioenergy_generation__kwh,electricity_mix,"Per capita electricity generation from biofuels, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -biofuel_electricity,bioenergy_generation__twh,electricity_mix,"Electricity generation from biofuels, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity 
Review -biofuel_share_elec,bioenergy_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from biofuels,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -biofuel_share_energy,biofuels__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from biofuels,Calculated by Our World in Data based on BP Statistical Review of World Energy -carbon_intensity_elec,co2_intensity__gco2_kwh,electricity_mix,"Carbon intensity of electricity production, measured in grams of carbon dioxide emitted per kilowatt-hour",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -coal_cons_change_pct,coal__pct_growth,energy_mix,Annual percentage change in coal consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -coal_cons_change_twh,coal__twh_growth,energy_mix,"Annual change in coal consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -coal_cons_per_capita,coal_per_capita__kwh,energy_mix,"Per capita primary energy consumption from coal, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -coal_consumption,coal__twh,energy_mix,"Primary energy consumption from coal, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -coal_elec_per_capita,per_capita_coal_generation__kwh,electricity_mix,"Per capita electricity generation from coal, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -coal_electricity,coal_generation__twh,electricity_mix,"Electricity generation from coal, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -coal_prod_change_pct,annual_change_in_coal_production__pct,fossil_fuel_production,Annual percentage change in coal production,Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -coal_prod_change_twh,annual_change_in_coal_production__twh,fossil_fuel_production,"Annual change in coal production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -coal_prod_per_capita,coal_production_per_capita__kwh,fossil_fuel_production,"Per capita coal production, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -coal_production,coal_production__twh,fossil_fuel_production,"Coal production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -coal_share_elec,coal_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from coal,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -coal_share_energy,coal__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from coal,Calculated by Our World in Data based on BP Statistical Review of World Energy -electricity_demand,total_demand__twh,electricity_mix,"Electricity demand, measured in terawatt-hours",Calculated by Our 
World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -electricity_generation,total_generation__twh,electricity_mix,"Electricity generation, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -electricity_share_energy,total_electricity_share_of_primary_energy__pct,electricity_mix,"Electricity generation as a share of primary energy",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -energy_cons_change_pct,annual_change_in_primary_energy_consumption__pct,primary_energy_consumption,Annual percentage change in primary energy consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy and EIA International Energy Data -energy_cons_change_twh,annual_change_in_primary_energy_consumption__twh,primary_energy_consumption,"Annual change in primary energy consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and EIA International Energy Data -energy_per_capita,primary_energy_consumption_per_capita__kwh,primary_energy_consumption,"Primary energy consumption per capita, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and EIA International Energy Data -energy_per_gdp,primary_energy_consumption_per_gdp__kwh_per_dollar,primary_energy_consumption,Energy consumption per unit of GDP. This is measured in kilowatt-hours per 2011 international-$.,"Calculated by Our World in Data based on BP Statistical Review of World Energy, EIA International Energy Data and Maddison Project Database" -fossil_cons_change_pct,fossil_fuels__pct_growth,energy_mix,Annual percentage change in fossil fuel consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -fossil_cons_change_twh,fossil_fuels__twh_growth,energy_mix,"Annual change in fossil fuel consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -fossil_elec_per_capita,per_capita_fossil_generation__kwh,electricity_mix,"Per capita electricity generation from fossil fuels, measured in kilowatt-hours. This is the sum of electricity generated from coal, oil and gas.",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -fossil_electricity,fossil_generation__twh,electricity_mix,"Electricity generation from fossil fuels, measured in terawatt-hours. This is the sum of electricity generation from coal, oil and gas.",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -fossil_energy_per_capita,fossil_fuels_per_capita__kwh,energy_mix,"Per capita fossil fuel consumption, measured in kilowatt-hours. This is the sum of primary energy from coal, oil and gas.",Calculated by Our World in Data based on BP Statistical Review of World Energy -fossil_fuel_consumption,fossil_fuels__twh,energy_mix,"Fossil fuel consumption, measured in terawatt-hours. 
This is the sum of primary energy from coal, oil and gas.",Calculated by Our World in Data based on BP Statistical Review of World Energy -fossil_share_elec,fossil_share_of_electricity__pct,electricity_mix,"Share of electricity generation that comes from fossil fuels (coal, oil and gas combined)",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -fossil_share_energy,fossil_fuels__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from fossil fuels,Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_cons_change_pct,gas__pct_growth,energy_mix,Annual percentage change in gas consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_cons_change_twh,gas__twh_growth,energy_mix,"Annual change in gas consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_consumption,gas__twh,energy_mix,"Primary energy consumption from gas, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_elec_per_capita,per_capita_gas_generation__kwh,electricity_mix,"Per capita electricity generation from gas, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -gas_electricity,gas_generation__twh,electricity_mix,"Electricity generation from gas, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -gas_energy_per_capita,gas_per_capita__kwh,energy_mix,"Per capita primary energy consumption from gas, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_prod_change_pct,annual_change_in_gas_production__pct,fossil_fuel_production,Annual percentage change in gas production,Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -gas_prod_change_twh,annual_change_in_gas_production__twh,fossil_fuel_production,"Annual change in gas production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -gas_prod_per_capita,gas_production_per_capita__kwh,fossil_fuel_production,"Per capita gas production, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -gas_production,gas_production__twh,fossil_fuel_production,"Gas production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -gas_share_elec,gas_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from gas,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -gas_share_energy,gas__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from gas,Calculated by Our World in Data based on BP Statistical Review of World Energy -greenhouse_gas_emissions,total_emissions__mtco2,electricity_mix,"Greenhouse-gas emissions produced in the generation of electricity, measured in million tonnes of CO2 equivalent",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and 
European Electricity Review -hydro_cons_change_pct,hydro__pct_growth,energy_mix,Annual percentage change in hydropower consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -hydro_cons_change_twh,hydro__twh_growth__equivalent,energy_mix,"Annual change in hydropower consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -hydro_consumption,hydro__twh__equivalent,energy_mix,"Primary energy consumption from hydropower, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -hydro_elec_per_capita,per_capita_hydro_generation__kwh,electricity_mix,"Per capita electricity generation from hydropower, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -hydro_electricity,hydro_generation__twh,electricity_mix,"Electricity generation from hydropower, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -hydro_energy_per_capita,hydro_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from hydropower, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -hydro_share_elec,hydro_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from hydropower,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -hydro_share_energy,hydro__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from hydropower,Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_cons_change_pct,low_carbon_energy__pct_growth,energy_mix,Annual percentage change in low-carbon energy consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_cons_change_twh,low_carbon_energy__twh_growth__equivalent,energy_mix,"Annual change in low-carbon energy consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_consumption,low_carbon_energy__twh__equivalent,energy_mix,"Primary energy consumption from low-carbon sources, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_elec_per_capita,per_capita_low_carbon_generation__kwh,electricity_mix,"Per capita electricity generation from low-carbon sources, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -low_carbon_electricity,low_carbon_generation__twh,electricity_mix,"Electricity generation from low-carbon sources, measured in terawatt-hours. 
This is the sum of electricity generation from renewables and nuclear power",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -low_carbon_energy_per_capita,low_carbon_energy_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from low-carbon sources, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_share_elec,low_carbon_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from low-carbon sources. This is the sum of electricity from renewables and nuclear,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -low_carbon_share_energy,low_carbon_energy__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from low-carbon sources. This is the sum of primary energy from renewables and nuclear,Calculated by Our World in Data based on BP Statistical Review of World Energy -net_elec_imports,total_net_imports__twh,electricity_mix,"Net electricity imports, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -net_elec_imports_share_demand,net_imports_share_of_demand__pct,electricity_mix,Net electricity imports as a share of electricity demand,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -nuclear_cons_change_pct,nuclear__pct_growth,energy_mix,Annual percentage change in nuclear consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -nuclear_cons_change_twh,nuclear__twh_growth__equivalent,energy_mix,"Annual change in nuclear consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -nuclear_consumption,nuclear__twh__equivalent,energy_mix,"Primary energy consumption from nuclear power, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -nuclear_elec_per_capita,per_capita_nuclear_generation__kwh,electricity_mix,"Per capita electricity generation from nuclear power, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -nuclear_electricity,nuclear_generation__twh,electricity_mix,"Electricity generation from nuclear power, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -nuclear_energy_per_capita,nuclear_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from nuclear, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -nuclear_share_elec,nuclear_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from nuclear power,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -nuclear_share_energy,nuclear__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from nuclear power,Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_cons_change_pct,oil__pct_growth,energy_mix,Annual percentage change in oil 
consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_cons_change_twh,oil__twh_growth,energy_mix,"Annual change in oil consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_consumption,oil__twh,energy_mix,"Primary energy consumption from oil, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_elec_per_capita,per_capita_oil_generation__kwh,electricity_mix,"Per capita electricity generation from oil, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -oil_electricity,oil_generation__twh,electricity_mix,"Electricity generation from oil, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -oil_energy_per_capita,oil_per_capita__kwh,energy_mix,"Per capita primary energy consumption from oil, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_prod_change_pct,annual_change_in_oil_production__pct,fossil_fuel_production,Annual percentage change in oil production,Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -oil_prod_change_twh,annual_change_in_oil_production__twh,fossil_fuel_production,"Annual change in oil production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -oil_prod_per_capita,oil_production_per_capita__kwh,fossil_fuel_production,"Per capita oil production, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -oil_production,oil_production__twh,fossil_fuel_production,"Oil production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -oil_share_elec,oil_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from oil,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -oil_share_energy,oil__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from oil,Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewable_consumption,other_renewables__twh__equivalent,energy_mix,"Primary energy consumption from other renewables, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewable_electricity,other_renewables_including_bioenergy_generation__twh,electricity_mix,"Electricity generation from other renewable sources including biofuels, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewable_exc_biofuel_electricity,other_renewables_excluding_bioenergy_generation__twh,electricity_mix,"Electricity generation from other renewable sources excluding biofuels, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_cons_change_pct,other_renewables__pct_growth,energy_mix,Annual percentage change in energy 
consumption from other renewables,Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewables_cons_change_twh,other_renewables__twh_growth__equivalent,energy_mix,"Annual change in other renewable consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewables_elec_per_capita,per_capita_other_renewables_including_bioenergy_generation__kwh,electricity_mix,"Per capita electricity generation from other renewables including biofuels, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_elec_per_capita_exc_biofuel,per_capita_other_renewables_excluding_bioenergy_generation__kwh,electricity_mix,"Per capita electricity generation from other renewables excluding biofuels, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_energy_per_capita,other_renewables_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from other renewables, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewables_share_elec,other_renewables_including_bioenergy_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from other renewables including biofuels,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_share_elec_exc_biofuel,other_renewables_excluding_bioenergy_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from other renewables excluding biofuels,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_share_energy,other_renewables__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from other renewables,Calculated by Our World in Data based on BP Statistical Review of World Energy -per_capita_electricity,per_capita_total_generation__kwh,electricity_mix,"Electricity generation per capita, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -primary_energy_consumption,primary_energy_consumption__twh,primary_energy_consumption,"Primary energy consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and EIA International Energy Data -renewables_cons_change_pct,renewables__pct_growth,energy_mix,Annual percentage change in renewable energy consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -renewables_cons_change_twh,renewables__twh_growth__equivalent,energy_mix,"Annual change in renewable energy consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -renewables_consumption,renewables__twh__equivalent,energy_mix,"Primary energy consumption from renewables, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -renewables_elec_per_capita,per_capita_renewable_generation__kwh,electricity_mix,"Per capita electricity generation from renewables, measured in kilowatt-hours",Calculated by Our 
World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -renewables_electricity,renewable_generation__twh,electricity_mix,"Electricity generation from renewables, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -renewables_energy_per_capita,renewables_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from renewables, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -renewables_share_elec,renewable_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from renewables,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -renewables_share_energy,renewables__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from renewables,Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_cons_change_pct,solar__pct_growth,energy_mix,Annual percentage change in solar consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_cons_change_twh,solar__twh_growth__equivalent,energy_mix,"Annual change in solar consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_consumption,solar__twh__equivalent,energy_mix,"Primary energy consumption from solar, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_elec_per_capita,per_capita_solar_generation__kwh,electricity_mix,"Per capita electricity generation from solar, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -solar_electricity,solar_generation__twh,electricity_mix,"Electricity generation from solar, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -solar_energy_per_capita,solar_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from solar, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_share_elec,solar_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from solar,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -solar_share_energy,solar__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from solar,Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_cons_change_pct,wind__pct_growth,energy_mix,Annual percentage change in wind consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_cons_change_twh,wind__twh_growth__equivalent,energy_mix,"Annual change in wind consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_consumption,wind__twh__equivalent,energy_mix,"Primary energy consumption from wind, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_elec_per_capita,per_capita_wind_generation__kwh,electricity_mix,"Per capita 
electricity generation from wind, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -wind_electricity,wind_generation__twh,electricity_mix,"Electricity generation from wind, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -wind_energy_per_capita,wind_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from wind, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_share_elec,wind_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from wind,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -wind_share_energy,wind__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from wind,Calculated by Our World in Data based on BP Statistical Review of World Energy diff --git a/etl/steps/archive/garden/energy/2022-12-13/shared.py b/etl/steps/archive/garden/energy/2022-12-13/shared.py deleted file mode 100644 index 25488f162a2..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-13/shared.py +++ /dev/null @@ -1,480 +0,0 @@ -from pathlib import Path -from typing import Any, Dict, List, Optional, Union, cast - -import numpy as np -import pandas as pd -from owid import catalog - -from etl.data_helpers import geo -from etl.paths import DATA_DIR - -CURRENT_DIR = Path(__file__).parent -VERSION = CURRENT_DIR.name - -# When creating region aggregates, decide how to distribute historical regions. -# The following decisions are based on the current location of the countries that succeeded the region, and their income -# group. The continent and income group assigned correspond to the continent and income group of the majority of the -# population in the member countries. -HISTORIC_TO_CURRENT_REGION: Dict[str, Dict[str, Union[str, List[str]]]] = { - "Czechoslovakia": { - "continent": "Europe", - "income_group": "High-income countries", - "regions_included": [ - # Europe - High-income countries. - "Czechia", - "Slovakia", - ], - }, - "East Germany": { - "continent": "Europe", - "income_group": "", - "regions_included": [ - # Europe - High-income countries. - "Germany", - ], - }, - "West Germany": { - "continent": "Europe", - "income_group": "", - "regions_included": [ - # Europe - High-income countries. - "Germany", - ], - }, - "Netherlands Antilles": { - "continent": "North America", - "income_group": "High-income countries", - "regions_included": [ - # North America - High-income countries. - "Aruba", - "Curacao", - "Sint Maarten (Dutch part)", - "Bonaire Sint Eustatius and Saba", - ], - }, - "Serbia and Montenegro": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "regions_included": [ - # Europe - Upper-middle-income countries. - "Serbia", - "Montenegro", - ], - }, - "North Yemen": { - "continent": "Asia", - "income_group": "Low-income countries", - "regions_included": [ - # Asia - Low-income countries. - "Yemen", - ], - }, - "South Yemen": { - "continent": "Asia", - "income_group": "Low-income countries", - "regions_included": [ - # Asia - Low-income countries. - "Yemen", - ], - }, - "USSR": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "regions_included": [ - # Europe - High-income countries.
- "Lithuania", - "Estonia", - "Latvia", - # Europe - Upper-middle-income countries. - "Moldova", - "Belarus", - "Russia", - # Europe - Lower-middle-income countries. - "Ukraine", - # Asia - Upper-middle-income countries. - "Georgia", - "Armenia", - "Azerbaijan", - "Turkmenistan", - "Kazakhstan", - # Asia - Lower-middle-income countries. - "Kyrgyzstan", - "Uzbekistan", - "Tajikistan", - ], - }, - "Yugoslavia": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "regions_included": [ - # Europe - High-income countries. - "Croatia", - "Slovenia", - # Europe - Upper-middle-income countries. - "North Macedonia", - "Bosnia and Herzegovina", - "Serbia", - "Montenegro", - ], - }, -} - -# Historical countries whose population can be built by adding up the population of their successor countries. -# Those historical countries not listed here will have no population data. -BUILD_POPULATION_FOR_HISTORICAL_COUNTRIES = [ - # The following regions split into smaller ones, and can be estimated by the population of the successors. - "Czechoslovakia", - "Netherlands Antilles", - "Serbia and Montenegro", - "USSR", - "Yugoslavia", - # The following countries cannot be replaced by the successor countries. - # 'East Germany', - # 'West Germany', - # 'North Yemen', - # 'South Yemen', -] - - -# Historical countries for which we don't have population, and can't be built from successor countries. -EXPECTED_COUNTRIES_WITHOUT_POPULATION = list( - set(HISTORIC_TO_CURRENT_REGION) - set(BUILD_POPULATION_FOR_HISTORICAL_COUNTRIES) -) - -# Overlaps found between historical regions and successor countries, that we accept in the data. -# We accept them either because they happened close to the transition, or to avoid needing to introduce new -# countries for which we do not have data (like the Russian Empire). -ACCEPTED_OVERLAPS = { - # 1991: {"Georgia", "USSR"}, -} - - -def gather_sources_from_tables( - tables: List[catalog.Table], -) -> List[catalog.meta.Source]: - """Gather unique sources from the metadata.dataset of each table in a list of tables. - - Note: To check if a source is already listed, only the name of the source is considered (not the description or any - other field in the source). - - Parameters - ---------- - tables : list - List of tables with metadata. - - Returns - ------- - known_sources : list - List of unique sources from all tables. - - """ - # Initialise list that will gather all unique metadata sources from the tables. - known_sources: List[catalog.meta.Source] = [] - for table in tables: - # Get list of sources of the dataset of current table. - table_sources = table.metadata.dataset.sources - # Go source by source of current table, and check if its name is not already in the list of known_sources. - for source in table_sources: - # Check if this source's name is different to all known_sources. - if all([source.name != known_source.name for known_source in known_sources]): - # Add the new source to the list. - known_sources.append(source) - - return known_sources - - -def get_countries_in_region( - region: str, region_modifications: Optional[Dict[str, Dict[str, List[str]]]] = None -) -> List[str]: - """Get countries in a region, both for known regions (e.g. "Africa") and custom ones (e.g. "Europe (excl. EU-27)"). - - Parameters - ---------- - region : str - Region name (e.g. "Africa", or "Europe (excl. EU-27)"). - region_modifications : dict or None - If None (or an empty dictionary), the region should be in OWID's countries-regions dataset. 
- If not None, it should be a dictionary with any (or all) of the following keys: - - "regions_included": List of regions whose countries will be included. - - "regions_excluded": List of regions whose countries will be excluded. - - "countries_included": List of additional individual countries to be included. - - "countries_excluded": List of additional individual countries to be excluded. - NOTE: All regions and countries defined in this dictionary should be in OWID's countries-regions dataset. - - Returns - ------- - countries : list - List of countries in the specified region. - - """ - if region_modifications is None: - region_modifications = {} - - # Check that the fields in the region_modifications dictionary are well defined. - expected_fields = ["regions_included", "regions_excluded", "countries_included", "countries_excluded"] - assert all([field in expected_fields for field in region_modifications]) - - # Get lists of regions whose countries will be included and excluded. - regions_included = region_modifications.get("regions_included", [region]) - regions_excluded = region_modifications.get("regions_excluded", []) - # Get lists of additional individual countries to include and exclude. - countries_included = region_modifications.get("countries_included", []) - countries_excluded = region_modifications.get("countries_excluded", []) - - # List countries from the list of regions included. - countries_set = set( - sum([geo.list_countries_in_region(region_included) for region_included in regions_included], []) - ) - - # Remove all countries from the list of regions excluded. - countries_set -= set( - sum([geo.list_countries_in_region(region_excluded) for region_excluded in regions_excluded], []) - ) - - # Add the list of individual countries to be included. - countries_set |= set(countries_included) - - # Remove the list of individual countries to be excluded. - countries_set -= set(countries_excluded) - - # Convert set of countries into a sorted list. - countries = sorted(countries_set) - - return countries - - -def load_population(regions: Optional[Dict[Any, Any]] = None) -> pd.DataFrame: - """Load OWID population dataset, and add historical regions to it. - - Returns - ------- - population : pd.DataFrame - Population dataset. - - """ - # Load population dataset. - population = catalog.Dataset(DATA_DIR / "garden/owid/latest/key_indicators/")["population"].reset_index()[ - ["country", "year", "population"] - ] - - # Add data for historical regions (if not in population) by adding the population of their current successors. - countries_with_population = population["country"].unique() - - # Consider additional regions (e.g. historical regions). - if regions is None: - regions = {} - missing_countries = [country for country in regions if country not in countries_with_population] - for country in missing_countries: - members = regions[country]["regions_included"] - _population = ( - population[population["country"].isin(members)] - .groupby("year") - .agg({"population": "sum", "country": "nunique"}) - .reset_index() - ) - # Select only years for which we have data for all member countries. - _population = _population[_population["country"] == len(members)].reset_index(drop=True) - _population["country"] = country - population = pd.concat([population, _population], ignore_index=True).reset_index(drop=True) - - error = "Duplicate country-years found in population. Check if historical regions changed."
- assert population[population.duplicated(subset=["country", "year"])].empty, error - - return cast(pd.DataFrame, population) - - -def load_income_groups() -> pd.DataFrame: - """Load dataset of income groups and add historical regions to it. - - Returns - ------- - income_groups : pd.DataFrame - Income groups data. - - """ - # Load the World Bank dataset for income groups. - income_groups = catalog.Dataset(DATA_DIR / "garden/wb/2021-07-01/wb_income")["wb_income_group"].reset_index() - - # Add historical regions to income groups. - for historic_region in HISTORIC_TO_CURRENT_REGION: - historic_region_income_group = HISTORIC_TO_CURRENT_REGION[historic_region]["income_group"] - if historic_region not in income_groups["country"]: - historic_region_df = pd.DataFrame( - { - "country": [historic_region], - "income_group": [historic_region_income_group], - } - ) - income_groups = pd.concat([income_groups, historic_region_df], ignore_index=True) - - return cast(pd.DataFrame, income_groups) - - -def add_population( - df: pd.DataFrame, - country_col: str = "country", - year_col: str = "year", - population_col: str = "population", - interpolate_missing_population: bool = False, - warn_on_missing_countries: bool = True, - show_full_warning: bool = True, - regions: Optional[Dict[Any, Any]] = None, - expected_countries_without_population: List[str] = [], -) -> pd.DataFrame: - """Add a column of OWID population to the countries in the data, including population of historical regions. - - This function has been adapted from datautils.geo, because population currently does not include historic regions. - We include them in this function. - - Parameters - ---------- - df : pd.DataFrame - Data without a column for population (after harmonizing elements, items and country names). - country_col : str - Name of country column in data. - year_col : str - Name of year column in data. - population_col : str - Name for new population column in data. - interpolate_missing_population : bool - True to linearly interpolate population on years that are present in df, but for which we do not have - population data; otherwise False to keep missing population data as nans. - For example, if interpolate_missing_population is True and df has data for all years between 1900 and 1910, - but population is only given for 1900 and 1910, population will be linearly interpolated between those years. - warn_on_missing_countries : bool - True to warn if population is not found for any of the countries in the data. - show_full_warning : bool - True to show affected countries if the previous warning is raised. - regions : dict - Definitions of regions whose population also needs to be included. - expected_countries_without_population : list - Countries that are expected to not have population (and should be ignored if warnings are activated). - - Returns - ------- - df_with_population : pd.DataFrame - Data after adding a column for population for all countries in the data. - - """ - - # Load population dataset. - population = load_population(regions=regions).rename( - columns={ - "country": country_col, - "year": year_col, - "population": population_col, - } - )[[country_col, year_col, population_col]] - - # Check if there is any missing country.
- missing_countries = set(df[country_col]) - set(population[country_col]) - if len(missing_countries) > 0: - if warn_on_missing_countries: - geo.warn_on_list_of_entities( - list_of_entities=missing_countries, - warning_message=( - f"{len(missing_countries)} countries not found in population" - " dataset. They will remain in the dataset, but have nan" - " population." - ), - show_list=show_full_warning, - ) - - if interpolate_missing_population: - # For some countries we have population data only on certain years, e.g. 1900, 1910, etc. - # Optionally fill missing years linearly. - countries_in_data = df[country_col].unique() - years_in_data = df[year_col].unique() - - population = population.set_index([country_col, year_col]).reindex( - pd.MultiIndex.from_product([countries_in_data, years_in_data], names=[country_col, year_col]) - ) - - population = population.groupby(country_col).transform( - lambda x: x.interpolate(method="linear", limit_direction="both") - ) - - error = "Countries without population data differ from the list of expected countries without population data." - assert set(population[population[population_col].isnull()].reset_index()[country_col]) == set( - expected_countries_without_population - ), error - - # Add population to original dataframe. - df_with_population = pd.merge(df, population, on=[country_col, year_col], how="left") - - return df_with_population - - -def detect_overlapping_regions( - df, index_columns, region_and_members, country_col="country", year_col="year", ignore_zeros=True -): - """Detect years on which the data for two regions overlap, e.g. a historical region and one of its successors. - - Parameters - ---------- - df : pd.DataFrame - Data (with a dummy index). - index_columns : list - Names of index columns. - region_and_members : dict - Regions to check for overlaps. Each region must have a dictionary "regions_included", listing the subregions - contained. If the region is historical, "regions_included" would be the list of successor countries. - country_col : str, optional - Name of country column (usually "country"). - year_col : str, optional - Name of year column (usually "year"). - ignore_zeros : bool, optional - True to ignore overlaps of zeros. - - Returns - ------- - all_overlaps : dict - All overlaps found. - - """ - # Sum over all columns to get the total sum of each column for each country-year. - df_total = ( - df.groupby([country_col, year_col]) - .agg({column: "sum" for column in df.columns if column not in index_columns}) - .reset_index() - ) - # Create a list of values that will be ignored in overlaps (usually zero or nothing). - if ignore_zeros: - overlapping_values_to_ignore = [0] - else: - overlapping_values_to_ignore = [] - # List all variables in data (ignoring index columns). - variables = [column for column in df.columns if column not in index_columns] - # List all country names found in data. - countries_in_data = df[country_col].unique().tolist() - # List all regions found in data. - regions = [country for country in list(region_and_members) if country in countries_in_data] - # Initialize a dictionary that will store all overlaps found. - all_overlaps = {} - for region in regions: - # List members of current region. - members = [member for member in region_and_members[region]["regions_included"] if member in countries_in_data] - for member in members: - # Select data for current region.
- region_values = ( - df_total[df_total[country_col] == region] - .replace(overlapping_values_to_ignore, np.nan) - .dropna(subset=variables, how="all") - ) - # Select data for current member. - member_values = ( - df_total[df_total[country_col] == member] - .replace(overlapping_values_to_ignore, np.nan) - .dropna(subset=variables, how="all") - ) - # Concatenate both selections of data, and select duplicated rows. - combined = pd.concat([region_values, member_values]) - overlaps = combined[combined.duplicated(subset=[year_col], keep=False)] # type: ignore - if len(overlaps) > 0: - # Add the overlap found to the dictionary of all overlaps. - all_overlaps.update({year: set(overlaps[country_col]) for year in overlaps[year_col].unique()}) - - # Sort overlaps conveniently. - all_overlaps = {year: all_overlaps[year] for year in sorted(list(all_overlaps))} - - return all_overlaps diff --git a/etl/steps/archive/garden/energy/2022-12-13/uk_historical_electricity.meta.yml b/etl/steps/archive/garden/energy/2022-12-13/uk_historical_electricity.meta.yml deleted file mode 100644 index a864510d549..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-13/uk_historical_electricity.meta.yml +++ /dev/null @@ -1,91 +0,0 @@ -dataset: - namespace: energy - version: 2022-12-13 - title: UK historical electricity (DUKES, 2022b) - short_name: uk_historical_electricity - description: | - All data prior to 1985 (and prior to 1965 in the case of renewables) is sourced from [the Digest of UK Energy Statistics (DUKES), published by the UK's Department for Business, Energy & Industrial Strategy](https://www.gov.uk/government/statistics/electricity-chapter-5-digest-of-united-kingdom-energy-statistics-dukes). - - All other data is sourced from [BP's Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html) and [Ember's Yearly Electricity Data](https://ember-climate.org/data-catalogue/yearly-electricity-data/). Where data from BP is available for a given year, we rely on it as the primary source. We then supplement this with data from Ember where data from BP is not available.
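The "primary source plus supplement" rule described above is what combine_two_overlapping_dataframes from owid.datautils implements (both step files below use it). A minimal sketch of that behavior with made-up toy values, assuming df1 takes precedence on overlapping rows:

import pandas as pd

from owid.datautils import dataframes

# Toy frames (values are hypothetical): the preferred source covers 1984-1985,
# the supplementary source covers 1985-1986.
df_primary = pd.DataFrame(
    {"country": ["United Kingdom"] * 2, "year": [1984, 1985], "coal_generation": [60.0, 58.0]}
)
df_secondary = pd.DataFrame(
    {"country": ["United Kingdom"] * 2, "year": [1985, 1986], "coal_generation": [57.5, 55.0]}
)

# Rows are matched on the index columns; both sources have 1985, so the preferred
# source's 58.0 is kept and the secondary source only contributes 1986.
combined = dataframes.combine_two_overlapping_dataframes(
    df1=df_primary, df2=df_secondary, index_columns=["country", "year"]
)
print(combined)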
- sources: - - name: Digest of UK Energy Statistics - published_by: UK's Department for Business, Energy & Industrial Strategy - date_accessed: 2022-09-21 - url: https://www.gov.uk/government/statistical-data-sets/historical-electricity-data - - name: BP Statistical Review of World Energy - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - - name: Ember's Yearly Electricity Data - published_by: Ember - publication_year: 2022 - date_accessed: 2022-12-13 - url: https://ember-climate.org/data-catalogue/yearly-electricity-data/ - - name: Ember's European Electricity Review - published_by: Ember - publication_year: 2022 - date_accessed: 2022-08-01 - url: https://ember-climate.org/insights/research/european-electricity-review-2022/ -tables: - uk_historical_electricity: - variables: - coal_generation: - title: Electricity generation from coal - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - oil_generation: - title: Electricity generation from oil - short_unit: TWh - unit: terawatt-hours - display: - name: Oil - gas_generation: - title: Electricity generation from gas - short_unit: TWh - unit: terawatt-hours - display: - name: Natural gas - nuclear_generation: - title: Electricity generation from nuclear - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - hydro_generation: - title: Electricity generation from hydropower - short_unit: TWh - unit: terawatt-hours - display: - name: Hydropower - solar_generation: - title: Electricity generation from solar - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - wind_generation: - title: Electricity generation from wind - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - other_renewables_generation: - title: Electricity generation from other renewables - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables - total_generation: - title: Total electricity generation - short_unit: TWh - unit: terawatt-hours - display: - name: Total electricity generation - net_imports: - title: Net electricity imports - short_unit: TWh - unit: terawatt-hours - display: - name: Net electricity imports diff --git a/etl/steps/archive/garden/energy/2022-12-13/uk_historical_electricity.py b/etl/steps/archive/garden/energy/2022-12-13/uk_historical_electricity.py deleted file mode 100644 index e4f3df578a4..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-13/uk_historical_electricity.py +++ /dev/null @@ -1,205 +0,0 @@ -"""Combine UK BEIS' historical electricity with our electricity mix dataset (by BP & Ember) -to obtain a long-run electricity mix in the UK. - -""" - -from typing import cast - -import numpy as np -import pandas as pd -from owid import catalog -from owid.datautils import dataframes - -from etl.helpers import PathFinder - -# Get relevant paths for current file. -paths = PathFinder(__file__) - - -def prepare_electricity_mix_data(df_elec: pd.DataFrame) -> pd.DataFrame: - """Select necessary columns from the electricity mix, and select rows corresponding to the UK. - - Parameters - ---------- - df_elec : pd.DataFrame - Data from the main table of the electricity mix dataset. - - Returns - ------- - df_elec : pd.DataFrame - Selected columns and rows from the electricity mix data. - - """ - df_elec = df_elec.copy() - - # Select columns and rename them conveniently. 
- elec_columns = { - "country": "country", - "year": "year", - "coal_generation__twh": "coal_generation", - "gas_generation__twh": "gas_generation", - "oil_generation__twh": "oil_generation", - "hydro_generation__twh": "hydro_generation", - "nuclear_generation__twh": "nuclear_generation", - "other_renewables_including_bioenergy_generation__twh": "other_renewables_generation", - "solar_generation__twh": "solar_generation", - "total_generation__twh": "total_generation", - "wind_generation__twh": "wind_generation", - "total_net_imports__twh": "net_imports", - } - - # Select necessary columns from electricity mix dataset. - df_elec = df_elec[list(elec_columns)].rename(columns=elec_columns) - - # Select UK data from the electricity mix dataset. - df_elec = df_elec[df_elec["country"] == "United Kingdom"].reset_index(drop=True) - - return df_elec - - -def prepare_beis_data(df_beis: pd.DataFrame) -> pd.DataFrame: - """Select (and rename) columns from the UK historical electricity data from BEIS. - - Parameters - ---------- - df_beis : pd.DataFrame - Combined data for UK historical electricity data from BEIS. - - Returns - ------- - df_beis : pd.DataFrame - Selected columns from the UK historical electricity data. - - """ - df_beis = df_beis.copy() - - # Select columns and rename them conveniently. - beis_columns = { - "country": "country", - "year": "year", - "coal": "coal_generation", - "oil": "oil_generation", - "electricity_generation": "total_generation", - "gas": "gas_generation", - "hydro": "hydro_generation", - "nuclear": "nuclear_generation", - "net_imports": "net_imports", - "implied_efficiency": "implied_efficiency", - "wind_and_solar": "wind_and_solar_generation", - } - df_beis = df_beis[list(beis_columns)].rename(columns=beis_columns) - - return df_beis - - -def combine_beis_and_electricity_mix_data(df_beis: pd.DataFrame, df_elec: pd.DataFrame) -> pd.DataFrame: - """Combine BEIS data on UK historical electricity with the electricity mix data (after having selected rows for only - the UK). - - Several processing steps are applied to the data; see the comments in the code below. - - Parameters - ---------- - df_beis : pd.DataFrame - Selected data from BEIS on UK historical electricity. - df_elec : pd.DataFrame - Selected data from the electricity mix (after having selected rows for the UK). - - Returns - ------- - df_combined : pd.DataFrame - Combined and processed data with a verified index. - - """ - # In the BEIS dataset, wind and solar are given as one joined variable. - # Check if we can ignore it (since it's better to have the two sources separately). - # Find the earliest year with data in the electricity mix for solar or wind generation. - solar_or_wind_first_year = df_elec[df_elec["wind_generation"].notnull() | df_elec["solar_generation"].notnull()][ - "year" - ].min() - # Now check that, prior to that year, all generation from solar and wind was zero. - assert df_beis[df_beis["year"] < solar_or_wind_first_year]["wind_and_solar_generation"].fillna(0).max() == 0 - # Therefore, since wind and solar generation is always zero (prior to the beginning of the electricity mix data) - # we can ignore this column from the BEIS dataset. - df_beis = df_beis.drop(columns=["wind_and_solar_generation"]) - # And create two columns of zeros for wind and solar.
- df_beis["solar_generation"] = 0 - df_beis["wind_generation"] = 0 - # Similarly, given that in the BEIS dataset there is no data about other renewable sources (apart from hydro, solar - # and wind), we can assume that the contribution from other renewables is zero. - df_beis["other_renewables_generation"] = 0 - # And ensure these new columns do not have any values after the electricity mix data begins. - df_beis.loc[ - df_beis["year"] >= solar_or_wind_first_year, - ["solar_generation", "wind_generation", "other_renewables_generation"], - ] = np.nan - - # BEIS data on fuel input gives raw energy, but we want electricity generation (which is less, given the - # inefficiencies of the process of burning fossil fuels). - # They also include a variable on "implied efficiency", which they obtain by dividing the input energy by the total - # electricity generation. - # We multiply the raw energy by the efficiency to have an estimate of the electricity generated by each fossil fuel. - # This only affects data prior to the beginning of the electricity mix's data (which is 1965 for renewables and - # nuclear, and 1985 for the rest). - for source in ["coal", "oil", "gas"]: - df_beis[f"{source}_generation"] *= df_beis["implied_efficiency"] - - # Drop other unnecessary columns. - df_beis = df_beis.drop(columns=["implied_efficiency"]) - - # Combine BEIS and electricity mix data. - df_combined = dataframes.combine_two_overlapping_dataframes( - df1=df_elec, df2=df_beis, index_columns=["country", "year"] - ) - - # Add an index and sort conveniently. - df_combined = df_combined.set_index(["country", "year"]).sort_index().sort_index(axis=1) - - return cast(pd.DataFrame, df_combined) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read all required datasets. - ds_beis: catalog.Dataset = paths.load_dependency("uk_historical_electricity") - ds_elec: catalog.Dataset = paths.load_dependency("electricity_mix") - - # Gather all required tables from all datasets. - tb_beis = ds_beis["uk_historical_electricity"] - tb_elec = ds_elec["electricity_mix"] - - # Create convenient dataframes. - df_beis = pd.DataFrame(tb_beis).reset_index() - df_elec = pd.DataFrame(tb_elec).reset_index() - - # - # Process data. - # - # Prepare electricity mix data. - df_elec = prepare_electricity_mix_data(df_elec=df_elec) - # Prepare BEIS data. - df_beis = prepare_beis_data(df_beis=df_beis) - - # Combine BEIS and electricity mix data. - df_combined = combine_beis_and_electricity_mix_data(df_beis=df_beis, df_elec=df_elec) - - # Create a new table with combined data (and no metadata). - tb_combined = catalog.Table(df_combined) - - # - # Save outputs. - # - # Create new garden dataset. - ds_garden = catalog.Dataset.create_empty(dest_dir) - - # Add combined table to the new dataset. - tb_combined.metadata.short_name = "uk_historical_electricity" - ds_garden.add(tb_combined) - - # Update the rest of the metadata from the yaml file. - ds_garden.update_metadata(paths.metadata_path, if_source_exists="replace") - - # Save dataset. 
- ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2022-12-28/electricity_mix.meta.yml b/etl/steps/archive/garden/energy/2022-12-28/electricity_mix.meta.yml deleted file mode 100644 index 84faa0b650e..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-28/electricity_mix.meta.yml +++ /dev/null @@ -1,390 +0,0 @@ -dataset: - namespace: energy - version: 2022-12-28 - title: Electricity mix (BP & Ember, 2022d) - short_name: electricity_mix - description: | - Data is compiled by Our World in Data based on three main sources: - - [BP Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html). - - [Ember Yearly Electricity Data (2022)](https://ember-climate.org/data-catalogue/yearly-electricity-data/). - - [Ember European Electricity Review (2022)](https://ember-climate.org/insights/research/european-electricity-review-2022/). - - Ember compile their global dataset from various sources including: - - Eurostat: Annual European generation and import data, and monthly data in some cases where better sources are not available. - - ENTSO-E: Monthly European generation and import data. - - EIA: Annual global generation and import data. - - UN: Monthly global generation data in some cases. - - GEM: Annual global coal and gas capacity data. - - IRENA: Annual global capacity data for all non-fossil fuel types, and for Other Fossil where available. - - WRI: Annual global capacity data for Other Fossil where other sources are not available. - - European carbon intensities rely on data from the European Environment Agency (EEA). - - A complete list of data sources for each individual country in Ember's Yearly Electricity Data can be found [here](https://ember-climate.org/app/uploads/2022/07/Ember-Electricity-Data-Methodology.pdf). - - A complete list of data sources for each individual country in Ember's European Electricity Review can be found [here](https://ember-climate.org/app/uploads/2022/02/EER-Methodology.pdf). - - We rely on Ember as the primary source of electricity consumption data. While BP provides primary energy (not just electricity) consumption data and it provides a longer time-series (dating back to 1965) than Ember (which only dates back to 1990), BP does not provide data for all countries or for all sources of electricity (for example, only Ember provides data on electricity from bioenergy). So, where data from Ember is available for a given country and year, we rely on it as the primary source. We then supplement this with data from BP where data from Ember is not available. - - Our World in Data has converted absolute electricity production by source to the share in the mix by dividing each by total electricity production. - - BP's region definitions sometimes differ from Our World in Data's definitions. For example, BP's North America includes only Canada, Mexico and United States, whereas Our World in Data's North America includes countries in Central America (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like "North America (BP)" to refer to BP's original data using their definition of the region, as well as "North America", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the contributions from the countries in the region. 
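A rough sketch of the "adding up (when possible)" aggregation with pandas; the member list and values are hypothetical, and requiring every member to report data (via min_count) is only one possible reading of the tolerance for missing data:

import pandas as pd

# Hypothetical member data for a two-country region.
df = pd.DataFrame(
    {
        "country": ["France", "Germany", "France", "Germany"],
        "year": [2020, 2020, 2021, 2021],
        "coal_generation__twh": [8.0, 115.0, 7.5, None],
    }
)

members = ["France", "Germany"]
# Sum member contributions per year; min_count drops years where any member is
# missing, so 2021 yields NaN rather than a misleading partial total.
aggregate = (
    df[df["country"].isin(members)]
    .groupby("year")["coal_generation__twh"]
    .sum(min_count=len(members))
)
print(aggregate)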
- - [BP's region definitions](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy/using-the-review/definitions-and-explanatory-notes.html#accordion_Regional%20definitions), denoted with "(BP)", are: - * "Asia Pacific (BP)": Brunei, Cambodia, China (Mainland), China Hong Kong SAR (Special Administrative Region), China Macau SAR (Special Administrative Region), Indonesia, Japan, Laos, Malaysia, Mongolia, North Korea, Philippines, Singapore, South Asia (Afghanistan, Bangladesh, India, Myanmar, Nepal, Pakistan and Sri Lanka), South Korea, Taiwan, Thailand, Vietnam, Australia, New Zealand, Papua New Guinea and Oceania. - * "Australasia (BP)": Australia, New Zealand. - * "CIS (BP)" - Commonwealth of Independent States: Armenia, Azerbaijan, Belarus, Kazakhstan, Kyrgyzstan, Moldova, Russian Federation, Tajikistan, Turkmenistan, Uzbekistan. - * "Caribbean (BP)": Atlantic islands between the US Gulf Coast and South America, including Puerto Rico, US Virgin Islands and Bermuda. - * "Central America (BP)": Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama - * "Eastern Africa (BP)": Territories on the east coast of Africa from Sudan to Mozambique. Also Madagascar, Malawi, Uganda, Zambia, Zimbabwe. - * "Europe (BP)": European members of the OECD plus Albania, Bosnia-Herzegovina, Bulgaria, Croatia, Cyprus, Georgia, Gibraltar, Latvia, Lithuania, Malta, Montenegro, North Macedonia, Romania, Serbia and Ukraine. - * "Middle Africa (BP)": Angola, Cameroon, Central African Republic, Chad, Democratic Republic of Congo, Republic of Congo, Equatorial Guinea, Gabon, Sao Tome & Principe. - * "Middle East (BP)": Arabian Peninsula, Iran, Iraq, Israel, Jordan, Lebanon, Syria. - * "Non-OECD (BP)" - Organization for Economic Co-operation and Development: All countries that are not members of the OECD. - * "North America (BP)": US (excluding US territories), Canada, Mexico - * "Northern Africa (BP)": Territories on the north coast of Africa from Egypt to Western Sahara. - * "OECD (BP)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, UK, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, US. - * "OPEC (BP)" - Organization of the Petroleum Exporting Countries: Iran, Iraq, Kuwait, Saudi Arabia, United Arab Emirates, Algeria, Libya, Angola, Equatorial Guinea, Gabon, Nigeria, Republic of Congo, Venezuela. - * "South and Central America (BP)": Caribbean (including Puerto Rico and US Virgin Islands), Bermuda, Central and South America. - * "Southern Africa (BP)": Botswana, Lesotho, Namibia, South Africa, Swaziland. - * "Western Africa (BP)": Territories on the west coast of Africa from Mauritania to Nigeria, including Burkina Faso, Cape Verde, Mali and Niger. - - Additionally, BP includes some regions that are not explicitly defined (e.g. "Other Europe", or "Other CIS"). We define our regions in the following way: - * "Africa" - All African countries + "Other Africa". - * "Asia" - All Asian countries + "Other Middle East" + "Other CIS" + "Other Asia Pacific". - * "Europe" - All European countries + "Other Europe". - * "North America" - All North American countries + "Other Caribbean" + "Other North America". - * "Oceania" - All Oceanian countries. 
- * "South America" - All South American countries + "Other South America". - Where the individual countries in each region are defined [in this map](https://ourworldindata.org/world-region-map-definitions). Additional BP regions are ignored, since they belong to other regions already included (e.g. the data for "Other Western Africa" is included in "Other Africa"). Finally, income groups are constructed following the definitions [in this map](https://ourworldindata.org/grapher/world-banks-income-groups). - - [Ember's region definitions](https://ember-climate.org/countries-and-regions/), denoted with "(Ember)", are: - * "G20 (Ember)" - Group of Twenty: Argentina, Australia, Brazil, Canada, China, France, Germany, India, Indonesia, Italy, Japan, Mexico, Russia, Saudi Arabia, South Africa, South Korea, Turkey, United Kingdom, United States and the 27 members of the European Union. - * "G7 (Ember)" - Group of Seven: Canada, France, Germany, Italy, Japan, United Kingdom and United States. - * "Latin America and Caribbean (Ember)": Antigua and Barbuda, Argentina, Bahamas, Barbados, Belize, Bolivia, Brazil, Chile, Colombia, Costa Rica, Cuba, Dominica, Dominican Republic, Ecuador, El Salvador, Grenada, Guatemala, Guyana, Haiti, Honduras, Jamaica, Mexico, Nicaragua, Panama, Paraguay, Peru, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Suriname, Trinidad and Tobago, Uruguay, Venezuela, Aruba, British Virgin Islands, Cayman Islands, Falkland Islands, French Guiana, Guadeloupe, Martinique, Montserrat, Puerto Rico, Turks and Caicos Islands and United States Virgin Islands. - * "Middle East (Ember)": Bahrain, Iran, Iraq, Israel, Jordan, Kuwait, Lebanon, Oman, Palestine, Qatar, Saudi Arabia, Syria, United Arab Emirates and Yemen. - * "OECD (Ember)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, United Kingdom, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, and United States. 
- sources: - - name: Our World in Data based on BP Statistical Review of World Energy (2022) - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - - name: Our World in Data based on Ember's Yearly Electricity Data (2022) - published_by: Ember - publication_year: 2022 - date_accessed: 2022-12-13 - url: https://ember-climate.org/data-catalogue/yearly-electricity-data/ - - name: Our World in Data based on Ember's European Electricity Review (2022) - published_by: Ember - publication_year: 2022 - date_accessed: 2022-08-01 - url: https://ember-climate.org/insights/research/european-electricity-review-2022/ -tables: - electricity_mix: - variables: - bioenergy_generation__twh: - title: Electricity from bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Bioenergy - bioenergy_share_of_electricity__pct: - title: Bioenergy (% electricity) - short_unit: '%' - unit: '%' - display: - name: Bioenergy - co2_intensity__gco2_kwh: - title: Carbon intensity of electricity (gCO2/kWh) - short_unit: gCO₂ - unit: grams of CO₂ equivalent per kilowatt-hour - display: - name: Carbon intensity of electricity per kilowatt-hour - coal_generation__twh: - title: Electricity from coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - coal_share_of_electricity__pct: - title: Coal (% electricity) - short_unit: '%' - unit: '%' - display: - name: Coal - fossil_generation__twh: - title: Electricity from fossil fuels (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Fossil fuels - fossil_share_of_electricity__pct: - title: Fossil fuels (% electricity) - short_unit: '%' - unit: '%' - display: - name: Fossil fuels - gas_generation__twh: - title: Electricity from gas (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas - gas_share_of_electricity__pct: - title: Gas (% electricity) - short_unit: '%' - unit: '%' - display: - name: Gas - hydro_generation__twh: - title: Electricity from hydro (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydropower - hydro_share_of_electricity__pct: - title: Hydro (% electricity) - short_unit: '%' - unit: '%' - display: - name: Hydropower - low_carbon_generation__twh: - title: Low-carbon electricity (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Low-carbon electricity - low_carbon_share_of_electricity__pct: - title: Low-carbon electricity (% electricity) - short_unit: '%' - unit: '%' - display: - name: Share of electricity from low-carbon sources - net_imports_share_of_demand__pct: - title: Net electricity imports as a share of demand (%) - short_unit: '%' - unit: '%' - display: - name: Net electricity imports as a share of demand - nuclear_generation__twh: - title: Electricity from nuclear (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - nuclear_share_of_electricity__pct: - title: Nuclear (% electricity) - short_unit: '%' - unit: '%' - display: - name: Nuclear - oil_generation__twh: - title: Electricity from oil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Oil - oil_share_of_electricity__pct: - title: Oil (% electricity) - short_unit: '%' - unit: '%' - display: - name: Oil - other_renewables_excluding_bioenergy_generation__twh: - title: Other renewables excluding bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables, excluding bioenergy - 
other_renewables_excluding_bioenergy_share_of_electricity__pct: - title: Other renewables excluding bioenergy (% electricity) - short_unit: '%' - unit: '%' - display: - name: Other renewables, excluding bioenergy - other_renewables_including_bioenergy_generation__twh: - title: Other renewables including bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables, including bioenergy - other_renewables_including_bioenergy_share_of_electricity__pct: - title: Other renewables including bioenergy (% electricity) - short_unit: '%' - unit: '%' - display: - name: Other renewables, including bioenergy - per_capita_bioenergy_generation__kwh: - title: Bioenergy electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Bioenergy electricity per capita - numDecimalPlaces: 0 - per_capita_coal_generation__kwh: - title: Coal electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Coal electricity per capita - numDecimalPlaces: 0 - per_capita_fossil_generation__kwh: - title: Fossil fuel electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Fossil fuel electricity per capita - numDecimalPlaces: 0 - per_capita_gas_generation__kwh: - title: Gas electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Gas electricity per capita - numDecimalPlaces: 0 - per_capita_hydro_generation__kwh: - title: Hydro electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Hydro electricity per capita - numDecimalPlaces: 0 - per_capita_low_carbon_generation__kwh: - title: Low-carbon electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Low-carbon electricity per capita - numDecimalPlaces: 0 - per_capita_nuclear_generation__kwh: - title: Nuclear electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Nuclear electricity per capita - numDecimalPlaces: 0 - per_capita_oil_generation__kwh: - title: Oil electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Oil electricity per capita - numDecimalPlaces: 0 - per_capita_other_renewables_excluding_bioenergy_generation__kwh: - title: Other renewable electricity excluding bioenergy per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Other renewable electricity excluding bioenergy per capita - numDecimalPlaces: 0 - per_capita_other_renewables_including_bioenergy_generation__kwh: - title: Other renewable electricity including bioenergy per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Other renewable electricity including bioenergy per capita - numDecimalPlaces: 0 - per_capita_renewable_generation__kwh: - title: Renewable electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Renewable electricity per capita - numDecimalPlaces: 0 - per_capita_solar_generation__kwh: - title: Solar electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Solar electricity per capita - numDecimalPlaces: 0 - per_capita_total_generation__kwh: - title: Per capita electricity (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Per capita electricity use - numDecimalPlaces: 0 - per_capita_wind_generation__kwh: - title: Wind electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Wind electricity per capita - numDecimalPlaces: 0 - population: - title: 
Population - short_unit: people - unit: people - display: - name: Population - primary_energy_consumption__twh: - title: Primary energy consumption (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Primary energy consumption - renewable_generation__twh: - title: Electricity from renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Renewables - renewable_share_of_electricity__pct: - title: Renewables (% electricity) - short_unit: '%' - unit: '%' - display: - name: Renewables - numDecimalPlaces: 2 - solar_generation__twh: - title: Electricity from solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - solar_share_of_electricity__pct: - title: Solar (% electricity) - short_unit: '%' - unit: '%' - display: - name: Solar - total_demand__twh: - title: Electricity demand (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Electricity demand - total_electricity_share_of_primary_energy__pct: - title: Electricity as share of primary energy (%) - short_unit: '%' - unit: '%' - display: - name: Electricity as share of primary energy - total_emissions__mtco2: - title: Emissions (MtCO2) - short_unit: million t - unit: million tonnes CO2 equivalent - display: - name: Emissions - total_generation__twh: - title: Electricity generation (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Electricity generation - total_net_imports__twh: - title: Net imports (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Net imports - wind_generation__twh: - title: Electricity from wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - wind_share_of_electricity__pct: - title: Wind (% electricity) - short_unit: '%' - unit: '%' - display: - name: Wind diff --git a/etl/steps/archive/garden/energy/2022-12-28/electricity_mix.py b/etl/steps/archive/garden/energy/2022-12-28/electricity_mix.py deleted file mode 100644 index ffdcb66f9f4..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-28/electricity_mix.py +++ /dev/null @@ -1,304 +0,0 @@ -"""Garden step that combines BP's statistical review with Ember's combined electricity data (combination of the European -Electricity Review and the Yearly Electricity Data) to create the Electricity Mix (BP & Ember) dataset. - -""" - -from typing import Dict, List - -import pandas as pd -from owid import catalog -from owid.datautils.dataframes import combine_two_overlapping_dataframes -from shared import CURRENT_DIR, add_population - -from etl.paths import DATA_DIR - -# Details for dataset to export. -DATASET_SHORT_NAME = "electricity_mix" -METADATA_FILE_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.meta.yml" -# Details for datasets to import. -BP_DATASET_PATH = DATA_DIR / "garden/bp/2022-12-28/statistical_review" -EMBER_DATASET_PATH = DATA_DIR / "garden/ember/2022-12-13/combined_electricity" - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 -# Megatonnes to grams. -MT_TO_G = 1e12 - - -def process_bp_data(table_bp: catalog.Table) -> pd.DataFrame: - """Load necessary columns from BP's Statistical Review dataset, and create some new variables (e.g. electricity - generation from fossil fuels). - - Parameters - ---------- - table_bp : catalog.Table - BP's Statistical Review (already processed, with harmonized countries and region aggregates). - - Returns - ------- - df_bp : pd.DataFrame - Processed BP data. - - """ - # Columns to load from BP dataset. 
- columns = { - "electricity_generation": "total_generation__twh", - "primary_energy_consumption__twh": "primary_energy_consumption__twh", - "hydro_generation__twh": "hydro_generation__twh", - "nuclear_generation__twh": "nuclear_generation__twh", - "solar_generation__twh": "solar_generation__twh", - "wind_generation__twh": "wind_generation__twh", - "geo_biomass_other__twh": "other_renewables_including_bioenergy_generation__twh", - "elec_gen_from_oil": "oil_generation__twh", - "elec_gen_from_coal": "coal_generation__twh", - "elec_gen_from_gas": "gas_generation__twh", - } - table_bp = table_bp[list(columns)].rename(columns=columns, errors="raise") - # New columns to be created by summing other columns. - aggregates: Dict[str, List[str]] = { - "fossil_generation__twh": [ - "oil_generation__twh", - "coal_generation__twh", - "gas_generation__twh", - ], - "renewable_generation__twh": [ - "hydro_generation__twh", - "solar_generation__twh", - "wind_generation__twh", - "other_renewables_including_bioenergy_generation__twh", - ], - "low_carbon_generation__twh": [ - "renewable_generation__twh", - "nuclear_generation__twh", - ], - } - - # Create a dataframe with a dummy index. - df_bp = pd.DataFrame(table_bp).reset_index() - - # Create new columns, by adding up other columns (and allowing for only one nan in each sum). - for new_column in aggregates: - df_bp[new_column] = df_bp[aggregates[new_column]].sum(axis=1, min_count=len(aggregates[new_column]) - 1) - - return df_bp - - -def process_ember_data(table_ember: catalog.Table) -> pd.DataFrame: - """Load necessary columns from the Combined Electricity dataset and prepare a dataframe with the required variables. - - Parameters - ---------- - table_ember : catalog.Table - Combined Electricity (combination of Ember's Yearly Electricity Data and European Electricity Review). - - Returns - ------- - df_ember : pd.DataFrame - Processed Combined Electricity data. - - """ - # Columns to load from Ember dataset. - columns = { - "generation__bioenergy__twh": "bioenergy_generation__twh", - "generation__gas__twh": "gas_generation__twh", - "generation__coal__twh": "coal_generation__twh", - "generation__other_fossil__twh": "oil_generation__twh", - "generation__renewables__twh": "renewable_generation__twh", - "generation__other_renewables__twh": "other_renewables_excluding_bioenergy_generation__twh", - "generation__clean__twh": "low_carbon_generation__twh", - "generation__hydro__twh": "hydro_generation__twh", - "generation__nuclear__twh": "nuclear_generation__twh", - "generation__solar__twh": "solar_generation__twh", - "generation__wind__twh": "wind_generation__twh", - "generation__fossil__twh": "fossil_generation__twh", - "generation__total_generation__twh": "total_generation__twh", - "demand__total_demand__twh": "total_demand__twh", - "emissions__total_emissions__mtco2": "total_emissions__mtco2", - "emissions__co2_intensity__gco2_kwh": "co2_intensity__gco2_kwh", - "imports__total_net_imports__twh": "total_net_imports__twh", - } - table_ember = table_ember[list(columns)].rename(columns=columns, errors="raise") - - # Create a dataframe with a dummy index. - df_ember = pd.DataFrame(table_ember).reset_index() - - # In BP data, there is a variable "Geo Biomass Other", which combines all other renewables. - # In Ember data, "other renewables" excludes bioenergy. - # To be able to combine both datasets, create a new variable for generation of other renewables including bioenergy. 
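- # (Illustrative, with hypothetical numbers: 5 TWh of other renewables excluding bioenergy - # plus 3 TWh of bioenergy would give 8 TWh in the new combined column below.)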
- df_ember["other_renewables_including_bioenergy_generation__twh"] = ( - df_ember["other_renewables_excluding_bioenergy_generation__twh"] + df_ember["bioenergy_generation__twh"] - ) - - return df_ember - - -def add_per_capita_variables(combined: pd.DataFrame) -> pd.DataFrame: - """Add per capita variables (in kWh per person) to the combined BP and Ember dataframe. - - The list of variables to make per capita are given in this function. The new variable names will be 'per_capita_' - followed by the original variable's name. - - Parameters - ---------- - combined : pd.DataFrame - Combination of BP's Statistical Review and Ember's Combined Electricity. - - Returns - ------- - combined : pd.DataFrame - Input dataframe after adding per capita variables. - - """ - combined = combined.copy() - - # Variables to make per capita. - per_capita_variables = [ - "bioenergy_generation__twh", - "coal_generation__twh", - "fossil_generation__twh", - "gas_generation__twh", - "hydro_generation__twh", - "low_carbon_generation__twh", - "nuclear_generation__twh", - "oil_generation__twh", - "other_renewables_excluding_bioenergy_generation__twh", - "other_renewables_including_bioenergy_generation__twh", - "renewable_generation__twh", - "solar_generation__twh", - "total_generation__twh", - "wind_generation__twh", - ] - # Add a column for population (only for harmonized countries). - combined = add_population(df=combined, warn_on_missing_countries=False) - - for variable in per_capita_variables: - assert "twh" in variable, f"Variables are assumed to be in TWh, but {variable} is not." - new_column = "per_capita_" + variable.replace("__twh", "__kwh") - combined[new_column] = combined[variable] * TWH_TO_KWH / combined["population"] - - return combined - - -def add_share_variables(combined: pd.DataFrame) -> pd.DataFrame: - """Add variables for the electricity generation as a share of the total electricity generation (as a percentage). - - The following new variables will be created: - * For each source (e.g. coal_generation__twh) in a list given in this function, a new variable will be created - (named, e.g. coal_share_of_electricity__pct). - * Total electricity generation as a share of primary energy consumption. - * Total net electricity imports as a share of total electricity demand. - - Parameters - ---------- - combined : pd.DataFrame - Combination of BP's Statistical Review and Ember's Combined Electricity. - - Returns - ------- - combined : pd.DataFrame - Input dataframe after adding share variables. - - """ - # Variables to make as share of electricity (new variable names will be the name of the original variable followed - # by '_share_of_electricity__pct'). - share_variables = [ - "bioenergy_generation__twh", - "coal_generation__twh", - "fossil_generation__twh", - "gas_generation__twh", - "hydro_generation__twh", - "low_carbon_generation__twh", - "nuclear_generation__twh", - "oil_generation__twh", - "other_renewables_excluding_bioenergy_generation__twh", - "other_renewables_including_bioenergy_generation__twh", - "renewable_generation__twh", - "solar_generation__twh", - "total_generation__twh", - "wind_generation__twh", - ] - for variable in share_variables: - new_column = variable.replace("_generation__twh", "_share_of_electricity__pct") - combined[new_column] = 100 * combined[variable] / combined["total_generation__twh"] - - # Calculate the percentage of electricity as a share of primary energy. 
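- # (For example, with hypothetical numbers: a country generating 100 TWh of electricity - # out of 400 TWh of primary energy consumption gets 100 * 100 / 400 = 25 below.)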
- combined["total_electricity_share_of_primary_energy__pct"] = ( - 100 * combined["total_generation__twh"] / combined["primary_energy_consumption__twh"] - ) - - # Calculate the percentage of electricity demand that is imported. - combined["net_imports_share_of_demand__pct"] = ( - 100 * combined["total_net_imports__twh"] / combined["total_demand__twh"] - ) - - # Sanity check. - error = "Total electricity share does not add up to 100%." - assert all(abs(combined["total_share_of_electricity__pct"].dropna() - 100) < 0.01), error - - # Remove unnecessary columns. - combined = combined.drop(columns=["total_share_of_electricity__pct"]) - - return combined - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load BP's statistical review dataset. - ds_bp = catalog.Dataset(BP_DATASET_PATH) - # Select main table. - table_bp = ds_bp["statistical_review"] - # Create a convenient dataframe. - df_bp = pd.DataFrame(table_bp) - - # Idem for Ember's combined electricity. - ds_ember = catalog.Dataset(EMBER_DATASET_PATH) - table_ember = ds_ember["combined_electricity"] - df_ember = pd.DataFrame(table_ember) - - # - # Process data. - # - # Prepare BP and Ember data. - df_bp = process_bp_data(table_bp=table_bp) - df_ember = process_ember_data(table_ember=table_ember) - - # Combine both tables, giving priority to Ember data (on overlapping values). - combined = combine_two_overlapping_dataframes(df1=df_ember, df2=df_bp, index_columns=["country", "year"]) - - # Add carbon intensities. - # There is already a variable for this in the Ember dataset, but now that we have combined - # BP and Ember data, intensities should be recalculated for consistency. - combined["co2_intensity__gco2_kwh"] = (combined["total_emissions__mtco2"] * MT_TO_G) / ( - combined["total_generation__twh"] * TWH_TO_KWH - ) - - # Add per capita variables. - combined = add_per_capita_variables(combined=combined) - - # Add "share" variables. - combined = add_share_variables(combined=combined) - - # Set an appropriate index and sort rows and columns conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Convert dataframe into a table (with no metadata). - table = catalog.Table(combined, underscore=True) - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = catalog.Dataset.create_empty(dest_dir) - - # Add table to dataset. - table.metadata.short_name = "electricity_mix" - ds_garden.add(table) - - # Update dataset and table metadata using yaml file. - ds_garden.update_metadata(METADATA_FILE_PATH) - - # Save dataset. - ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2022-12-28/fossil_fuel_production.meta.yml b/etl/steps/archive/garden/energy/2022-12-28/fossil_fuel_production.meta.yml deleted file mode 100644 index ae2b1aab34e..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-28/fossil_fuel_production.meta.yml +++ /dev/null @@ -1,169 +0,0 @@ -dataset: - namespace: energy - version: 2022-12-28 - title: Fossil fuel production (BP & Shift, 2022b) - short_name: fossil_fuel_production - description: >- - This dataset on fossil fuel production is generated by combining the latest data from [the BP Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html) - and [The Shift Dataportal](https://www.theshiftdataportal.org/energy). - - BP provide fossil fuel production data from 1965 onwards (and crude prices from 1861 onwards). 
The Shift Dataportal provides long-term data from 1900, but it only extends to 2016. - - To maintain consistency with the energy datasets on Our World in Data, we have taken BP data in preference - meaning if BP provides data for the given country and year, this is used. Where data is not available - from BP for a given country, or pre-1965, we rely on data from Shift. - - We have converted primary production in exajoules to terawatt-hours using the conversion factor: 1,000,000 / 3,600 ~ 278. - - Production per capita has been calculated using a population dataset that is built and maintained by Our World in Data, based on [different sources](https://ourworldindata.org/population-sources). - sources: - - name: Our World in Data based on BP Statistical Review of World Energy (2022) - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - description: >- - BP's region definitions sometimes differ from Our World in Data's definitions. For example, BP's North America includes only Canada, Mexico and United States, whereas Our World in Data's North America includes - countries in Central America (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like - "North America (BP)" to refer to BP's original data using their definition of the region, as well as "North America", which is data aggregated by Our World in Data using our definition. These - aggregates are constructed by adding up (when possible) the contributions from the countries in the region. - - - [BP's region definitions](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy/using-the-review/definitions-and-explanatory-notes.html#accordion_Regional%20definitions), - denoted with "(BP)", are: - - * "Asia Pacific (BP)": Brunei, Cambodia, China (Mainland), China Hong Kong SAR (Special Administrative Region), China Macau SAR (Special Administrative Region), Indonesia, Japan, Laos, Malaysia, Mongolia, North Korea, Philippines, Singapore, South Asia (Afghanistan, Bangladesh, India, Myanmar, Nepal, Pakistan and Sri Lanka), South Korea, Taiwan, Thailand, Vietnam, Australia, New Zealand, Papua New Guinea and Oceania. - - * "Australasia (BP)": Australia, New Zealand. - - * "CIS (BP)" - Commonwealth of Independent States: Armenia, Azerbaijan, Belarus, Kazakhstan, Kyrgyzstan, Moldova, Russian Federation, Tajikistan, Turkmenistan, Uzbekistan. - - * "Caribbean (BP)": Atlantic islands between the US Gulf Coast and South America, including Puerto Rico, US Virgin Islands and Bermuda. - - * "Central America (BP)": Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama. - - * "Eastern Africa (BP)": Territories on the east coast of Africa from Sudan to Mozambique. Also Madagascar, Malawi, Uganda, Zambia, Zimbabwe. - - * "Europe (BP)": European members of the OECD plus Albania, Bosnia-Herzegovina, Bulgaria, Croatia, Cyprus, Georgia, Gibraltar, Latvia, Lithuania, Malta, Montenegro, North Macedonia, Romania, Serbia and Ukraine. - - * "Middle Africa (BP)": Angola, Cameroon, Central African Republic, Chad, Democratic Republic of Congo, Republic of Congo, Equatorial Guinea, Gabon, Sao Tome & Principe. - - * "Middle East (BP)": Arabian Peninsula, Iran, Iraq, Israel, Jordan, Lebanon, Syria. 
- - * "Non-OECD (BP)" - Organization for Economic Co-operation and Development: All countries that are not members of the OECD. - - * "North America (BP)": US (excluding US territories), Canada, Mexico - - * "Northern Africa (BP)": Territories on the north coast of Africa from Egypt to Western Sahara. - - * "OECD (BP)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, UK, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, US. - - * "OPEC (BP)" - Organization of the Petroleum Exporting Countries: Iran, Iraq, Kuwait, Saudi Arabia, United Arab Emirates, Algeria, Libya, Angola, Equatorial Guinea, Gabon, Nigeria, Republic of Congo, Venezuela. - - * "South and Central America (BP)": Caribbean (including Puerto Rico and US Virgin Islands), Bermuda, Central and South America. - - * "Southern Africa (BP)": Botswana, Lesotho, Namibia, South Africa, Swaziland. - - * "Western Africa (BP)": Territories on the west coast of Africa from Mauritania to Nigeria, including Burkina Faso, Cape Verde, Mali and Niger. - - - Additionally, BP includes some regions that are not explicitly defined (e.g. "Other Europe", or "Other CIS"). We define our regions in the following way: - - * "Africa" - All African countries + "Other Africa". - - * "Asia" - All Asian countries + "Other Middle East" + "Other CIS" + "Other Asia Pacific". - - * "Europe" - All European countries + "Other Europe". - - * "North America" - All North American countries + "Other Caribbean" + "Other North America". - - * "Oceania" - All Oceanian countries. - - * "South America" - All South American countries + "Other South America". - - Where the individual countries in each region are defined [in this map](https://ourworldindata.org/world-region-map-definitions). - Additional BP regions are ignored, since they belong to other regions already included (e.g. the data for "Other Western Africa" is included in "Other Africa"). - Finally, income groups are constructed following the definitions [in this map](https://ourworldindata.org/grapher/world-banks-income-groups). 
- - name: Our World in Data based on The Shift Dataportal (2022) - published_by: The Shift Dataportal - date_accessed: 2022-07-18 - url: https://www.theshiftdataportal.org/energy -tables: - fossil_fuel_production: - variables: - annual_change_in_coal_production__pct: - title: Annual change in coal production (%) - short_unit: '%' - unit: '%' - display: - name: Annual change in coal production - annual_change_in_coal_production__twh: - title: Annual change in coal production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Annual change in coal production - annual_change_in_gas_production__pct: - title: Annual change in gas production (%) - short_unit: '%' - unit: '%' - display: - name: Annual change in gas production - annual_change_in_gas_production__twh: - title: Annual change in gas production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Annual change in gas production - annual_change_in_oil_production__pct: - title: Annual change in oil production (%) - short_unit: '%' - unit: '%' - display: - name: Annual change in oil production - annual_change_in_oil_production__twh: - title: Annual change in oil production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Annual change in oil production - coal_production__twh: - title: Coal production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal production - numDecimalPlaces: 0 - coal_production_per_capita__kwh: - title: Coal production per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Coal production per capita - numDecimalPlaces: 0 - gas_production__twh: - title: Gas production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas production - numDecimalPlaces: 0 - gas_production_per_capita__kwh: - title: Gas production per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Gas production per capita - numDecimalPlaces: 0 - oil_production__twh: - title: Oil production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Oil production - numDecimalPlaces: 0 - oil_production_per_capita__kwh: - title: Oil production per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Oil production per capita - numDecimalPlaces: 0 diff --git a/etl/steps/archive/garden/energy/2022-12-28/fossil_fuel_production.py b/etl/steps/archive/garden/energy/2022-12-28/fossil_fuel_production.py deleted file mode 100644 index 984d04ffe23..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-28/fossil_fuel_production.py +++ /dev/null @@ -1,266 +0,0 @@ -"""Garden step for Fossil fuel production dataset (part of the OWID Energy dataset), based on a combination of BP's -Statistical Review dataset and Shift data on fossil fuel production. - -""" - -import numpy as np -import pandas as pd -from owid import catalog -from shared import CURRENT_DIR, HISTORIC_TO_CURRENT_REGION, add_population -from structlog import get_logger - -from etl.paths import DATA_DIR - -log = get_logger() - -# Namespace and dataset short name for output dataset. -NAMESPACE = "energy" -DATASET_SHORT_NAME = "fossil_fuel_production" -# Metadata file. -METADATA_FILE_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.meta.yml" -# Namespace, dataset short name and version for required Shift dataset. -SHIFT_NAMESPACE = "shift" -SHIFT_DATASET_NAME = "fossil_fuel_production" -SHIFT_VERSION = "2022-07-18" -# Namespace, dataset short name and version for required BP dataset (processed Statistical Review from garden). 
-BP_NAMESPACE = "bp" -BP_DATASET_NAME = "statistical_review" -BP_VERSION = "2022-12-28" - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 - - -def load_bp_data() -> catalog.Table: - """Load BP data from the local catalog, and rename columns conveniently. - - Returns - ------- - bp_table : catalog.Table - BP data as a table with metadata. - - """ - # Load BP Statistical Review dataset. - bp_dataset = catalog.Dataset(DATA_DIR / "garden" / BP_NAMESPACE / BP_VERSION / BP_DATASET_NAME) - - # Get table. - bp_table = bp_dataset[bp_dataset.table_names[0]].reset_index() - bp_columns = { - "country": "country", - "year": "year", - "coal_production__twh": "Coal production (TWh)", - "gas_production__twh": "Gas production (TWh)", - "oil_production__twh": "Oil production (TWh)", - } - bp_table = bp_table[list(bp_columns)].rename(columns=bp_columns) - - return bp_table - - -def load_shift_data() -> catalog.Table: - """Load Shift data from the local catalog, and rename columns conveniently. - - Returns - ------- - shift_table : catalog.Table - Shift data as a table with metadata. - - """ - shift_columns = { - "country": "country", - "year": "year", - "coal": "Coal production (TWh)", - "gas": "Gas production (TWh)", - "oil": "Oil production (TWh)", - } - shift_dataset = catalog.Dataset(DATA_DIR / "garden" / SHIFT_NAMESPACE / SHIFT_VERSION / SHIFT_DATASET_NAME) - shift_table = shift_dataset[shift_dataset.table_names[0]].reset_index() - shift_table = shift_table[list(shift_columns)].rename(columns=shift_columns) - - return shift_table - - -def combine_bp_and_shift_data(bp_table: catalog.Table, shift_table: catalog.Table) -> pd.DataFrame: - """Combine BP and Shift data. - - Parameters - ---------- - bp_table : catalog.Table - Table from BP Statistical Review dataset. - shift_table : catalog.Table - Table from Shift fossil fuel production dataset. - - Returns - ------- - combined : pd.DataFrame - Combined data. - - """ - # Check that there are no duplicated rows in any of the two datasets. - assert bp_table[bp_table.duplicated(subset=["country", "year"])].empty, "Duplicated rows in BP data." - assert shift_table[shift_table.duplicated(subset=["country", "year"])].empty, "Duplicated rows in Shift data." - - # Combine Shift data (which goes further back in the past) with BP data (which is more up-to-date). - # On coincident rows, prioritise BP data. - index_columns = ["country", "year"] - data_columns = [col for col in bp_table.columns if col not in index_columns] - # We should not concatenate bp and shift data directly, since there are nans in different places. - # Instead, go column by column, concatenate, remove nans, and then keep the BP version on duplicated rows. - - combined = pd.DataFrame({column: [] for column in index_columns}) - for variable in data_columns: - _shift_data = shift_table[index_columns + [variable]].dropna(subset=variable) - _bp_data = bp_table[index_columns + [variable]].dropna(subset=variable) - _combined = pd.concat([_shift_data, _bp_data], ignore_index=True) # type: ignore - # On rows where both datasets overlap, give priority to BP data. - _combined = _combined.drop_duplicates(subset=index_columns, keep="last") # type: ignore - # Combine data for different variables. - combined = pd.merge(combined, _combined, on=index_columns, how="outer") - - # Sort data appropriately. 
- combined = combined.sort_values(index_columns).reset_index(drop=True) - - return combined - - -def add_annual_change(df: pd.DataFrame) -> pd.DataFrame: - """Add annual change variables to combined BP & Shift dataset. - - Parameters - ---------- - df : pd.DataFrame - Combined BP & Shift dataset. - - Returns - ------- - combined : pd.DataFrame - Combined BP & Shift dataset after adding annual change variables. - - """ - combined = df.copy() - - # Calculate annual change. - combined = combined.sort_values(["country", "year"]).reset_index(drop=True) - for cat in ("Coal", "Oil", "Gas"): - combined[f"Annual change in {cat.lower()} production (%)"] = ( - combined.groupby("country")[f"{cat} production (TWh)"].pct_change() * 100 - ) - combined[f"Annual change in {cat.lower()} production (TWh)"] = combined.groupby("country")[ - f"{cat} production (TWh)" - ].diff() - - return combined - - -def add_per_capita_variables(df: pd.DataFrame) -> pd.DataFrame: - """Add per-capita variables to combined BP & Shift dataset. - - Parameters - ---------- - df : pd.DataFrame - Combined BP & Shift dataset. - - Returns - ------- - combined : pd.DataFrame - Combined BP & Shift dataset after adding per-capita variables. - - """ - df = df.copy() - - # Add population to data. - combined = add_population( - df=df, - country_col="country", - year_col="year", - population_col="population", - warn_on_missing_countries=False, - regions=HISTORIC_TO_CURRENT_REGION, - ) - - # Calculate production per capita. - for cat in ("Coal", "Oil", "Gas"): - combined[f"{cat} production per capita (kWh)"] = ( - combined[f"{cat} production (TWh)"] / combined["population"] * TWH_TO_KWH - ) - combined = combined.drop(errors="raise", columns=["population"]) - - return combined - - -def remove_spurious_values(df: pd.DataFrame) -> pd.DataFrame: - """Remove spurious infinity values. - - These values are generated when calculating the annual change of a variable that is zero or nan the previous year. - - Parameters - ---------- - df : pd.DataFrame - Data that may contain infinity values. - - Returns - ------- - df : pd.DataFrame - Corrected data. - - """ - for column in df.columns: - issues_mask = df[column] == np.inf - issues = df[issues_mask] - if len(issues) > 0: - df.loc[issues_mask, column] = np.nan - - return df - - -def run(dest_dir: str) -> None: - log.info(f"{DATASET_SHORT_NAME}.start") - - # - # Load data. - # - # Load BP statistical review dataset. - bp_table = load_bp_data() - - # Load Shift data on fossil fuel production. - shift_table = load_shift_data() - - # - # Process data. - # - # Combine BP and Shift data. - df = combine_bp_and_shift_data(bp_table=bp_table, shift_table=shift_table) - - # Add annual change. - df = add_annual_change(df=df) - - # Add per-capita variables. - df = add_per_capita_variables(df=df) - - # Remove spurious values. - df = remove_spurious_values(df=df) - - # Create an appropriate index and sort conveniently. - df = df.set_index(["country", "year"], verify_integrity=True).sort_index() - - # Create new table. - table = catalog.Table(df, underscore=True) - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = catalog.Dataset.create_empty(dest_dir) - - # Add table to dataset. - table.metadata.short_name = "fossil_fuel_production" - ds_garden.add(table) - - # Update dataset and table metadata using yaml file. - ds_garden.update_metadata(METADATA_FILE_PATH) - - # Save dataset. 
- ds_garden.save() - - log.info(f"{DATASET_SHORT_NAME}.end") diff --git a/etl/steps/archive/garden/energy/2022-12-28/global_primary_energy.meta.yml b/etl/steps/archive/garden/energy/2022-12-28/global_primary_energy.meta.yml deleted file mode 100644 index 75c07a53441..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-28/global_primary_energy.meta.yml +++ /dev/null @@ -1,272 +0,0 @@ -dataset: - namespace: energy - version: 2022-12-28 - title: Global Primary Energy (Smil & BP, 2022b) - short_name: global_primary_energy - description: | - This dataset comprises a combination of data from Appendix A of Vaclav Smil's Updated and Revised Edition of his book, 'Energy Transitions: Global and National Perspectives' (2017) and BP's Statistical Review of World Energy (2022). - - All data prior to the year 1965 is sourced from Smil (2017). All data from 1965 onwards, with the exception of traditional biomass, is sourced from BP Statistical Review. Smil's estimates of traditional biomass are only available until 2015. For the years 2016 onwards, we have assumed a similar level of traditional biomass consumption. This is approximately in line with recent trends in traditional biomass from Smil's data. - - Our World in Data has normalized all BP fossil fuels data to terawatt-hours (TWh) using a conversion factor of 1,000,000 / 3,600 (~277.778) to convert from exajoules (EJ) to TWh. - - This dataset includes primary energy data using two methodologies: - (1) 'direct' primary energy, which does not take account of the inefficiencies in fossil fuel production. Fossil fuel data is compared to electricity generation (not in input equivalents) of nuclear and renewables. - (2) 'substitution' primary energy, which does take account of inefficiencies in fossil fuel production. This converts non-fossil energy sources to their 'input equivalents': the amount of primary energy that would be needed if they had the same inefficiencies as fossil fuels. This is the methodology adopted by BP when all data is compared in exajoules. 
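-# Illustrative example of the two methodologies (hypothetical numbers, not from the source): -# 100 TWh of nuclear electricity counts as 100 TWh under the 'direct' methodology, but as -# roughly 100 / 0.38 ≈ 263 TWh under the 'substitution' methodology, i.e. the primary energy -# that a thermal plant of ~38% efficiency would need to generate the same electricity.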
- -tables: - global_primary_energy: - variables: - biofuels__twh_direct_energy: - title: Biofuels (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Modern biofuels - biofuels__twh_substituted_energy: - title: Biofuels (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Modern biofuels - coal__twh_direct_energy: - title: Coal (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - coal__twh_substituted_energy: - title: Coal (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - # data_source - gas__twh_direct_energy: - title: Gas (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Natural gas - gas__twh_substituted_energy: - title: Gas (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Natural gas - hydropower__twh_direct_energy: - title: Hydropower (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydropower - hydropower__twh_substituted_energy: - title: Hydropower (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydropower - nuclear__twh_direct_energy: - title: Nuclear (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - nuclear__twh_substituted_energy: - title: Nuclear (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - oil__twh_direct_energy: - title: Oil (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Oil - oil__twh_substituted_energy: - title: Oil (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Oil - other_renewables__twh_direct_energy: - title: Other renewables (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables - other_renewables__twh_substituted_energy: - title: Other renewables (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables - solar__twh_direct_energy: - title: Solar (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - solar__twh_substituted_energy: - title: Solar (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - traditional_biomass__twh_direct_energy: - title: Traditional biomass (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Traditional biomass - traditional_biomass__twh_substituted_energy: - title: Traditional biomass (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Traditional biomass - wind__twh_direct_energy: - title: Wind (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - wind__twh_substituted_energy: - title: Wind (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - total_consumption__twh_direct_energy: - title: Total consumption (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Total consumption - total_consumption__twh_substituted_energy: - title: Total consumption (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Total consumption - biofuels__pct_of_direct_energy: - title: Biofuels (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Modern biofuels - biofuels__pct_of_substituted_energy: - title: Biofuels (%, substituted energy) - short_unit: "%" - unit: 
"%" - display: - name: Modern biofuels - coal__pct_of_direct_energy: - title: Coal (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Coal - coal__pct_of_substituted_energy: - title: Coal (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Coal - gas__pct_of_direct_energy: - title: Gas (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Natural gas - gas__pct_of_substituted_energy: - title: Gas (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Natural gas - hydropower__pct_of_direct_energy: - title: Hydropower (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Hydropower - hydropower__pct_of_substituted_energy: - title: Hydropower (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Hydropower - nuclear__pct_of_direct_energy: - title: Nuclear (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Nuclear - nuclear__pct_of_substituted_energy: - title: Nuclear (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Nuclear - oil__pct_of_direct_energy: - title: Oil (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Oil - oil__pct_of_substituted_energy: - title: Oil (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Oil - other_renewables__pct_of_direct_energy: - title: Other renewables (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Other renewables - other_renewables__pct_of_substituted_energy: - title: Other renewables (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Other renewables - solar__pct_of_direct_energy: - title: Solar (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Solar - solar__pct_of_substituted_energy: - title: Solar (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Solar - traditional_biomass__pct_of_direct_energy: - title: Traditional biomass (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Traditional biomass - traditional_biomass__pct_of_substituted_energy: - title: Traditional biomass (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Traditional biomass - wind__pct_of_direct_energy: - title: Wind (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Wind - wind__pct_of_substituted_energy: - title: Wind (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Wind diff --git a/etl/steps/archive/garden/energy/2022-12-28/global_primary_energy.py b/etl/steps/archive/garden/energy/2022-12-28/global_primary_energy.py deleted file mode 100644 index 8e23b3504d5..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-28/global_primary_energy.py +++ /dev/null @@ -1,224 +0,0 @@ -"""Garden step that combines Vaclav Smil's Global Primary Energy with BP's Statistical Review of World Energy. - -""" - -import numpy as np -import pandas as pd -from owid import catalog -from owid.datautils.dataframes import combine_two_overlapping_dataframes -from shared import CURRENT_DIR, gather_sources_from_tables - -from etl.paths import DATA_DIR - -# Details for dataset to export. -DATASET_SHORT_NAME = "global_primary_energy" -DATASET_TITLE = "Global Primary Energy (Smil & BP, 2022)" -METADATA_FILE_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.meta.yml" -# Details for datasets to import. 
-BP_DATASET_PATH = DATA_DIR / "garden/bp/2022-12-28/statistical_review" -SMIL_DATASET_PATH = DATA_DIR / "garden/smil/2017-01-01/global_primary_energy" - -# Exajoules to terawatt-hours. -EJ_TO_TWH = 1e6 / 3600 - -# Average efficiency factor assumed to convert direct energy to input-equivalent energy of Smil's data. -# This factor will be used for hydropower, nuclear, other renewables, solar and wind -# (for which there is data until 1960). -# In practice, it only affects hydropower, since all other non-fossil sources are zero prior to 1960. -# All other energy sources in Smil's data will not be affected by this factor. -EFFICIENCY_FACTOR = 0.36 - - -def prepare_bp_data(tb_bp: catalog.Table) -> pd.DataFrame: - df_bp = pd.DataFrame(tb_bp).reset_index() - - # BP gives generation of direct energy in TWh, and, for non-fossil sources of electricity, - # consumption of input-equivalent energy in EJ. - # The input-equivalent energy is the amount of energy that would be required to generate a given amount of (direct) - # electricity if non-fossil sources were as inefficient as a standard thermal power plant. - # Therefore, direct and substituted energies for Biofuels, Coal, Gas and Oil are identical. - # On the other hand, direct and substituted energy are different for non-fossil electricity sources, namely - # Hydropower, Nuclear, Solar, Other renewables, and Wind. - # The difference is a factor of ~38%, which is roughly the efficiency of a standard power plant. - # More specifically, BP assumes (for Biofuels, Coal, Gas and Oil) an efficiency factor that grows from 36% - # (until year 2000) to 40.6% (in 2021), to better reflect changes in efficiency over time. - # In the case of biomass used in electricity (included in 'Other renewables'), - # BP assumes a constant factor of 32% for all years. - # For more details: - # https://www.bp.com/content/dam/bp/business-sites/en/global/corporate/pdfs/energy-economics/statistical-review/bp-stats-review-2022-methodology.pdf - bp_columns = { - "country": "country", - "year": "year", - # Fossil sources (direct energy). - "biofuels_consumption__twh__total": "biofuels__twh_direct_energy", - "coal_consumption__twh": "coal__twh_direct_energy", - "gas_consumption__twh": "gas__twh_direct_energy", - "oil_consumption__twh": "oil__twh_direct_energy", - # Non-fossil electricity sources (direct energy). - "geo_biomass_other__twh": "other_renewables__twh_direct_energy", - "hydro_generation__twh": "hydropower__twh_direct_energy", - "nuclear_generation__twh": "nuclear__twh_direct_energy", - "solar_generation__twh": "solar__twh_direct_energy", - "wind_generation__twh": "wind__twh_direct_energy", - # Non-fossil electricity sources (substituted energy). - "geo_biomass_other__ej": "other_renewables__ej_substituted_energy", - "hydro_consumption__ej": "hydropower__ej_substituted_energy", - "nuclear_consumption__ej": "nuclear__ej_substituted_energy", - "solar_consumption__ej": "solar__ej_substituted_energy", - "wind_consumption__ej": "wind__ej_substituted_energy", - } - df_bp = df_bp[list(bp_columns)].rename(columns=bp_columns) - # Convert all units to TWh. - for column in df_bp.columns: - if "_ej_" in column: - # Create a new column in TWh instead of EJ. - df_bp[column.replace("_ej_", "_twh_")] = df_bp[column] * EJ_TO_TWH - # Remove the column in EJ. - df_bp = df_bp.drop(columns=column) - # For completeness, create columns of substituted energy for fossil sources (even if they would coincide with - # direct energy). 
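- # (That is, the loop below makes e.g. coal__twh_substituted_energy a plain copy of - # coal__twh_direct_energy; no efficiency factor is applied to fuels burned directly.)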
- for fossil_source in ["biofuels", "coal", "gas", "oil"]: - df_bp[f"{fossil_source}__twh_substituted_energy"] = df_bp[f"{fossil_source}__twh_direct_energy"] - - # Select only data for the World (which is the only region informed in Smil's data). - df_bp = df_bp[df_bp["country"] == "World"].reset_index(drop=True) - - return df_bp - - -def prepare_smil_data(tb_smil: catalog.Table) -> pd.DataFrame: - df_smil = pd.DataFrame(tb_smil).reset_index() - - # Create columns for input-equivalent energy. - # To do this, we follow a similar approach to BP: - # We create input-equivalent energy by dividing direct energy consumption of non-fossil electricity sources - # (hydropower, nuclear, other renewables, solar and wind) by a factor of 36% - # (called EFFICIENCY_FACTOR, defined above). - # This is the efficiency factor of a typical thermal plant assumed by BP between 1965 and 2000, and we assume this - # factor also applies for the period 1800 to 1965. - # For biomass power (included in other renewables), BP assumed a constant factor of 32%. - # However, since we cannot separate biomass from the rest of sources in 'other renewables', - # we use the same 36% factor as all other non-fossil sources. - for source in ["hydropower", "nuclear", "other_renewables", "solar", "wind"]: - df_smil[f"{source}__twh_substituted_energy"] = df_smil[f"{source}__twh_direct_energy"] / EFFICIENCY_FACTOR - # For fossil sources (including biofuels and traditional biomass), direct and substituted energy are the same. - for source in ["biofuels", "coal", "gas", "oil", "traditional_biomass"]: - df_smil[f"{source}__twh_substituted_energy"] = df_smil[f"{source}__twh_direct_energy"] - - return df_smil - - -def combine_bp_and_smil_data(df_bp: pd.DataFrame, df_smil: pd.DataFrame) -> pd.DataFrame: - df_bp = df_bp.copy() - df_smil = df_smil.copy() - - # Add a new column that informs of the source of the data. - df_bp["data_source"] = "BP" - df_smil["data_source"] = "Smil" - # Combine both dataframes, prioritizing BP's data on overlapping rows. - combined = combine_two_overlapping_dataframes( - df1=df_bp, df2=df_smil, index_columns=["country", "year"] - ).sort_values(["year"]) - - # Replace by numpy nans. - combined = combined.fillna(np.nan) - - # We do not have data for traditional biomass after 2015 (BP does not provide it). - # So, to be able to visualize the complete mix of global energy consumption, - # we extrapolate Smil's data for traditional biomass from 2015 onwards, by repeating its last value. - missing_years_mask = combined["year"] >= df_smil["year"].max() - combined.loc[missing_years_mask, "traditional_biomass__twh_direct_energy"] = combined[missing_years_mask][ - "traditional_biomass__twh_direct_energy" - ].ffill() - combined.loc[missing_years_mask, "traditional_biomass__twh_substituted_energy"] = combined[missing_years_mask][ - "traditional_biomass__twh_substituted_energy" - ].ffill() - - # Create an index and sort conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - return combined - - -def add_total_consumption_and_percentages(combined: pd.DataFrame) -> pd.DataFrame: - # Create a column with the total direct energy (ensuring there is at least one non-nan value). - combined["total_consumption__twh_direct_energy"] = combined[ - [column for column in combined.columns if "direct_energy" in column] - ].sum(axis=1, min_count=1) - # Create a column with the total substituted energy (ensuring there is at least one non-nan value). 
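- # (Note: min_count=1 makes .sum() return NaN rather than 0 when every source column is NaN, - # so years with no data at all stay empty instead of showing a spurious zero total.)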
- combined["total_consumption__twh_substituted_energy"] = combined[ - [column for column in combined.columns if "substituted_energy" in column] - ].sum(axis=1, min_count=1) - # Add share variables. - sources = [ - "biofuels", - "coal", - "gas", - "hydropower", - "nuclear", - "oil", - "other_renewables", - "solar", - "traditional_biomass", - "wind", - ] - for source in sources: - # Add percentage of each source with respect to the total direct energy. - combined[f"{source}__pct_of_direct_energy"] = ( - 100 * combined[f"{source}__twh_direct_energy"] / combined["total_consumption__twh_direct_energy"] - ) - # Add percentage of each source with respect to the total substituted energy. - combined[f"{source}__pct_of_substituted_energy"] = ( - 100 * combined[f"{source}__twh_substituted_energy"] / combined["total_consumption__twh_substituted_energy"] - ) - - return combined - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read all required datasets. - ds_bp = catalog.Dataset(BP_DATASET_PATH) - ds_smil = catalog.Dataset(SMIL_DATASET_PATH) - - # Gather all required tables from all datasets. - tb_bp = ds_bp[ds_bp.table_names[0]] - tb_smil = ds_smil[ds_smil.table_names[0]] - - # - # Process data. - # - # Prepare BP data. - df_bp = prepare_bp_data(tb_bp=tb_bp) - # Prepare Smil data. - df_smil = prepare_smil_data(tb_smil=tb_smil) - - # Combine BP and Smil data. - combined = combine_bp_and_smil_data(df_bp=df_bp, df_smil=df_smil) - - # Add variables for total consumption and variables of % share of each source. - combined = add_total_consumption_and_percentages(combined=combined) - - # Create a new table with combined data (and no metadata). - tb_combined = catalog.Table(combined, underscore=True) - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = catalog.Dataset.create_empty(dest_dir) - - # Gather metadata sources from all tables' original dataset sources. - ds_garden.metadata.sources = gather_sources_from_tables(tables=[tb_bp, tb_smil]) - - # Add table to dataset. - tb_combined.metadata.short_name = "global_primary_energy" - ds_garden.add(tb_combined) - - # Update dataset and table metadata using yaml file. - ds_garden.update_metadata(METADATA_FILE_PATH) - - # Save dataset. - ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2022-12-28/owid_energy.meta.yml b/etl/steps/archive/garden/energy/2022-12-28/owid_energy.meta.yml deleted file mode 100644 index aae6ccecf70..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-28/owid_energy.meta.yml +++ /dev/null @@ -1,16 +0,0 @@ -dataset: - namespace: energy - version: 2022-12-28 - title: Energy dataset (OWID, 2022) - short_name: owid_energy - description: | - OWID Energy dataset. - - This dataset will be loaded by [the energy-data repository](https://github.com/owid/energy-data), to create a csv file of the dataset that can be downloaded in one click. - -# Dataset sources will be created in the step by combining all component datasets' sources. -# Also, table metadata will be built from the tables' metadata and the content of owid_energy_variable_mapping.csv. - -tables: - owid_energy: - variables: {} diff --git a/etl/steps/archive/garden/energy/2022-12-28/owid_energy.py b/etl/steps/archive/garden/energy/2022-12-28/owid_energy.py deleted file mode 100644 index a9b5f3032da..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-28/owid_energy.py +++ /dev/null @@ -1,215 +0,0 @@ -"""Garden step that combines various datasets related to energy and produces the OWID Energy dataset (2022). 
- -Datasets combined: -* Energy mix from BP. -* Fossil fuel production (BP & Shift, 2022). -* Primary energy consumption (BP & EIA, 2022). -* Electricity mix (BP & Ember, 2022). - -""" - -from typing import Dict, cast - -import numpy as np -import pandas as pd -from owid import catalog -from owid.datautils import dataframes - -from etl.helpers import PathFinder -from etl.paths import DATA_DIR - -from .shared import ( - BUILD_POPULATION_FOR_HISTORICAL_COUNTRIES, - CURRENT_DIR, - HISTORIC_TO_CURRENT_REGION, - add_population, - gather_sources_from_tables, -) - -paths = PathFinder(__file__) - -# Details for dataset to export. -DATASET_SHORT_NAME = "owid_energy" -DATASET_TITLE = "Energy dataset (OWID, 2022)" -METADATA_FILE_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.meta.yml" -# Details for datasets to import. -ENERGY_MIX_DATASET_PATH = DATA_DIR / "garden/bp/2022-12-28/energy_mix" -ENERGY_MIX_TABLE_NAME = "energy_mix" -FOSSIL_FUEL_PRODUCTION_DATASET_PATH = DATA_DIR / "garden/energy/2022-12-28/fossil_fuel_production" -FOSSIL_FUEL_PRODUCTION_TABLE_NAME = "fossil_fuel_production" -PRIMARY_ENERGY_CONSUMPTION_DATASET_PATH = DATA_DIR / "garden/energy/2022-12-28/primary_energy_consumption" -PRIMARY_ENERGY_CONSUMPTION_TABLE_NAME = "primary_energy_consumption" -ELECTRICITY_MIX_DATASET_PATH = DATA_DIR / "garden/energy/2022-12-28/electricity_mix" -ELECTRICITY_MIX_TABLE_NAME = "electricity_mix" -# Population and GDP are only used to add the population and gdp columns (and no other derived variables). -POPULATION_DATASET_PATH = DATA_DIR / "garden/owid/latest/key_indicators/" -GDP_DATASET_PATH = DATA_DIR / "garden/ggdc/2020-10-01/ggdc_maddison" -# Path to file with mapping of variable names from one of the datasets to the final energy dataset. -VARIABLE_MAPPING_FILE = CURRENT_DIR / "owid_energy_variable_mapping.csv" - - -def combine_tables_data_and_metadata( - tables: Dict[str, catalog.Table], - countries_regions: catalog.Table, - gdp: pd.DataFrame, - variable_mapping: pd.DataFrame, -) -> catalog.Table: - """Combine data and metadata of a list of tables, map variable names and add variables metadata. - - Parameters - ---------- - tables : dict - Dictionary where the key is the short name of the table, and the value is the actual table, for all tables to be - combined. - countries_regions : catalog.Table - Main table from countries-regions dataset. - gdp: pd.DataFrame - GDP (from owid catalog, after converting into a dataframe, resetting index, and selecting country, year and gdp - columns). - variable_mapping : pd.DataFrame - Dataframe (with columns variable, source_variable, source_dataset, description, source) that specifies the names - of variables to take from each table, and their new name in the output table. It also gives a description of the - variable, and the sources of the table. - - Returns - ------- - tb_combined : catalog.Table - Combined table with metadata. - - """ - # Merge all tables as a dataframe (without metadata). - dfs = [pd.DataFrame(table) for table in tables.values()] - df_combined = dataframes.multi_merge(dfs, on=["country", "year"], how="outer") - - # Add ISO codes for countries (regions that are not in countries-regions dataset will have nan iso_code). - df_combined = pd.merge(df_combined, countries_regions, left_on="country", right_on="name", how="left") - - # Add population and gdp of countries (except for dataset-specific regions e.g. those ending in (BP) or (Shift)). 
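- # (Presumably, population for historical regions such as the USSR is built from their - # successor countries, while rows for (BP)- or (Shift)-specific regions keep NaN population and gdp.)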
- - historical_regions = { - region: HISTORIC_TO_CURRENT_REGION[region] - for region in HISTORIC_TO_CURRENT_REGION - if region in BUILD_POPULATION_FOR_HISTORICAL_COUNTRIES - } - df_combined = add_population(df=df_combined, regions=historical_regions, warn_on_missing_countries=False) - df_combined = pd.merge(df_combined, gdp, on=["country", "year"], how="left") - - # Check that there are no repeated column names. - error = "Repeated columns in combined data." - assert len([column for column in set(df_combined.columns) if "_x" in column]) == 0, error - - # Create a table with combined data and no metadata. - tb_combined = catalog.Table(df_combined) - - # List the names of the variables described in the variable mapping file. - source_variables = variable_mapping.index.get_level_values(0).tolist() - - # Gather original metadata for each variable, add the descriptions and sources from the variable mapping file. - for source_variable in source_variables: - variable_metadata = variable_mapping.loc[source_variable] - source_dataset = variable_metadata["source_dataset"] - # Check that the variable indeed exists in the original dataset indicated by the variable mapping. - # Ignore columns "country", "year" (assigned to a dummy dataset 'various_datasets'), "population" (that comes - # from key_indicators) and "iso_alpha3" (that comes from countries_regions dataset). - if source_dataset not in [ - "various_datasets", - "countries_regions", - "key_indicators", - "maddison_gdp", - ]: - error = f"Variable {source_variable} not found in any of the original datasets." - assert source_variable in tables[source_dataset].columns, error - tb_combined[source_variable].metadata = tables[source_dataset][source_variable].metadata - - # Update metadata with the content of the variable mapping file. - tb_combined[source_variable].metadata.description = variable_metadata["description"] - tb_combined[source_variable].metadata.sources = [catalog.meta.Source(name=variable_metadata["source"])] - - # Select only variables in the mapping file, and rename variables according to the mapping. - tb_combined = tb_combined[source_variables].rename(columns=variable_mapping.to_dict()["variable"]) - - # Remove rows that only have nan (ignoring whether country, year, iso_code, population and gdp have data). - columns_that_must_have_data = [ - column for column in tb_combined.columns if column not in ["country", "year", "iso_code", "population", "gdp"] - ] - tb_combined = tb_combined.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) - - # Sanity check. - columns_with_inf = [column for column in tb_combined.columns if len(tb_combined[tb_combined[column] == np.inf]) > 0] - assert len(columns_with_inf) == 0, f"Infinity values detected in columns: {columns_with_inf}" - - # Set index and sort conveniently. - tb_combined = tb_combined.set_index(["country", "year"], verify_integrity=True).sort_index() - - return cast(catalog.Table, tb_combined) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read all required datasets. - ds_energy_mix = catalog.Dataset(ENERGY_MIX_DATASET_PATH) - ds_fossil_fuels = catalog.Dataset(FOSSIL_FUEL_PRODUCTION_DATASET_PATH) - ds_primary_energy = catalog.Dataset(PRIMARY_ENERGY_CONSUMPTION_DATASET_PATH) - ds_electricity_mix = catalog.Dataset(ELECTRICITY_MIX_DATASET_PATH) - - # Gather all required tables from all datasets. 
- tb_energy_mix = ds_energy_mix[ENERGY_MIX_TABLE_NAME].reset_index() - tb_fossil_fuels = ds_fossil_fuels[FOSSIL_FUEL_PRODUCTION_TABLE_NAME].reset_index() - tb_primary_energy = ds_primary_energy[PRIMARY_ENERGY_CONSUMPTION_TABLE_NAME].reset_index() - tb_electricity_mix = ds_electricity_mix[ELECTRICITY_MIX_TABLE_NAME].reset_index() - - # Load countries-regions dataset (required to get ISO codes). - countries_regions = cast(catalog.Dataset, paths.load_dependency("regions"))["regions"] - - # Population data will also be loaded (used only to add a population column, and not to create any other derived - # variables). Historical regions will be added to the population. - - # Load gdp (used only to add gdp column, and no other derived variables). - gdp = ( - pd.DataFrame(catalog.Dataset(GDP_DATASET_PATH)["maddison_gdp"]) - .reset_index()[["country", "year", "gdp"]] - .dropna() - ) - - # Load mapping from variable names in the component dataset to the final variable name in the output dataset. - variable_mapping = pd.read_csv(VARIABLE_MAPPING_FILE).set_index(["source_variable"]) - - # - # Process data. - # - # Combine all tables. - tables = { - "energy_mix": tb_energy_mix.drop(columns=["country_code"], errors="ignore"), - "fossil_fuel_production": tb_fossil_fuels, - "primary_energy_consumption": tb_primary_energy.drop(columns=["gdp", "population", "source"], errors="ignore"), - "electricity_mix": tb_electricity_mix.drop( - columns=["population", "primary_energy_consumption__twh"], errors="ignore" - ), - } - tb_combined = combine_tables_data_and_metadata( - tables=tables, - countries_regions=countries_regions, - gdp=gdp, - variable_mapping=variable_mapping, - ) - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = catalog.Dataset.create_empty(dest_dir) - - # Gather metadata sources from all tables' original dataset sources. - ds_garden.metadata.sources = gather_sources_from_tables(tables=list(tables.values())) - - # Add table to dataset. - tb_combined.metadata.short_name = "owid_energy" - ds_garden.add(tb_combined) - - # Update dataset and table metadata using yaml file. - ds_garden.update_metadata(METADATA_FILE_PATH) - - # Save dataset. 
- ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2022-12-28/owid_energy_variable_mapping.csv b/etl/steps/archive/garden/energy/2022-12-28/owid_energy_variable_mapping.csv deleted file mode 100644 index 8c6c44b25af..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-28/owid_energy_variable_mapping.csv +++ /dev/null @@ -1,130 +0,0 @@ -variable,source_variable,source_dataset,description,source -country,country,various_datasets,Geographic location,Our World in Data -year,year,various_datasets,Year of observation,Our World in Data -iso_code,iso_alpha3,countries_regions,ISO 3166-1 alpha-3 three-letter country codes,International Organization for Standardization -population,population,key_indicators,"Population","Calculated by Our World in Data based on different sources (https://ourworldindata.org/population-sources)" -gdp,gdp,maddison_gdp,"Total real gross domestic product, inflation-adjusted",Maddison Project Database -biofuel_cons_change_pct,biofuels__pct_growth,energy_mix,Annual percentage change in biofuel consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -biofuel_cons_change_twh,biofuels__twh_growth,energy_mix,"Annual change in biofuel consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -biofuel_cons_per_capita,biofuels_per_capita__kwh,energy_mix,"Per capita primary energy consumption from biofuels, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -biofuel_consumption,biofuels__twh,energy_mix,"Primary energy consumption from biofuels, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -biofuel_elec_per_capita,per_capita_bioenergy_generation__kwh,electricity_mix,"Per capita electricity generation from biofuels, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -biofuel_electricity,bioenergy_generation__twh,electricity_mix,"Electricity generation from biofuels, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -biofuel_share_elec,bioenergy_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from biofuels,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -biofuel_share_energy,biofuels__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from biofuels,Calculated by Our World in Data based on BP Statistical Review of World Energy -carbon_intensity_elec,co2_intensity__gco2_kwh,electricity_mix,"Carbon intensity of electricity production, measured in grams of carbon dioxide emitted per kilowatt-hour",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -coal_cons_change_pct,coal__pct_growth,energy_mix,Annual percentage change in coal consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -coal_cons_change_twh,coal__twh_growth,energy_mix,"Annual change in coal consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -coal_cons_per_capita,coal_per_capita__kwh,energy_mix,"Per capita primary energy consumption from coal, measured in 
kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -coal_consumption,coal__twh,energy_mix,"Primary energy consumption from coal, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -coal_elec_per_capita,per_capita_coal_generation__kwh,electricity_mix,"Per capita electricity generation from coal, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -coal_electricity,coal_generation__twh,electricity_mix,"Electricity generation from coal, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -coal_prod_change_pct,annual_change_in_coal_production__pct,fossil_fuel_production,Annual percentage change in coal production,Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -coal_prod_change_twh,annual_change_in_coal_production__twh,fossil_fuel_production,"Annual change in coal production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -coal_prod_per_capita,coal_production_per_capita__kwh,fossil_fuel_production,"Per capita coal production, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -coal_production,coal_production__twh,fossil_fuel_production,"Coal production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -coal_share_elec,coal_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from coal,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -coal_share_energy,coal__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from coal,Calculated by Our World in Data based on BP Statistical Review of World Energy -electricity_demand,total_demand__twh,electricity_mix,"Electricity demand, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -electricity_generation,total_generation__twh,electricity_mix,"Electricity generation, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -electricity_share_energy,total_electricity_share_of_primary_energy__pct,electricity_mix,"Electricity generation as a share of primary energy",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -energy_cons_change_pct,annual_change_in_primary_energy_consumption__pct,primary_energy_consumption,Annual percentage change in primary energy consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy and EIA International Energy Data -energy_cons_change_twh,annual_change_in_primary_energy_consumption__twh,primary_energy_consumption,"Annual change in primary energy consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and EIA International Energy Data 
-energy_per_capita,primary_energy_consumption_per_capita__kwh,primary_energy_consumption,"Primary energy consumption per capita, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and EIA International Energy Data -energy_per_gdp,primary_energy_consumption_per_gdp__kwh_per_dollar,primary_energy_consumption,Energy consumption per unit of GDP. This is measured in kilowatt-hours per 2011 international-$.,"Calculated by Our World in Data based on BP Statistical Review of World Energy, EIA International Energy Data and Maddison Project Database" -fossil_cons_change_pct,fossil_fuels__pct_growth,energy_mix,Annual percentage change in fossil fuel consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -fossil_cons_change_twh,fossil_fuels__twh_growth,energy_mix,"Annual change in fossil fuel consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -fossil_elec_per_capita,per_capita_fossil_generation__kwh,electricity_mix,"Per capita electricity generation from fossil fuels, measured in kilowatt-hours. This is the sum of electricity generated from coal, oil and gas.",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -fossil_electricity,fossil_generation__twh,electricity_mix,"Electricity generation from fossil fuels, measured in terawatt-hours. This is the sum of electricity generation from coal, oil and gas.",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -fossil_energy_per_capita,fossil_fuels_per_capita__kwh,energy_mix,"Per capita fossil fuel consumption, measured in kilowatt-hours. This is the sum of primary energy from coal, oil and gas.",Calculated by Our World in Data based on BP Statistical Review of World Energy -fossil_fuel_consumption,fossil_fuels__twh,energy_mix,"Fossil fuel consumption, measured in terawatt-hours. 
This is the sum of primary energy from coal, oil and gas.",Calculated by Our World in Data based on BP Statistical Review of World Energy -fossil_share_elec,fossil_share_of_electricity__pct,electricity_mix,"Share of electricity generation that comes from fossil fuels (coal, oil and gas combined)",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -fossil_share_energy,fossil_fuels__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from fossil fuels,Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_cons_change_pct,gas__pct_growth,energy_mix,Annual percentage change in gas consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_cons_change_twh,gas__twh_growth,energy_mix,"Annual change in gas consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_consumption,gas__twh,energy_mix,"Primary energy consumption from gas, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_elec_per_capita,per_capita_gas_generation__kwh,electricity_mix,"Per capita electricity generation from gas, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -gas_electricity,gas_generation__twh,electricity_mix,"Electricity generation from gas, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -gas_energy_per_capita,gas_per_capita__kwh,energy_mix,"Per capita primary energy consumption from gas, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_prod_change_pct,annual_change_in_gas_production__pct,fossil_fuel_production,Annual percentage change in gas production,Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -gas_prod_change_twh,annual_change_in_gas_production__twh,fossil_fuel_production,"Annual change in gas production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -gas_prod_per_capita,gas_production_per_capita__kwh,fossil_fuel_production,"Per capita gas production, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -gas_production,gas_production__twh,fossil_fuel_production,"Gas production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -gas_share_elec,gas_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from gas,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -gas_share_energy,gas__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from gas,Calculated by Our World in Data based on BP Statistical Review of World Energy -greenhouse_gas_emissions,total_emissions__mtco2,electricity_mix,"Greenhouse-gas emissions produced in the generation of electricity, measured in million tonnes of CO2 equivalent",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and 
European Electricity Review -hydro_cons_change_pct,hydro__pct_growth,energy_mix,Annual percentage change in hydropower consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -hydro_cons_change_twh,hydro__twh_growth__equivalent,energy_mix,"Annual change in hydropower consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -hydro_consumption,hydro__twh__equivalent,energy_mix,"Primary energy consumption from hydropower, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -hydro_elec_per_capita,per_capita_hydro_generation__kwh,electricity_mix,"Per capita electricity generation from hydropower, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -hydro_electricity,hydro_generation__twh,electricity_mix,"Electricity generation from hydropower, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -hydro_energy_per_capita,hydro_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from hydropower, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -hydro_share_elec,hydro_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from hydropower,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -hydro_share_energy,hydro__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from hydropower,Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_cons_change_pct,low_carbon_energy__pct_growth,energy_mix,Annual percentage change in low-carbon energy consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_cons_change_twh,low_carbon_energy__twh_growth__equivalent,energy_mix,"Annual change in low-carbon energy consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_consumption,low_carbon_energy__twh__equivalent,energy_mix,"Primary energy consumption from low-carbon sources, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_elec_per_capita,per_capita_low_carbon_generation__kwh,electricity_mix,"Per capita electricity generation from low-carbon sources, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -low_carbon_electricity,low_carbon_generation__twh,electricity_mix,"Electricity generation from low-carbon sources, measured in terawatt-hours. 
This is the sum of electricity generation from renewables and nuclear power",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -low_carbon_energy_per_capita,low_carbon_energy_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from low-carbon sources, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_share_elec,low_carbon_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from low-carbon sources. This is the sum of electricity from renewables and nuclear,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -low_carbon_share_energy,low_carbon_energy__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from low-carbon sources. This is the sum of primary energy from renewables and nuclear,Calculated by Our World in Data based on BP Statistical Review of World Energy -net_elec_imports,total_net_imports__twh,electricity_mix,"Net electricity imports, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -net_elec_imports_share_demand,net_imports_share_of_demand__pct,electricity_mix,Net electricity imports as a share of electricity demand,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -nuclear_cons_change_pct,nuclear__pct_growth,energy_mix,Annual percentage change in nuclear consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -nuclear_cons_change_twh,nuclear__twh_growth__equivalent,energy_mix,"Annual change in nuclear consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -nuclear_consumption,nuclear__twh__equivalent,energy_mix,"Primary energy consumption from nuclear power, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -nuclear_elec_per_capita,per_capita_nuclear_generation__kwh,electricity_mix,"Per capita electricity generation from nuclear power, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -nuclear_electricity,nuclear_generation__twh,electricity_mix,"Electricity generation from nuclear power, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -nuclear_energy_per_capita,nuclear_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from nuclear, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -nuclear_share_elec,nuclear_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from nuclear power,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -nuclear_share_energy,nuclear__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from nuclear power,Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_cons_change_pct,oil__pct_growth,energy_mix,Annual percentage change in oil 
consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_cons_change_twh,oil__twh_growth,energy_mix,"Annual change in oil consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_consumption,oil__twh,energy_mix,"Primary energy consumption from oil, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_elec_per_capita,per_capita_oil_generation__kwh,electricity_mix,"Per capita electricity generation from oil, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -oil_electricity,oil_generation__twh,electricity_mix,"Electricity generation from oil, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -oil_energy_per_capita,oil_per_capita__kwh,energy_mix,"Per capita primary energy consumption from oil, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_prod_change_pct,annual_change_in_oil_production__pct,fossil_fuel_production,Annual percentage change in oil production,Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -oil_prod_change_twh,annual_change_in_oil_production__twh,fossil_fuel_production,"Annual change in oil production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -oil_prod_per_capita,oil_production_per_capita__kwh,fossil_fuel_production,"Per capita oil production, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -oil_production,oil_production__twh,fossil_fuel_production,"Oil production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -oil_share_elec,oil_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from oil,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -oil_share_energy,oil__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from oil,Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewable_consumption,other_renewables__twh__equivalent,energy_mix,"Primary energy consumption from other renewables, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewable_electricity,other_renewables_including_bioenergy_generation__twh,electricity_mix,"Electricity generation from other renewable sources including biofuels, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewable_exc_biofuel_electricity,other_renewables_excluding_bioenergy_generation__twh,electricity_mix,"Electricity generation from other renewable sources excluding biofuels, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_cons_change_pct,other_renewables__pct_growth,energy_mix,Annual percentage change in energy 
consumption from other renewables,Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewables_cons_change_twh,other_renewables__twh_growth__equivalent,energy_mix,"Annual change in other renewable consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewables_elec_per_capita,per_capita_other_renewables_including_bioenergy_generation__kwh,electricity_mix,"Per capita electricity generation from other renewables including biofuels, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_elec_per_capita_exc_biofuel,per_capita_other_renewables_excluding_bioenergy_generation__kwh,electricity_mix,"Per capita electricity generation from other renewables excluding biofuels, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_energy_per_capita,other_renewables_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from other renewables, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewables_share_elec,other_renewables_including_bioenergy_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from other renewables including biofuels,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_share_elec_exc_biofuel,other_renewables_excluding_bioenergy_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from other renewables excluding biofuels,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_share_energy,other_renewables__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from other renewables,Calculated by Our World in Data based on BP Statistical Review of World Energy -per_capita_electricity,per_capita_total_generation__kwh,electricity_mix,"Electricity generation per capita, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -primary_energy_consumption,primary_energy_consumption__twh,primary_energy_consumption,"Primary energy consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and EIA International Energy Data -renewables_cons_change_pct,renewables__pct_growth,energy_mix,Annual percentage change in renewable energy consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -renewables_cons_change_twh,renewables__twh_growth__equivalent,energy_mix,"Annual change in renewable energy consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -renewables_consumption,renewables__twh__equivalent,energy_mix,"Primary energy consumption from renewables, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -renewables_elec_per_capita,per_capita_renewable_generation__kwh,electricity_mix,"Per capita electricity generation from renewables, measured in kilowatt-hours",Calculated by Our 
World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -renewables_electricity,renewable_generation__twh,electricity_mix,"Electricity generation from renewables, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -renewables_energy_per_capita,renewables_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from renewables, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -renewables_share_elec,renewable_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from renewables,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -renewables_share_energy,renewables__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from renewables,Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_cons_change_pct,solar__pct_growth,energy_mix,Annual percentage change in solar consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_cons_change_twh,solar__twh_growth__equivalent,energy_mix,"Annual change in solar consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_consumption,solar__twh__equivalent,energy_mix,"Primary energy consumption from solar, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_elec_per_capita,per_capita_solar_generation__kwh,electricity_mix,"Per capita electricity generation from solar, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -solar_electricity,solar_generation__twh,electricity_mix,"Electricity generation from solar, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -solar_energy_per_capita,solar_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from solar, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_share_elec,solar_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from solar,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -solar_share_energy,solar__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from solar,Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_cons_change_pct,wind__pct_growth,energy_mix,Annual percentage change in wind consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_cons_change_twh,wind__twh_growth__equivalent,energy_mix,"Annual change in wind consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_consumption,wind__twh__equivalent,energy_mix,"Primary energy consumption from wind, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_elec_per_capita,per_capita_wind_generation__kwh,electricity_mix,"Per capita 
electricity generation from wind, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -wind_electricity,wind_generation__twh,electricity_mix,"Electricity generation from wind, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -wind_energy_per_capita,wind_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from wind, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_share_elec,wind_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from wind,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -wind_share_energy,wind__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from wind,Calculated by Our World in Data based on BP Statistical Review of World Energy diff --git a/etl/steps/archive/garden/energy/2022-12-28/primary_energy_consumption.meta.yml b/etl/steps/archive/garden/energy/2022-12-28/primary_energy_consumption.meta.yml deleted file mode 100644 index ccc4663fda6..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-28/primary_energy_consumption.meta.yml +++ /dev/null @@ -1,153 +0,0 @@ -dataset: - namespace: energy - version: 2022-12-28 - title: Primary energy consumption (BP & EIA, 2022) - short_name: primary_energy_consumption - description: >- - Primary energy consumption data was compiled by Our World in Data based on two key data sources: - - 1. [BP Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html). - - 2. [International energy data from the U.S. Energy Information Administration (EIA)](https://www.eia.gov/international/data/world/total-energy/more-total-energy-data). - - - BP provides the longest and most up-to-date time-series of primary energy. However, it does not provide data for all countries. We have therefore supplemented this dataset with energy data - from the EIA. Where BP provides data for a given country, this data is adopted; for countries where this data is missing, we rely on EIA energy figures. - - - Per capita figures have been calculated using a population dataset that is built and maintained by Our World in Data, based on [different sources](https://ourworldindata.org/population-sources). - - - To calculate energy per unit of GDP, we use total real GDP figures from [the Maddison Project Database, version 2020](https://www.rug.nl/ggdc/historicaldevelopment/maddison/releases/maddison-project-database-2020). - This dataset is based on Bolt, Jutta and Jan Luiten van Zanden (2020), “Maddison style estimates of the evolution of the world economy. A new 2020 update ”. GDP is measured in 2011$ which are PPP-adjusted. - sources: - - name: Our World in Data based on BP Statistical Review of World Energy (2022) - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - description: >- - BP's region definitions sometimes differ from Our World in Data's definitions. 
For example, BP's North America includes only Canada, Mexico and United States, whereas Our World in Data's North America includes countries in Central America (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like "North America (BP)" to refer to BP's original data using their definition of the region, as well as "North America", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the contributions from the countries in the region. - - - [BP's region definitions](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy/using-the-review/definitions-and-explanatory-notes.html#accordion_Regional%20definitions), denoted with "(BP)", are: - - * "Asia Pacific (BP)": Brunei, Cambodia, China (Mainland), China Hong Kong SAR (Special Administrative Region), China Macau SAR (Special Administrative Region), Indonesia, Japan, Laos, Malaysia, Mongolia, North Korea, Philippines, - Singapore, South Asia (Afghanistan, Bangladesh, India, Myanmar, Nepal, Pakistan and Sri Lanka), South Korea, Taiwan, Thailand, Vietnam, Australia, New Zealand, Papua New Guinea and Oceania. - - * "Australasia (BP)": Australia, New Zealand. - - * "CIS (BP)" - Commonwealth of Independent States: Armenia, Azerbaijan, Belarus, Kazakhstan, Kyrgyzstan, Moldova, Russian Federation, Tajikistan, Turkmenistan, Uzbekistan. - - * "Caribbean (BP)": Atlantic islands between the US Gulf Coast and South America, including Puerto Rico, US Virgin Islands and Bermuda. - - * "Central America (BP)": Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama - - * "Eastern Africa (BP)": Territories on the east coast of Africa from Sudan to Mozambique. Also Madagascar, Malawi, Uganda, Zambia, Zimbabwe. - - * "Europe (BP)": European members of the OECD plus Albania, Bosnia-Herzegovina, Bulgaria, Croatia, Cyprus, Georgia, Gibraltar, Latvia, Lithuania, Malta, Montenegro, North Macedonia, Romania, Serbia and Ukraine. - - * "Middle Africa (BP)": Angola, Cameroon, Central African Republic, Chad, Democratic Republic of Congo, Republic of Congo, Equatorial Guinea, Gabon, Sao Tome & Principe. - - * "Middle East (BP)": Arabian Peninsula, Iran, Iraq, Israel, Jordan, Lebanon, Syria. - - * "Non-OECD (BP)" - Organization for Economic Co-operation and Development: All countries that are not members of the OECD. - - * "North America (BP)": US (excluding US territories), Canada, Mexico - - * "Northern Africa (BP)": Territories on the north coast of Africa from Egypt to Western Sahara. - - * "OECD (BP)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, - Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, UK, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, US. - - * "OPEC (BP)" - Organization of the Petroleum Exporting Countries: Iran, Iraq, Kuwait, Saudi Arabia, United Arab Emirates, Algeria, Libya, Angola, Equatorial Guinea, Gabon, Nigeria, Republic of Congo, Venezuela. - - * "South and Central America (BP)": Caribbean (including Puerto Rico and US Virgin Islands), Bermuda, Central and South America. - - * "Southern Africa (BP)": Botswana, Lesotho, Namibia, South Africa, Swaziland. 
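(Aside: a minimal sketch, with made-up numbers, of the "adding up contributions" aggregation described above. The member list and values are illustrative; the real steps resolve region members via the shared region helpers.)

import pandas as pd

# Toy country-level data (values are illustrative only).
df = pd.DataFrame(
    {
        "country": ["Canada", "Mexico", "United States"],
        "year": [2020, 2020, 2020],
        "primary_energy_consumption__twh": [3500.0, 1700.0, 25000.0],
    }
)

# Build the regional aggregate by summing the member countries' contributions.
members = ["Canada", "Mexico", "United States"]
region = (
    df[df["country"].isin(members)]
    .groupby("year", as_index=False)["primary_energy_consumption__twh"]
    .sum()
    .assign(country="North America (BP)")
)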
- - * "Western Africa (BP)": Territories on the west coast of Africa from Mauritania to Nigeria, including Burkina Faso, Cape Verde, Mali and Niger. - - - Additionally, BP includes some regions that are not explicitly defined (e.g. "Other Europe", or "Other CIS"). We define our regions in the following way: - - * "Africa" - All African countries + "Other Africa". - - * "Asia" - All Asian countries + "Other Middle East" + "Other CIS" + "Other Asia Pacific". - - * "Europe" - All European countries + "Other Europe". - - * "North America" - All North American countries + "Other Caribbean" + "Other North America". - - * "Oceania" - All Oceanian countries. - - * "South America" - All South American countries + "Other South America". - - Where the individual countries in each region are defined [in this map](https://ourworldindata.org/world-region-map-definitions). Additional BP regions are ignored, since they belong to - other regions already included (e.g. the data for "Other Western Africa" is included in "Other Africa"). Finally, income groups are constructed following the definitions - [in this map](https://ourworldindata.org/grapher/world-banks-income-groups). - - name: Our World in Data based on EIA International energy data (2022) - published_by: U.S. Energy Information Administration (EIA) - date_accessed: 2022-07-27 - url: https://www.eia.gov/opendata/bulkfiles.php - description: | - Total energy consumption, extracted from EIA's international energy data from the EIA, downloaded using their [Bulk Download Facility](https://www.eia.gov/opendata/bulkfiles.php). - - EIA's region definitions sometimes differ from Our World in Data's definitions. For example, in EIA's data, Russia is not included in Europe, whereas Our World in Data includes Russia in Europe (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like "Europe (EIA)" to refer to EIA's original data using their definition of the region, as well as "Europe", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the contributions from the countries in the region. - - name: Maddison Project Database 2020 (Bolt and van Zanden, 2020) - published_by: "Bolt, Jutta and Jan Luiten van Zanden (2020), 'Maddison style estimates of the evolution of the world economy. A new 2020 update'." - date_accessed: 2022-04-12 - url: https://www.rug.nl/ggdc/historicaldevelopment/maddison/releases/maddison-project-database-2020 -tables: - primary_energy_consumption: - variables: - annual_change_in_primary_energy_consumption__pct: - title: Annual change in primary energy consumption (%) - short_unit: '%' - unit: '%' - display: - name: Annual change in primary energy consumption - annual_change_in_primary_energy_consumption__twh: - title: Annual change in primary energy consumption (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Annual change in primary energy consumption - gdp: - title: GDP - short_unit: $ - unit: 2011 int-$ - description: >- - Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over - time (inflation) and price differences between countries. Calculated by multiplying GDP per capita with population. 
- display: - numDecimalPlaces: 0 - population: - title: Population - unit: people - primary_energy_consumption__twh: - title: Primary energy consumption (TWh) - short_unit: TWh - unit: terawatt-hours - description: Primary energy consumption, measured in terawatt-hours per year. - display: - name: Primary energy consumption - numDecimalPlaces: 0 - primary_energy_consumption_per_gdp__kwh_per_dollar: - title: Primary energy consumption per GDP (kWh/$) - short_unit: kWh - unit: kilowatt-hours per $ - description: Primary energy consumption per unit of gross domestic product, measured in kilowatt-hours per international-$. - display: - name: Energy consumption per dollar - primary_energy_consumption_per_capita__kwh: - title: Primary energy consumption per capita (kWh/person) - short_unit: kWh - unit: kilowatt-hours per capita - description: Primary energy consumption per capita, measured in kilowatt-hours per person per year. - display: - name: Per capita energy consumption - numDecimalPlaces: 0 - source: - title: Source of data - short_unit: source - unit: source diff --git a/etl/steps/archive/garden/energy/2022-12-28/primary_energy_consumption.py b/etl/steps/archive/garden/energy/2022-12-28/primary_energy_consumption.py deleted file mode 100644 index 44a0c364e4c..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-28/primary_energy_consumption.py +++ /dev/null @@ -1,325 +0,0 @@ -"""Garden step for Primary energy consumption dataset (part of the OWID Energy dataset), based on a combination of BP's -Statistical Review dataset and EIA data on energy consumption. - -""" - -from typing import cast - -import numpy as np -import pandas as pd -from owid import catalog -from shared import CURRENT_DIR, HISTORIC_TO_CURRENT_REGION, add_population -from structlog import get_logger - -from etl.paths import DATA_DIR - -log = get_logger() - -# Namespace and dataset short name for output dataset. -NAMESPACE = "energy" -DATASET_SHORT_NAME = "primary_energy_consumption" -# Metadata file. -METADATA_FILE_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.meta.yml" -# Path to EIA energy consumption dataset. -EIA_DATASET_PATH = DATA_DIR / "garden" / "eia" / "2022-07-27" / "energy_consumption" -# Path to BP statistical review dataset. -BP_DATASET_PATH = DATA_DIR / "garden" / "bp" / "2022-12-28" / "statistical_review" -# Path to GGDC Maddison 2020 GDP dataset. -GGDC_DATASET_PATH = DATA_DIR / "garden" / "ggdc" / "2020-10-01" / "ggdc_maddison" - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 - -# Countries whose data have to be removed since they were identified as outliers. -OUTLIERS = ["Gibraltar"] - - -def load_bp_data() -> catalog.Table: - """Load BP data from the local catalog, and rename columns conveniently. - - Returns - ------- - bp_table : catalog.Table - BP data as a table with metadata. - - """ - # Load BP Statistical Review dataset. - bp_dataset = catalog.Dataset(BP_DATASET_PATH) - - # Get table. - bp_table = bp_dataset[bp_dataset.table_names[0]].reset_index() - bp_columns = { - "country": "country", - "year": "year", - "primary_energy_consumption__twh": "Primary energy consumption (TWh)", - } - bp_table = bp_table[list(bp_columns)].rename(columns=bp_columns) - - # Drop rows with missing values. - bp_table = bp_table.dropna(how="any").reset_index(drop=True) - - return cast(catalog.Table, bp_table) - - -def load_eia_data() -> catalog.Table: - """Load EIA data from the local catalog, and rename columns conveniently. 
- - Returns - ------- - eia_table : catalog.Table - EIA data as a table with metadata. - - """ - # Load EIA energy consumption dataset. - eia_dataset = catalog.Dataset(EIA_DATASET_PATH) - - # Get table. - eia_table = eia_dataset[eia_dataset.table_names[0]].reset_index() - eia_columns = { - "country": "country", - "year": "year", - "energy_consumption": "Primary energy consumption (TWh)", - } - eia_table = eia_table[list(eia_columns)].rename(columns=eia_columns) - - # Drop rows with missing values. - eia_table = eia_table.dropna(how="any").reset_index(drop=True) - - return cast(catalog.Table, eia_table) - - -def load_ggdc_data() -> catalog.Table: - """Load GGDC data on GDP from the local catalog, and rename columns conveniently. - - Returns - ------- - ggdc_table : catalog.Table - GGDC data as a table with metadata. - - """ - # Load GGDC Maddison 2020 dataset on GDP. - ggdc_dataset = catalog.Dataset(GGDC_DATASET_PATH) - - # Get table. - ggdc_table = ggdc_dataset[ggdc_dataset.table_names[0]].reset_index() - ggdc_columns = { - "country": "country", - "year": "year", - "gdp": "GDP", - } - ggdc_table = ggdc_table[list(ggdc_columns)].rename(columns=ggdc_columns) - - # Drop rows with missing values. - ggdc_table = ggdc_table.dropna(how="any").reset_index(drop=True) - - return cast(catalog.Table, ggdc_table) - - -def combine_bp_and_eia_data(bp_table: catalog.Table, eia_table: catalog.Table) -> pd.DataFrame: - """Combine BP and EIA data. - - Parameters - ---------- - bp_table : catalog.Table - Table from BP Statistical Review dataset. - eia_table : catalog.Table - Table from EIA energy consumption dataset. - - Returns - ------- - combined : pd.DataFrame - Combined data. - - """ - # Check that there are no duplicated rows in any of the two datasets. - assert bp_table[bp_table.duplicated(subset=["country", "year"])].empty, "Duplicated rows in BP data." - assert eia_table[eia_table.duplicated(subset=["country", "year"])].empty, "Duplicated rows in EIA data." - - bp_table["source"] = "bp" - eia_table["source"] = "eia" - # Combine EIA data (which goes further back in the past) with BP data (which is more up-to-date). - # On coincident rows, prioritise BP data. - index_columns = ["country", "year"] - combined = cast(pd.DataFrame, pd.concat([eia_table, bp_table], ignore_index=True)).drop_duplicates( - subset=index_columns, keep="last" - ) - - # Convert to conventional dataframe, and sort conveniently. - combined = pd.DataFrame(combined).sort_values(index_columns).reset_index(drop=True) - - return cast(pd.DataFrame, combined) - - -def add_annual_change(df: pd.DataFrame) -> pd.DataFrame: - """Add annual change variables to combined BP & EIA dataset. - - Parameters - ---------- - df : pd.DataFrame - Combined BP & EIA dataset. - - Returns - ------- - combined : pd.DataFrame - Combined BP & EIA dataset after adding annual change variables. - - """ - combined = df.copy() - - # Calculate annual change. - combined = combined.sort_values(["country", "year"]).reset_index(drop=True) - combined["Annual change in primary energy consumption (%)"] = ( - combined.groupby("country")["Primary energy consumption (TWh)"].pct_change() * 100 - ) - combined["Annual change in primary energy consumption (TWh)"] = combined.groupby("country")[ - "Primary energy consumption (TWh)" - ].diff() - - return combined - - -def add_per_capita_variables(df: pd.DataFrame) -> pd.DataFrame: - """Add a population column and add per-capita variables. - - Parameters - ---------- - df : pd.DataFrame - Data. 
- - Returns - ------- - df : pd.DataFrame - Data after adding population and per-capita variables. - - """ - df = df.copy() - - # Add population to data. - df = add_population( - df=df, - country_col="country", - year_col="year", - population_col="Population", - warn_on_missing_countries=False, - regions=HISTORIC_TO_CURRENT_REGION, - ) - - # Calculate consumption per capita. - df["Primary energy consumption per capita (kWh)"] = ( - df["Primary energy consumption (TWh)"] / df["Population"] * TWH_TO_KWH - ) - - return df - - -def add_per_gdp_variables(df: pd.DataFrame, ggdc_table: catalog.Table) -> pd.DataFrame: - """Add a GDP column and add per-gdp variables. - - Parameters - ---------- - df : pd.DataFrame - Data. - ggdc_table : catalog.Table - GDP data from the GGDC Maddison dataset. - - Returns - ------- - df : pd.DataFrame - Data after adding GDP and per-gdp variables. - - """ - df = df.copy() - - # Add GDP to data. - df = pd.merge(df, ggdc_table, on=["country", "year"], how="left") - - # Calculate consumption per GDP. - df["Primary energy consumption per GDP (kWh per $)"] = ( - df["Primary energy consumption (TWh)"] / df["GDP"] * TWH_TO_KWH - ) - - return df - - -def remove_outliers(df: pd.DataFrame) -> pd.DataFrame: - """Remove infinity values and data that has been identified as spurious outliers. - - Parameters - ---------- - df : pd.DataFrame - Data. - - Returns - ------- - df : pd.DataFrame - Data after removing spurious data. - - """ - df = df.copy() - - # Remove spurious values. - df = df.replace(np.inf, np.nan) - - # Remove rows of countries identified as outliers. - df = df[~df["country"].isin(OUTLIERS)].reset_index(drop=True) - - return df - - -def run(dest_dir: str) -> None: - log.info(f"{DATASET_SHORT_NAME}.start") - - # - # Load data. - # - # Load BP statistical review dataset. - bp_table = load_bp_data() - - # Load EIA data on energy_consumption. - eia_table = load_eia_data() - - # Load GGDC Maddison data on GDP. - ggdc_table = load_ggdc_data() - - # - # Process data. - # - # Combine BP and EIA data. - df = combine_bp_and_eia_data(bp_table=bp_table, eia_table=eia_table) - - # Add annual change. - df = add_annual_change(df=df) - - # Add per-capita variables. - df = add_per_capita_variables(df=df) - - # Add per-GDP variables. - df = add_per_gdp_variables(df=df, ggdc_table=ggdc_table) - - # Remove outliers. - df = remove_outliers(df=df) - - # Create an appropriate index and sort conveniently. - df = df.set_index(["country", "year"], verify_integrity=True).sort_index() - - # Create new table. - table = catalog.Table(df, underscore=True) - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = catalog.Dataset.create_empty(dest_dir) - - # Add table to dataset. - table.metadata.short_name = "primary_energy_consumption" - ds_garden.add(table) - - # Update dataset and table metadata using yaml file. - ds_garden.update_metadata(METADATA_FILE_PATH) - - # Save dataset. 
- ds_garden.save() - - log.info(f"{DATASET_SHORT_NAME}.end") diff --git a/etl/steps/archive/garden/energy/2022-12-28/shared.py b/etl/steps/archive/garden/energy/2022-12-28/shared.py deleted file mode 100644 index 25488f162a2..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-28/shared.py +++ /dev/null @@ -1,480 +0,0 @@ -from pathlib import Path -from typing import Any, Dict, List, Optional, Union, cast - -import numpy as np -import pandas as pd -from owid import catalog - -from etl.data_helpers import geo -from etl.paths import DATA_DIR - -CURRENT_DIR = Path(__file__).parent -VERSION = CURRENT_DIR.name - -# When creating region aggregates, decide how to distribute historical regions. -# The following decisions are based on the current location of the countries that succeeded the region and their income -# group. The continent and income group assigned correspond to those of the majority of the population in the member -# countries. -HISTORIC_TO_CURRENT_REGION: Dict[str, Dict[str, Union[str, List[str]]]] = { - "Czechoslovakia": { - "continent": "Europe", - "income_group": "High-income countries", - "regions_included": [ - # Europe - High-income countries. - "Czechia", - "Slovakia", - ], - }, - "East Germany": { - "continent": "Europe", - "income_group": "", - "regions_included": [ - # Europe - High-income countries. - "Germany", - ], - }, - "West Germany": { - "continent": "Europe", - "income_group": "", - "regions_included": [ - # Europe - High-income countries. - "Germany", - ], - }, - "Netherlands Antilles": { - "continent": "North America", - "income_group": "High-income countries", - "regions_included": [ - # North America - High-income countries. - "Aruba", - "Curacao", - "Sint Maarten (Dutch part)", - "Bonaire Sint Eustatius and Saba", - ], - }, - "Serbia and Montenegro": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "regions_included": [ - # Europe - Upper-middle-income countries. - "Serbia", - "Montenegro", - ], - }, - "North Yemen": { - "continent": "Asia", - "income_group": "Low-income countries", - "regions_included": [ - # Asia - Low-income countries. - "Yemen", - ], - }, - "South Yemen": { - "continent": "Asia", - "income_group": "Low-income countries", - "regions_included": [ - # Asia - Low-income countries. - "Yemen", - ], - }, - "USSR": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "regions_included": [ - # Europe - High-income countries. - "Lithuania", - "Estonia", - "Latvia", - # Europe - Upper-middle-income countries. - "Moldova", - "Belarus", - "Russia", - # Europe - Lower-middle-income countries. - "Ukraine", - # Asia - Upper-middle-income countries. - "Georgia", - "Armenia", - "Azerbaijan", - "Turkmenistan", - "Kazakhstan", - # Asia - Lower-middle-income countries. - "Kyrgyzstan", - "Uzbekistan", - "Tajikistan", - ], - }, - "Yugoslavia": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "regions_included": [ - # Europe - High-income countries. - "Croatia", - "Slovenia", - # Europe - Upper-middle-income countries. - "North Macedonia", - "Bosnia and Herzegovina", - "Serbia", - "Montenegro", - ], - }, -} - -# Historical countries whose population can be built by adding up the population of their successor countries. -# Those historical countries not listed here will have no population data. -BUILD_POPULATION_FOR_HISTORICAL_COUNTRIES = [ - # The following regions split into smaller ones, and can be estimated by the population of the successors. 
- "Czechoslovakia", - "Netherlands Antilles", - "Serbia and Montenegro", - "USSR", - "Yugoslavia", - # The following countries cannot be replaced by the successor countries. - # 'East Germany', - # 'West Germany', - # 'North Yemen', - # 'South Yemen', -] - - -# Historical countries for which we don't have population, and can't be built from successor countries. -EXPECTED_COUNTRIES_WITHOUT_POPULATION = list( - set(HISTORIC_TO_CURRENT_REGION) - set(BUILD_POPULATION_FOR_HISTORICAL_COUNTRIES) -) - -# Overlaps found between historical regions and successor countries, that we accept in the data. -# We accept them either because they happened close to the transition, or to avoid needing to introduce new -# countries for which we do not have data (like the Russian Empire). -ACCEPTED_OVERLAPS = { - # 1991: {"Georgia", "USSR"}, -} - - -def gather_sources_from_tables( - tables: List[catalog.Table], -) -> List[catalog.meta.Source]: - """Gather unique sources from the metadata.dataset of each table in a list of tables. - - Note: To check if a source is already listed, only the name of the source is considered (not the description or any - other field in the source). - - Parameters - ---------- - tables : list - List of tables with metadata. - - Returns - ------- - known_sources : list - List of unique sources from all tables. - - """ - # Initialise list that will gather all unique metadata sources from the tables. - known_sources: List[catalog.meta.Source] = [] - for table in tables: - # Get list of sources of the dataset of current table. - table_sources = table.metadata.dataset.sources - # Go source by source of current table, and check if its name is not already in the list of known_sources. - for source in table_sources: - # Check if this source's name is different to all known_sources. - if all([source.name != known_source.name for known_source in known_sources]): - # Add the new source to the list. - known_sources.append(source) - - return known_sources - - -def get_countries_in_region( - region: str, region_modifications: Optional[Dict[str, Dict[str, List[str]]]] = None -) -> List[str]: - """Get countries in a region, both for known regions (e.g. "Africa") and custom ones (e.g. "Europe (excl. EU-27)"). - - Parameters - ---------- - region : str - Region name (e.g. "Africa", or "Europe (excl. EU-27)"). - region_modifications : dict or None - If None (or an empty dictionary), the region should be in OWID's countries-regions dataset. - If not None, it should be a dictionary with any (or all) of the following keys: - - "regions_included": List of regions whose countries will be included. - - "regions_excluded": List of regions whose countries will be excluded. - - "countries_included": List of additional individual countries to be included. - - "countries_excluded": List of additional individual countries to be excluded. - NOTE: All regions and countries defined in this dictionary should be in OWID's countries-regions dataset. - - Returns - ------- - countries : list - List of countries in the specified region. - - """ - if region_modifications is None: - region_modifications = {} - - # Check that the fields in the regions_modifications dictionary are well defined. - expected_fields = ["regions_included", "regions_excluded", "countries_included", "countries_excluded"] - assert all([field in expected_fields for field in region_modifications]) - - # Get lists of regions whose countries will be included and excluded. 
- regions_included = region_modifications.get("regions_included", [region]) - regions_excluded = region_modifications.get("regions_excluded", []) - # Get lists of additional individual countries to include and exclude. - countries_included = region_modifications.get("countries_included", []) - countries_excluded = region_modifications.get("countries_excluded", []) - - # List countries from the list of regions included. - countries_set = set( - sum([geo.list_countries_in_region(region_included) for region_included in regions_included], []) - ) - - # Remove all countries from the list of regions excluded. - countries_set -= set( - sum([geo.list_countries_in_region(region_excluded) for region_excluded in regions_excluded], []) - ) - - # Add the list of individual countries to be included. - countries_set |= set(countries_included) - - # Remove the list of individual countries to be excluded. - countries_set -= set(countries_excluded) - - # Convert set of countries into a sorted list. - countries = sorted(countries_set) - - return countries - - -def load_population(regions: Optional[Dict[Any, Any]] = None) -> pd.DataFrame: - """Load OWID population dataset, and add historical regions to it. - - Returns - ------- - population : pd.DataFrame - Population dataset. - - """ - # Load population dataset. - population = catalog.Dataset(DATA_DIR / "garden/owid/latest/key_indicators/")["population"].reset_index()[ - ["country", "year", "population"] - ] - - # Add data for historical regions (if not in population) by adding the population of their current successors. - countries_with_population = population["country"].unique() - - # Consider additional regions (e.g. historical regions). - if regions is None: - regions = {} - missing_countries = [country for country in regions if country not in countries_with_population] - for country in missing_countries: - members = regions[country]["regions_included"] - _population = ( - population[population["country"].isin(members)] - .groupby("year") - .agg({"population": "sum", "country": "nunique"}) - .reset_index() - ) - # Select only years for which we have data for all member countries. - _population = _population[_population["country"] == len(members)].reset_index(drop=True) - _population["country"] = country - population = pd.concat([population, _population], ignore_index=True).reset_index(drop=True) - - error = "Duplicate country-years found in population. Check if historical regions changed." - assert population[population.duplicated(subset=["country", "year"])].empty, error - - return cast(pd.DataFrame, population) - - -def load_income_groups() -> pd.DataFrame: - """Load dataset of income groups and add historical regions to it. - - Returns - ------- - income_groups : pd.DataFrame - Income groups data. - - """ - # Load the World Bank dataset of income groups. - income_groups = catalog.Dataset(DATA_DIR / "garden/wb/2021-07-01/wb_income")["wb_income_group"].reset_index() - - # Add historical regions to income groups.
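The coverage check in `load_population` above is worth spelling out: a historical region's population is only defined for years in which every successor country reports data, which the `nunique` aggregation enforces. A self-contained sketch with made-up numbers:

```python
import pandas as pd

# Toy population data: Slovakia is missing in 1996.
population = pd.DataFrame(
    {
        "country": ["Czechia", "Czechia", "Slovakia"],
        "year": [1995, 1996, 1995],
        "population": [10.3, 10.3, 5.4],
    }
)

members = ["Czechia", "Slovakia"]
_population = (
    population[population["country"].isin(members)]
    .groupby("year")
    .agg({"population": "sum", "country": "nunique"})
    .reset_index()
)
# Keep only years where all members are present; 1996 is dropped because Slovakia is missing.
_population = _population[_population["country"] == len(members)]
print(_population)  # year=1995, population=15.7, country=2
```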
- for historic_region in HISTORIC_TO_CURRENT_REGION: - historic_region_income_group = HISTORIC_TO_CURRENT_REGION[historic_region]["income_group"] - # NOTE: `in` on a Series checks the index, so compare against the values. - if historic_region not in income_groups["country"].values: - historic_region_df = pd.DataFrame( - { - "country": [historic_region], - "income_group": [historic_region_income_group], - } - ) - income_groups = pd.concat([income_groups, historic_region_df], ignore_index=True) - - return cast(pd.DataFrame, income_groups) - - -def add_population( - df: pd.DataFrame, - country_col: str = "country", - year_col: str = "year", - population_col: str = "population", - interpolate_missing_population: bool = False, - warn_on_missing_countries: bool = True, - show_full_warning: bool = True, - regions: Optional[Dict[Any, Any]] = None, - expected_countries_without_population: List[str] = [], -) -> pd.DataFrame: - """Add a column of OWID population to the countries in the data, including population of historical regions. - - This function has been adapted from datautils.geo, because population currently does not include historic regions. - We include them in this function. - - Parameters - ---------- - df : pd.DataFrame - Data without a column for population (after harmonizing elements, items and country names). - country_col : str - Name of country column in data. - year_col : str - Name of year column in data. - population_col : str - Name for new population column in data. - interpolate_missing_population : bool - True to linearly interpolate population on years that are present in df, but for which we do not have - population data; otherwise False to keep missing population data as nans. - For example, if interpolate_missing_population is True and df has data for all years between 1900 and 1910, - but population is only given for 1900 and 1910, population will be linearly interpolated between those years. - warn_on_missing_countries : bool - True to warn if population is not found for any of the countries in the data. - show_full_warning : bool - True to show affected countries if the previous warning is raised. - regions : dict - Definitions of regions whose population also needs to be included. - expected_countries_without_population : list - Countries that are expected to not have population (that should be ignored if warnings are activated). - - Returns - ------- - df_with_population : pd.DataFrame - Data after adding a column for population for all countries in the data. - - """ - - # Load population dataset. - population = load_population(regions=regions).rename( - columns={ - "country": country_col, - "year": year_col, - "population": population_col, - } - )[[country_col, year_col, population_col]] - - # Check if there is any missing country. - missing_countries = set(df[country_col]) - set(population[country_col]) - if len(missing_countries) > 0: - if warn_on_missing_countries: - geo.warn_on_list_of_entities( - list_of_entities=missing_countries, - warning_message=( - f"{len(missing_countries)} countries not found in population" - " dataset. They will remain in the dataset, but have nan" - " population." - ), - show_list=show_full_warning, - ) - - if interpolate_missing_population: - # For some countries we have population data only on certain years, e.g. 1900, 1910, etc. - # Optionally fill missing years linearly.
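The interpolation step that follows reindexes population onto the full country-year grid of the data and then interpolates within each country. A minimal sketch of that pattern, assuming decade-spaced population values:

```python
import pandas as pd

# Population known only for 1900 and 1910.
population = pd.DataFrame(
    {"country": ["France", "France"], "year": [1900, 1910], "population": [40.0, 41.0]}
).set_index(["country", "year"])

# Reindex onto every country-year pair present in the data, then interpolate per country.
full_index = pd.MultiIndex.from_product(
    [["France"], range(1900, 1911)], names=["country", "year"]
)
population = population.reindex(full_index)
population = population.groupby("country").transform(
    lambda x: x.interpolate(method="linear", limit_direction="both")
)
assert population.loc[("France", 1905), "population"] == 40.5
```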
- countries_in_data = df[country_col].unique() - years_in_data = df[year_col].unique() - - population = population.set_index([country_col, year_col]).reindex( - pd.MultiIndex.from_product([countries_in_data, years_in_data], names=[country_col, year_col]) - ) - - population = population.groupby(country_col).transform( - lambda x: x.interpolate(method="linear", limit_direction="both") - ) - - error = "List of countries without population data differs from the list of expected countries without population data." - assert set(population[population[population_col].isnull()].reset_index()[country_col]) == set( - expected_countries_without_population - ), error - - # Add population to original dataframe. - df_with_population = pd.merge(df, population, on=[country_col, year_col], how="left") - - return df_with_population - - -def detect_overlapping_regions( - df, index_columns, region_and_members, country_col="country", year_col="year", ignore_zeros=True -): - """Detect years on which the data for two regions overlap, e.g. a historical region and one of its successors. - - Parameters - ---------- - df : pd.DataFrame - Data (with a dummy index). - index_columns : list - Names of index columns. - region_and_members : dict - Regions to check for overlaps. Each region must have a dictionary "regions_included", listing the subregions - contained. If the region is historical, "regions_included" would be the list of successor countries. - country_col : str, optional - Name of country column (usually "country"). - year_col : str, optional - Name of year column (usually "year"). - ignore_zeros : bool, optional - True to ignore overlaps of zeros. - - Returns - ------- - all_overlaps : dict - All overlaps found. - - """ - # Sum over all columns to get the total sum of each column for each country-year. - df_total = ( - df.groupby([country_col, year_col]) - .agg({column: "sum" for column in df.columns if column not in index_columns}) - .reset_index() - ) - # Create a list of values that will be ignored in overlaps (usually zero or nothing). - if ignore_zeros: - overlapping_values_to_ignore = [0] - else: - overlapping_values_to_ignore = [] - # List all variables in data (ignoring index columns). - variables = [column for column in df.columns if column not in index_columns] - # List all country names found in data. - countries_in_data = df[country_col].unique().tolist() - # List all regions found in data. - regions = [country for country in list(region_and_members) if country in countries_in_data] - # Initialize a dictionary that will store all overlaps found. - all_overlaps = {} - for region in regions: - # List members of current region. - members = [member for member in region_and_members[region]["regions_included"] if member in countries_in_data] - for member in members: - # Select data for current region. - region_values = ( - df_total[df_total[country_col] == region] - .replace(overlapping_values_to_ignore, np.nan) - .dropna(subset=variables, how="all") - ) - # Select data for current member. - member_values = ( - df_total[df_total[country_col] == member] - .replace(overlapping_values_to_ignore, np.nan) - .dropna(subset=variables, how="all") - ) - # Concatenate both selections of data, and select duplicated rows. - combined = pd.concat([region_values, member_values]) - overlaps = combined[combined.duplicated(subset=[year_col], keep=False)] # type: ignore - if len(overlaps) > 0: - # Add the overlap found to the dictionary of all overlaps.
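The overlap test above boils down to: concatenate the region's rows with a member's rows and flag years that appear in both. A toy version, assuming a hypothetical USSR/Russia overlap in 1991:

```python
import pandas as pd

df_total = pd.DataFrame(
    {
        "country": ["USSR", "USSR", "Russia"],
        "year": [1990, 1991, 1991],
        "energy": [100.0, 95.0, 60.0],
    }
)

region_values = df_total[df_total["country"] == "USSR"]
member_values = df_total[df_total["country"] == "Russia"]

# Rows sharing a year between region and member are overlaps.
combined = pd.concat([region_values, member_values])
overlaps = combined[combined.duplicated(subset=["year"], keep=False)]
print({year: set(overlaps["country"]) for year in overlaps["year"].unique()})
# {1991: {'USSR', 'Russia'}}
```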
- all_overlaps.update({year: set(overlaps[country_col]) for year in overlaps[year_col].unique()}) - - # Sort overlaps conveniently. - all_overlaps = {year: all_overlaps[year] for year in sorted(list(all_overlaps))} - - return all_overlaps diff --git a/etl/steps/archive/garden/energy/2022-12-28/uk_historical_electricity.meta.yml b/etl/steps/archive/garden/energy/2022-12-28/uk_historical_electricity.meta.yml deleted file mode 100644 index fc14f13a220..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-28/uk_historical_electricity.meta.yml +++ /dev/null @@ -1,91 +0,0 @@ -dataset: - namespace: energy - version: 2022-12-28 - title: UK historical electricity (DUKES, 2022c) - short_name: uk_historical_electricity - description: | - All data prior to 1985 (and prior to 1965 in the case of renewables) is sourced from [the Digest of UK Energy Statistics (DUKES), published by the UK's Department for Business, Energy & Industrial Strategy](https://www.gov.uk/government/statistics/electricity-chapter-5-digest-of-united-kingdom-energy-statistics-dukes). - - All other data is sourced from [BP's Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html) and [Ember's Yearly Electricity Data](https://ember-climate.org/data-catalogue/yearly-electricity-data/). Where data from BP is available for a given year, we rely on it as the primary source. We then supplement this with data from Ember where data from BP is not available. - sources: - - name: Digest of UK Energy Statistics - published_by: UK's Department for Business, Energy & Industrial Strategy - date_accessed: 2022-09-21 - url: https://www.gov.uk/government/statistical-data-sets/historical-electricity-data - - name: BP Statistical Review of World Energy - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - - name: Ember's Yearly Electricity Data - published_by: Ember - publication_year: 2022 - date_accessed: 2022-12-13 - url: https://ember-climate.org/data-catalogue/yearly-electricity-data/ - - name: Ember's European Electricity Review - published_by: Ember - publication_year: 2022 - date_accessed: 2022-08-01 - url: https://ember-climate.org/insights/research/european-electricity-review-2022/ -tables: - uk_historical_electricity: - variables: - coal_generation: - title: Electricity generation from coal - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - oil_generation: - title: Electricity generation from oil - short_unit: TWh - unit: terawatt-hours - display: - name: Oil - gas_generation: - title: Electricity generation from gas - short_unit: TWh - unit: terawatt-hours - display: - name: Natural gas - nuclear_generation: - title: Electricity generation from nuclear - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - hydro_generation: - title: Electricity generation from hydropower - short_unit: TWh - unit: terawatt-hours - display: - name: Hydropower - solar_generation: - title: Electricity generation from solar - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - wind_generation: - title: Electricity generation from wind - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - other_renewables_generation: - title: Electricity generation from other renewables - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables - total_generation: - title: Total
electricity generation - short_unit: TWh - unit: terawatt-hours - display: - name: Total electricity generation - net_imports: - title: Net electricity imports - short_unit: TWh - unit: terawatt-hours - display: - name: Net electricity imports diff --git a/etl/steps/archive/garden/energy/2022-12-28/uk_historical_electricity.py b/etl/steps/archive/garden/energy/2022-12-28/uk_historical_electricity.py deleted file mode 100644 index 8c8657230e4..00000000000 --- a/etl/steps/archive/garden/energy/2022-12-28/uk_historical_electricity.py +++ /dev/null @@ -1,212 +0,0 @@ -"""Combine UK BEIS' historical electricity with our electricity mix dataset (by BP & Ember) -to obtain a long-run electricity mix in the UK. - -""" - -from typing import cast - -import numpy as np -import pandas as pd -from owid import catalog -from owid.datautils import dataframes -from shared import CURRENT_DIR - -from etl.paths import DATA_DIR - -# Details for dataset to export. -DATASET_SHORT_NAME = "uk_historical_electricity" -DATASET_TITLE = "UK historical electricity" -METADATA_FILE_PATH = CURRENT_DIR / f"{DATASET_SHORT_NAME}.meta.yml" -# Details for datasets to import. -ELECTRICITY_MIX_DATASET_PATH = DATA_DIR / "garden/energy/2022-12-28/electricity_mix" -ELECTRICITY_MIX_TABLE_NAME = "electricity_mix" -UK_BEIS_DATASET_PATH = DATA_DIR / "garden/uk_beis/2022-07-28/uk_historical_electricity" -UK_BEIS_TABLE_NAME = "uk_historical_electricity" - - -def prepare_electricity_mix_data(df_elec: pd.DataFrame) -> pd.DataFrame: - """Select necessary columns from the electricity mix, and select rows corresponding to the UK. - - Parameters - ---------- - df_elec : pd.DataFrame - Data from the main table of the electricity mix dataset. - - Returns - ------- - df_elec : pd.DataFrame - Selected columns and rows from the electricity mix data. - - """ - df_elec = df_elec.copy() - - # Select columns and rename them conveniently. - elec_columns = { - "country": "country", - "year": "year", - "coal_generation__twh": "coal_generation", - "gas_generation__twh": "gas_generation", - "oil_generation__twh": "oil_generation", - "hydro_generation__twh": "hydro_generation", - "nuclear_generation__twh": "nuclear_generation", - "other_renewables_including_bioenergy_generation__twh": "other_renewables_generation", - "solar_generation__twh": "solar_generation", - "total_generation__twh": "total_generation", - "wind_generation__twh": "wind_generation", - "total_net_imports__twh": "net_imports", - } - - # Select necessary columns from electricity mix dataset. - df_elec = df_elec[list(elec_columns)].rename(columns=elec_columns) - - # Select UK data from Ember dataset. - df_elec = df_elec[df_elec["country"] == "United Kingdom"].reset_index(drop=True) - - return df_elec - - -def prepare_beis_data(df_beis: pd.DataFrame) -> pd.DataFrame: - """Select (and rename) columns from the UK historical electricity data from BEIS. - - Parameters - ---------- - df_beis : pd.DataFrame - Combined data for UK historical electricity data from BEIS. - - Returns - ------- - df_beis : pd.DataFrame - Selected columns from the UK historical electricity data. - - """ - df_beis = df_beis.copy() - - # Select columns and rename them conveniently. 
- beis_columns = { - "country": "country", - "year": "year", - "coal": "coal_generation", - "oil": "oil_generation", - "electricity_generation": "total_generation", - "gas": "gas_generation", - "hydro": "hydro_generation", - "nuclear": "nuclear_generation", - "net_imports": "net_imports", - "implied_efficiency": "implied_efficiency", - "wind_and_solar": "wind_and_solar_generation", - } - df_beis = df_beis[list(beis_columns)].rename(columns=beis_columns) - - return df_beis - - -def combine_beis_and_electricity_mix_data(df_beis: pd.DataFrame, df_elec: pd.DataFrame) -> pd.DataFrame: - """Combine BEIS data on UK historical electricity with the electricity mix data (after having selected rows for only - the UK). - - Several processing steps are applied to the data; see the comments in the code below. - - Parameters - ---------- - df_beis : pd.DataFrame - Selected data from BEIS on UK historical electricity. - df_elec : pd.DataFrame - Selected data from the electricity mix (after having selected rows for the UK). - - Returns - ------- - df_combined : pd.DataFrame - Combined and processed data with a verified index. - - """ - # In the BEIS dataset, wind and solar are given as one joined variable. - # Check if we can ignore it (since it's better to have the two sources separately). - # Find the earliest year with data in the electricity mix for solar or wind generation. - solar_or_wind_first_year = df_elec[df_elec["wind_generation"].notnull() | df_elec["solar_generation"].notnull()][ - "year" - ].min() - # Now check that, prior to that year, all generation from solar and wind was zero. - assert df_beis[df_beis["year"] < solar_or_wind_first_year]["wind_and_solar_generation"].fillna(0).max() == 0 - # Therefore, since wind and solar are always zero (prior to the beginning of the electricity mix data) - # we can ignore this column from the BEIS dataset. - df_beis = df_beis.drop(columns=["wind_and_solar_generation"]) - # And create two columns of zeros for wind and solar. - df_beis["solar_generation"] = 0 - df_beis["wind_generation"] = 0 - # Similarly, given that in the BEIS dataset there is no data about other renewable sources (apart from hydro, solar - # and wind), we can assume that the contribution from other renewables is zero. - df_beis["other_renewables_generation"] = 0 - # And ensure these new columns do not have any values after the electricity mix data begins. - df_beis.loc[ - df_beis["year"] >= solar_or_wind_first_year, - ["solar_generation", "wind_generation", "other_renewables_generation"], - ] = np.nan - - # BEIS data on fuel input gives raw energy, but we want electricity generation (which is less, given the - # inefficiencies of the process of burning fossil fuels). - # They also include a variable on "implied efficiency", which they obtain by dividing the input energy by the total - # electricity generation. - # We multiply the raw energy by the efficiency to have an estimate of the electricity generated by each fossil fuel. - # This only affects data prior to the beginning of the electricity mix's data (which is 1965 for renewables and - # nuclear, and 1985 for the rest). - for source in ["coal", "oil", "gas"]: - df_beis[f"{source}_generation"] *= df_beis["implied_efficiency"] - - # Drop other unnecessary columns. - df_beis = df_beis.drop(columns=["implied_efficiency"]) - - # Combine BEIS and electricity mix data. - df_combined = dataframes.combine_two_overlapping_dataframes( - df1=df_elec, df2=df_beis, index_columns=["country", "year"] - ) - - # Add an index and sort conveniently.
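The merge above relies on `owid.datautils.dataframes.combine_two_overlapping_dataframes`, which (per the comments throughout these steps) gives priority to `df1` wherever the two frames overlap. Under that assumption, its behaviour is roughly pandas' `combine_first` after indexing, as in this sketch with made-up values:

```python
import pandas as pd

df_elec = pd.DataFrame({"country": ["UK"], "year": [1985], "coal_generation": [200.0]})
df_beis = pd.DataFrame(
    {"country": ["UK", "UK"], "year": [1984, 1985], "coal_generation": [190.0, 195.0]}
)

# Priority to df_elec on overlapping country-years; df_beis fills the rest.
combined = (
    df_elec.set_index(["country", "year"])
    .combine_first(df_beis.set_index(["country", "year"]))
    .reset_index()
)
print(combined)
# 1984 comes from BEIS (190.0); 1985 keeps the electricity-mix value (200.0).
```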
- df_combined = df_combined.set_index(["country", "year"]).sort_index().sort_index(axis=1) - - return cast(pd.DataFrame, df_combined) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read all required datasets. - ds_beis = catalog.Dataset(UK_BEIS_DATASET_PATH) - ds_elec = catalog.Dataset(ELECTRICITY_MIX_DATASET_PATH) - - # Gather all required tables from all datasets. - tb_beis = ds_beis[UK_BEIS_TABLE_NAME] - tb_elec = ds_elec[ELECTRICITY_MIX_TABLE_NAME] - - # Create convenient dataframes. - df_beis = pd.DataFrame(tb_beis).reset_index() - df_elec = pd.DataFrame(tb_elec).reset_index() - # - # Process data. - # - # Prepare electricity mix data. - df_elec = prepare_electricity_mix_data(df_elec=df_elec) - # Prepare BEIS data. - df_beis = prepare_beis_data(df_beis=df_beis) - - # Combine BEIS and electricity mix data. - df_combined = combine_beis_and_electricity_mix_data(df_beis=df_beis, df_elec=df_elec) - - # Create a new table with combined data (and no metadata). - tb_combined = catalog.Table(df_combined) - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = catalog.Dataset.create_empty(dest_dir) - - # Add table to dataset. - tb_combined.metadata.short_name = "uk_historical_electricity" - ds_garden.add(tb_combined) - - # Update dataset and table metadata using yaml file. - ds_garden.update_metadata(METADATA_FILE_PATH) - - # Save dataset. - ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2023-01-04/photovoltaic_cost_and_capacity.meta.yml b/etl/steps/archive/garden/energy/2023-01-04/photovoltaic_cost_and_capacity.meta.yml deleted file mode 100644 index 2b1a86bc023..00000000000 --- a/etl/steps/archive/garden/energy/2023-01-04/photovoltaic_cost_and_capacity.meta.yml +++ /dev/null @@ -1,90 +0,0 @@ -all_sources: -- nemet_2009: &source-nemet_2009 - name: G. G. Nemet (2009) - published_by: | - Interim monitoring of cost dynamics for publicly supported energy technologies. Energy Policy 37(3): 825-835. by Nemet, G. F. (2009). - url: https://www.sciencedirect.com/science/article/abs/pii/S0301421508005910 - date_accessed: '2023-01-04' - publication_date: '2009-03-01' - publication_year: 2009 - description: | - Photovoltaic cost and capacity data between 1975 and 2003 has been taken from Nemet (2009). - - Prices from Nemet (2009) have been converted to 2021 US$ using the US GDP deflator: https://www.multpl.com/gdp-deflator/table/by-year -- farmer_lafond_2016: &source-farmer_lafond_2016 - name: J. D. Farmer & F. Lafond (2016) - published_by: | - How predictable is technological progress? J. D. Farmer & F. Lafond, Research Policy Volume 45, Issue 3, April 2016, Pages 647-665. - url: https://www.sciencedirect.com/science/article/pii/S0048733315001699 - date_accessed: '2023-01-04' - publication_date: '2016-04-01' - publication_year: 2016 - description: | - Photovoltaic cost data between 2004 and 2009 has been taken from Farmer & Lafond (2016). - - According to Farmer & Lafond (2016), the data are mostly taken from the Santa-Fe [Performance Curve Database](https://pcdb.santafe.edu/). The database has been constructed from personal communications and from [Colpier and Cornland (2002)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0095), [Goldemberg et al. 
(2004)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0130), [Lieberman (1984)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0180), [Lipman and Sperling (1999)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0190), [Zhao (1999)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0310), [McDonald and Schrattenholzer (2001)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0205), [Neij et al. (2003)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0235), [Moore (2006)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0215), [Nemet (2006)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0240), [Schilling and Esmundo (2009)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0265). The data on photovoltaic prices has been collected from public releases of Strategies Unlimited, Navigant and SPV Market Research. The data on nuclear energy is from [Koomey and Hultman (2007)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0165) and [Cooper (2009)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0100). The DNA sequencing data is from [Wetterstrand (2015)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0290) (cost per human-size genome), and for each year the last available month (September for 2001-2002 and October afterwards) was taken and corrected for inflation using the US GDP deflator. - - Prices from Farmer & Lafond (2016) have been converted to 2021 US$ using the US GDP deflator: https://www.multpl.com/gdp-deflator/table/by-year -- irena_capacity: &source-irena_capacity - name: International Renewable Energy Agency (IRENA) - published_by: "© 2022 by International Renewable Energy Agency (IRENA)" - url: https://www.irena.org/Statistics/Download-query-tools - date_accessed: '2022-10-20' - publication_date: '2022-07-01' - publication_year: 2022 - description: | - Photovoltaic capacity data between 2004 and 2021 has been taken from IRENA. -- irena_costs: &source-irena_costs - name: International Renewable Energy Agency (IRENA) - published_by: "International Renewable Energy Agency (IRENA) \xA9 2022 by IRENA" - url: https://irena.org/publications/2022/Jul/Renewable-Power-Generation-Costs-in-2021 - date_accessed: '2022-10-20' - publication_year: 2022 - description: | - Photovoltaic cost data between 2010 and 2021 has been taken from IRENA. -dataset: - namespace: energy - short_name: photovoltaic_cost_and_capacity - title: Solar photovoltaic cost and capacity (Energy, 2023) - description: | - Prices from Nemet (2009) and from Farmer & LaFond (2016) have been - converted to 2021 US$ using the US GDP deflator: https://www.multpl.com/gdp-deflator/table/by-year - version: '2023-01-04' - sources: - - *source-nemet_2009 - - *source-farmer_lafond_2016 - - *source-irena_capacity - - *source-irena_costs - -tables: - photovoltaic_cost_and_capacity: - variables: - cost: - title: Solar photovoltaic module price - short_unit: $/W - unit: 2021 US$ per Watt - description: | - Global average price of solar photovoltaic modules. - - IRENA presents solar PV module price series for a number of different module technologies. Here we have adopted the series for thin film a-Si/u-Si or Global Index (from Q4 2013). 
- sources: - - *source-nemet_2009 - - *source-farmer_lafond_2016 - - *source-irena_costs - cost_source: - title: Data source for cost data - unit: '' - description: Source for each value of cost data. - cumulative_capacity: - title: Solar photovoltaic cumulative capacity - description: | - Global cumulative capacity of solar photovoltaics. - short_unit: MW - unit: megawatts - sources: - - *source-nemet_2009 - - *source-irena_capacity - cumulative_capacity_source: - title: Data source for cumulative capacity data - unit: '' - description: Source for each value of cumulative capacity data. diff --git a/etl/steps/archive/garden/energy/2023-01-04/photovoltaic_cost_and_capacity.py b/etl/steps/archive/garden/energy/2023-01-04/photovoltaic_cost_and_capacity.py deleted file mode 100644 index 4495bf5e8e2..00000000000 --- a/etl/steps/archive/garden/energy/2023-01-04/photovoltaic_cost_and_capacity.py +++ /dev/null @@ -1,150 +0,0 @@ -"""Combine data from Nemet (2009), Farmer & Lafond (2016) and IRENA on photovoltaic cost and capacity. - -Data content: -* Nemet (2009) provides cumulative capacity data between 1975 and 2003. -* Nemet (2009) provides cost data between 1975 and 2003. -* IRENA provides cumulative capacity data between 2000 and 2021. -* IRENA provides cost data between 2010 and 2021. -* Farmer & Lafond (2016) provide cost data between 1980 and 2013. - -For each year with data, we combine these sources under the following two constraints: -* Use data from the most recent source. -* Avoid (as much as possible) having cost and capacity data for a given year come from different sources. - -Therefore, for capacity data, we use Nemet (2009) between 1975 and 2003, and IRENA between 2004 and 2021. -For cost data, we use Nemet (2009) between 1975 and 2003, Farmer & Lafond (2016) between 2004 and 2009, and IRENA between 2010 and 2021. - -""" - -import pandas as pd -from owid import catalog -from owid.datautils.dataframes import combine_two_overlapping_dataframes - -from etl.helpers import PathFinder - -# Get paths and naming conventions for current data step. -paths = PathFinder(__file__) - -# Conversion factors. -# Convert 2004 USD to 2021 USD. -USD2004_TO_USD2021 = 1.42 - # Convert 2013 USD to 2021 USD. -USD2013_TO_USD2021 = 1.19 - - -def prepare_capacity_data(tb_nemet: catalog.Table, tb_irena_capacity: catalog.Table) -> catalog.Table: - # Column "previous_capacity" is equivalent to tb_nemet["yearly_capacity"].shift(1).cumsum() - # As they explain in the paper, "Following Epple et al. (1991), cumulative capacity is lagged one year to account - # for the time it takes to incorporate new techniques obtained as a result of learning from experience." - tb_nemet_capacity = tb_nemet[["year", "cost", "previous_capacity"]].rename( - columns={"previous_capacity": "cumulative_capacity"}, errors="raise" - )[["year", "cumulative_capacity"]] - # Add column of origin of the data. - tb_nemet_capacity["cumulative_capacity_source"] = "Nemet (2009)" - - # I haven't found a precise definition of the variables in IRENA's dataset, but I expect this to be - # cumulative capacity. - tb_irena_capacity = ( - tb_irena_capacity[tb_irena_capacity["country"] == "World"][["year", "solar_photovoltaic"]] - .rename(columns={"solar_photovoltaic": "cumulative_capacity"}, errors="raise") - .reset_index(drop=True) - ) - tb_irena_capacity["cumulative_capacity_source"] = "IRENA" - - # Combine cumulative capacity from Nemet (2009) and IRENA, prioritizing the former on overlapping years.
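The combination that follows also carries a provenance column, so each year records which source its value came from. Assuming `combine_two_overlapping_dataframes` prioritizes `df1` (as the comment states), the pattern reduces to something like this sketch with illustrative numbers:

```python
import pandas as pd

nemet = pd.DataFrame({"year": [2002, 2003], "cumulative_capacity": [1.8, 2.2]}).set_index("year")
nemet["cumulative_capacity_source"] = "Nemet (2009)"

irena = pd.DataFrame({"year": [2003, 2004], "cumulative_capacity": [2.3, 3.7]}).set_index("year")
irena["cumulative_capacity_source"] = "IRENA"

# Priority to Nemet (2009) on overlapping years; the source column tracks provenance per year.
combined = nemet.combine_first(irena).sort_index().reset_index()
print(combined)
# 2002 and 2003 keep Nemet's values and label; 2004 comes from IRENA.
```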
- cumulative_capacity = ( - combine_two_overlapping_dataframes(df1=tb_nemet_capacity, df2=tb_irena_capacity, index_columns=["year"]) - .astype({"year": int}) - .sort_values("year") - .reset_index(drop=True) - ) - - return cumulative_capacity - - -def prepare_cost_data( - tb_nemet: catalog.Table, tb_irena_cost: catalog.Table, tb_farmer_lafond: catalog.Table -) -> catalog.Table: - # Prepare solar photovoltaic cost data from Nemet (2009). - tb_nemet_cost = tb_nemet[["year", "cost"]].copy() - tb_nemet_cost["cost_source"] = "Nemet (2009)" - # Costs are given in "2004 USD/Watt", so we need to convert them to 2021 USD. - tb_nemet_cost["cost"] *= USD2004_TO_USD2021 - - # Prepare solar photovoltaic cost data from Farmer & Lafond (2016). - tb_farmer_lafond = ( - tb_farmer_lafond[["year", "photovoltaics"]] - .dropna() - .reset_index(drop=True) - .rename(columns={"photovoltaics": "cost"}, errors="raise") - ) - tb_farmer_lafond["cost_source"] = "Farmer & Lafond (2016)" - # Costs are given in "2013 USD/Wp", so we need to convert them to 2021 USD. - tb_farmer_lafond["cost"] *= USD2013_TO_USD2021 - - # Prepare solar photovoltaic cost data from IRENA. - tb_irena_cost = tb_irena_cost.drop(columns="country") - - tb_irena_cost["cost_source"] = "IRENA" - # Costs are given in "2021 USD/W", so we do not need to correct them. - - # Combine Nemet (2009) and Farmer & Lafond (2016), prioritizing the former. - combined = combine_two_overlapping_dataframes(df1=tb_nemet_cost, df2=tb_farmer_lafond, index_columns="year") - - # Combine the previous with IRENA, prioritizing the latter. - combined = combine_two_overlapping_dataframes(df1=tb_irena_cost, df2=combined, index_columns="year") - - return combined - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load Nemet (2009) dataset from Garden. - ds_nemet: catalog.Dataset = paths.load_dependency("nemet_2009") - tb_nemet = ds_nemet["nemet_2009"].reset_index() - - # Load Farmer & Lafond (2016) dataset from Garden. - ds_farmer_lafond: catalog.Dataset = paths.load_dependency("farmer_lafond_2016") - tb_farmer_lafond = ds_farmer_lafond["farmer_lafond_2016"].reset_index() - - # Load IRENA dataset on capacity from Garden. - ds_irena_capacity: catalog.Dataset = paths.load_dependency("renewable_electricity_capacity") - tb_irena_capacity = ds_irena_capacity["renewable_electricity_capacity"].reset_index() - - # Load IRENA dataset on cost from Garden. - ds_irena_cost: catalog.Dataset = paths.load_dependency("renewable_power_generation_costs") - tb_irena_cost = ds_irena_cost["solar_photovoltaic_module_prices"] - - # - # Process data. - # - # Create a table of cumulative solar photovoltaic capacity, by combining Nemet (2009) and IRENA data. - cumulative_capacity = prepare_capacity_data(tb_nemet=tb_nemet, tb_irena_capacity=tb_irena_capacity) - - # Create a table of solar photovoltaic cost, by combining Nemet (2009), Farmer & Lafond (2016) and IRENA data. - cost = prepare_cost_data(tb_nemet=tb_nemet, tb_irena_cost=tb_irena_cost, tb_farmer_lafond=tb_farmer_lafond) - - # Combine capacity and cost data. - tb_combined = pd.merge(cost, cumulative_capacity, on="year", how="outer") - - # Add column for region. - tb_combined = tb_combined.assign(**{"country": "World"}) - - # Set an appropriate index and sort conveniently. - tb_combined = tb_combined.set_index(["country", "year"], verify_integrity=True).sort_index() - - # - # Save outputs. 
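The cost conversions applied in `prepare_cost_data` are simple deflator arithmetic: a factor like `USD2004_TO_USD2021 = 1.42` is the ratio of the US GDP deflator in the two years. A sketch of how such a factor could be derived and applied; the deflator index values below are illustrative, not the exact figures behind the constants above:

```python
# Illustrative US GDP deflator index values (arbitrary base year); not the exact series used above.
DEFLATOR_2004 = 82.0
DEFLATOR_2021 = 116.6

usd2004_to_usd2021 = DEFLATOR_2021 / DEFLATOR_2004  # ~1.42

cost_2004_usd = 3.50  # hypothetical module price in 2004 US$/W
cost_2021_usd = cost_2004_usd * usd2004_to_usd2021
print(f"{cost_2021_usd:.2f} 2021 US$/W")
```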
- # - # Create a new dataset with the same metadata as meadow - ds_garden = catalog.Dataset.create_empty(dest_dir) - - # Add combined table of cost and capacity to dataset. - tb_combined.metadata.short_name = paths.short_name - ds_garden.add(tb_combined) - - # Update dataset metadata and save dataset. - ds_garden.update_metadata(paths.metadata_path) - ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2023-02-20/electricity_mix.meta.yml b/etl/steps/archive/garden/energy/2023-02-20/electricity_mix.meta.yml deleted file mode 100644 index 6f87b79675d..00000000000 --- a/etl/steps/archive/garden/energy/2023-02-20/electricity_mix.meta.yml +++ /dev/null @@ -1,412 +0,0 @@ -dataset: - namespace: energy - version: 2023-02-20 - title: Electricity mix (BP & Ember, 2023) - short_name: electricity_mix - description: | - Data is compiled by Our World in Data based on three main sources: - - [BP Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html). - - [Ember Yearly Electricity Data (2023)](https://ember-climate.org/data-catalogue/yearly-electricity-data/). - - [Ember European Electricity Review (2022)](https://ember-climate.org/insights/research/european-electricity-review-2022/). - - Ember compile their global dataset from various sources including: - - Eurostat: Annual European generation and import data, and monthly data in some cases where better sources are not available. - - ENTSO-E: Monthly European generation and import data. - - EIA: Annual global generation and import data. - - UN: Monthly global generation data in some cases. - - GEM: Annual global coal and gas capacity data. - - IRENA: Annual global capacity data for all non-fossil fuel types, and for Other Fossil where available. - - WRI: Annual global capacity data for Other Fossil where other sources are not available. - - European carbon intensities rely on data from the European Environment Agency (EEA). - - A complete list of data sources for each individual country in Ember's Yearly Electricity Data can be found [here](https://ember-climate.org/app/uploads/2022/07/Ember-Electricity-Data-Methodology.pdf). - - A complete list of data sources for each individual country in Ember's European Electricity Review can be found [here](https://ember-climate.org/app/uploads/2022/02/EER-Methodology.pdf). - - We rely on Ember as the primary source of electricity consumption data. While BP provides primary energy (not just electricity) consumption data and it provides a longer time-series (dating back to 1965) than Ember (which only dates back to 1990), BP does not provide data for all countries or for all sources of electricity (for example, only Ember provides data on electricity from bioenergy). So, where data from Ember is available for a given country and year, we rely on it as the primary source. We then supplement this with data from BP where data from Ember is not available. - - Our World in Data has converted absolute electricity production by source to the share in the mix by dividing each by total electricity production. - - BP's region definitions sometimes differ from Our World in Data's definitions. For example, BP's North America includes only Canada, Mexico and United States, whereas Our World in Data's North America includes countries in Central America (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). 
For this reason, we include in the dataset regions like "North America (BP)" to refer to BP's original data using their definition of the region, as well as "North America", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the contributions from the countries in the region. - - [BP's region definitions](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy/using-the-review/definitions-and-explanatory-notes.html#accordion_Regional%20definitions), denoted with "(BP)", are: - * "Asia Pacific (BP)": Brunei, Cambodia, China (Mainland), China Hong Kong SAR (Special Administrative Region), China Macau SAR (Special Administrative Region), Indonesia, Japan, Laos, Malaysia, Mongolia, North Korea, Philippines, Singapore, South Asia (Afghanistan, Bangladesh, India, Myanmar, Nepal, Pakistan and Sri Lanka), South Korea, Taiwan, Thailand, Vietnam, Australia, New Zealand, Papua New Guinea and Oceania. - * "Australasia (BP)": Australia, New Zealand. - * "CIS (BP)" - Commonwealth of Independent States: Armenia, Azerbaijan, Belarus, Kazakhstan, Kyrgyzstan, Moldova, Russian Federation, Tajikistan, Turkmenistan, Uzbekistan. - * "Caribbean (BP)": Atlantic islands between the US Gulf Coast and South America, including Puerto Rico, US Virgin Islands and Bermuda. - * "Central America (BP)": Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama - * "Eastern Africa (BP)": Territories on the east coast of Africa from Sudan to Mozambique. Also Madagascar, Malawi, Uganda, Zambia, Zimbabwe. - * "Europe (BP)": European members of the OECD plus Albania, Bosnia-Herzegovina, Bulgaria, Croatia, Cyprus, Georgia, Gibraltar, Latvia, Lithuania, Malta, Montenegro, North Macedonia, Romania, Serbia and Ukraine. - * "Middle Africa (BP)": Angola, Cameroon, Central African Republic, Chad, Democratic Republic of Congo, Republic of Congo, Equatorial Guinea, Gabon, Sao Tome & Principe. - * "Middle East (BP)": Arabian Peninsula, Iran, Iraq, Israel, Jordan, Lebanon, Syria. - * "Non-OECD (BP)" - Organization for Economic Co-operation and Development: All countries that are not members of the OECD. - * "North America (BP)": US (excluding US territories), Canada, Mexico - * "Northern Africa (BP)": Territories on the north coast of Africa from Egypt to Western Sahara. - * "OECD (BP)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, UK, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, US. - * "OPEC (BP)" - Organization of the Petroleum Exporting Countries: Iran, Iraq, Kuwait, Saudi Arabia, United Arab Emirates, Algeria, Libya, Angola, Equatorial Guinea, Gabon, Nigeria, Republic of Congo, Venezuela. - * "South and Central America (BP)": Caribbean (including Puerto Rico and US Virgin Islands), Bermuda, Central and South America. - * "Southern Africa (BP)": Botswana, Lesotho, Namibia, South Africa, Swaziland. - * "Western Africa (BP)": Territories on the west coast of Africa from Mauritania to Nigeria, including Burkina Faso, Cape Verde, Mali and Niger. - - Additionally, BP includes some regions that are not explicitly defined (e.g. "Other Europe", or "Other CIS"). 
We define our regions in the following way: - * "Africa" - All African countries + "Other Africa". - * "Asia" - All Asian countries + "Other Middle East" + "Other CIS" + "Other Asia Pacific". - * "Europe" - All European countries + "Other Europe". - * "North America" - All North American countries + "Other Caribbean" + "Other North America". - * "Oceania" - All Oceanian countries. - * "South America" - All South American countries + "Other South America". - Where the individual countries in each region are defined [in this map](https://ourworldindata.org/world-region-map-definitions). Additional BP regions are ignored, since they belong to other regions already included (e.g. the data for "Other Western Africa" is included in "Other Africa"). Finally, income groups are constructed following the definitions [in this map](https://ourworldindata.org/grapher/world-banks-income-groups). - - [Ember's region definitions](https://ember-climate.org/countries-and-regions/), denoted with "(Ember)", are: - * "G20 (Ember)" - Group of Twenty: Argentina, Australia, Brazil, Canada, China, France, Germany, India, Indonesia, Italy, Japan, Mexico, Russia, Saudi Arabia, South Africa, South Korea, Turkey, United Kingdom, United States and the 27 members of the European Union. - * "G7 (Ember)" - Group of Seven: Canada, France, Germany, Italy, Japan, United Kingdom and United States. - * "Latin America and Caribbean (Ember)": Antigua and Barbuda, Argentina, Bahamas, Barbados, Belize, Bolivia, Brazil, Chile, Colombia, Costa Rica, Cuba, Dominica, Dominican Republic, Ecuador, El Salvador, Grenada, Guatemala, Guyana, Haiti, Honduras, Jamaica, Mexico, Nicaragua, Panama, Paraguay, Peru, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Suriname, Trinidad and Tobago, Uruguay, Venezuela, Aruba, British Virgin Islands, Cayman Islands, Falkland Islands, French Guiana, Guadeloupe, Martinique, Montserrat, Puerto Rico, Turks and Caicos Islands and United States Virgin Islands. - * "Middle East (Ember)": Bahrain, Iran, Iraq, Israel, Jordan, Kuwait, Lebanon, Oman, Palestine, Qatar, Saudi Arabia, Syria, United Arab Emirates and Yemen. - * "OECD (Ember)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, United Kingdom, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, and United States. 
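Aggregates like "North America" (as opposed to "North America (BP)") are described above as built by adding up, when possible, the contributions from the countries in the region. A minimal sketch of that aggregation, with a hypothetical member list and made-up values:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "country": ["Canada", "United States"],
        "year": [2020, 2020],
        "coal_generation__twh": [5.0, 774.0],
    }
)

members = ["Canada", "United States", "Mexico"]  # hypothetical member list
aggregate = (
    df[df["country"].isin(members)]
    .groupby("year", as_index=False)
    .sum(numeric_only=True)
    .assign(country="North America")
)
print(aggregate)  # year=2020, coal_generation__twh=779.0, country="North America"
```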
- sources: - - name: Our World in Data based on BP Statistical Review of World Energy (2022) - published_by: BP Statistical Review of World Energy - publication_year: 2022 - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - - name: Our World in Data based on Ember's Yearly Electricity Data (2023) - published_by: Ember - publication_year: 2023 - publication_date: 2023-01-31 - date_accessed: 2023-02-20 - url: https://ember-climate.org/data-catalogue/yearly-electricity-data/ - - name: Our World in Data based on Ember's European Electricity Review (2022) - published_by: Ember - publication_year: 2022 - publication_date: 2022-02-01 - date_accessed: 2022-08-01 - url: https://ember-climate.org/insights/research/european-electricity-review-2022/ -tables: - electricity_mix: - variables: - bioenergy_generation__twh: - title: Electricity from bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Bioenergy - bioenergy_share_of_electricity__pct: - title: Bioenergy (% electricity) - short_unit: '%' - unit: '%' - display: - name: Bioenergy - co2_intensity__gco2_kwh: - title: Carbon intensity of electricity (gCO2/kWh) - short_unit: gCO₂ - unit: grams of CO₂ equivalent per kilowatt-hour - display: - name: Carbon intensity of electricity per kilowatt-hour - coal_generation__twh: - title: Electricity from coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - coal_share_of_electricity__pct: - title: Coal (% electricity) - short_unit: '%' - unit: '%' - display: - name: Coal - fossil_generation__twh: - title: Electricity from fossil fuels (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Fossil fuels - fossil_share_of_electricity__pct: - title: Fossil fuels (% electricity) - short_unit: '%' - unit: '%' - display: - name: Fossil fuels - gas_generation__twh: - title: Electricity from gas (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas - gas_share_of_electricity__pct: - title: Gas (% electricity) - short_unit: '%' - unit: '%' - display: - name: Gas - hydro_generation__twh: - title: Electricity from hydro (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydropower - hydro_share_of_electricity__pct: - title: Hydro (% electricity) - short_unit: '%' - unit: '%' - display: - name: Hydropower - low_carbon_generation__twh: - title: Low-carbon electricity (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Low-carbon electricity - low_carbon_share_of_electricity__pct: - title: Low-carbon electricity (% electricity) - short_unit: '%' - unit: '%' - display: - name: Share of electricity from low-carbon sources - net_imports_share_of_demand__pct: - title: Net electricity imports as a share of demand (%) - short_unit: '%' - unit: '%' - display: - name: Net electricity imports as a share of demand - nuclear_generation__twh: - title: Electricity from nuclear (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - nuclear_share_of_electricity__pct: - title: Nuclear (% electricity) - short_unit: '%' - unit: '%' - display: - name: Nuclear - oil_generation__twh: - title: Electricity from oil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Oil - oil_share_of_electricity__pct: - title: Oil (% electricity) - short_unit: '%' - unit: '%' - display: - name: Oil - other_renewables_excluding_bioenergy_generation__twh: - title: Other renewables excluding bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - 
display: - name: Other renewables, excluding bioenergy - other_renewables_excluding_bioenergy_share_of_electricity__pct: - title: Other renewables excluding bioenergy (% electricity) - short_unit: '%' - unit: '%' - display: - name: Other renewables, excluding bioenergy - other_renewables_including_bioenergy_generation__twh: - title: Other renewables including bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables, including bioenergy - other_renewables_including_bioenergy_share_of_electricity__pct: - title: Other renewables including bioenergy (% electricity) - short_unit: '%' - unit: '%' - display: - name: Other renewables, including bioenergy - per_capita_bioenergy_generation__kwh: - title: Bioenergy electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Bioenergy electricity per capita - numDecimalPlaces: 0 - per_capita_coal_generation__kwh: - title: Coal electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Coal electricity per capita - numDecimalPlaces: 0 - per_capita_fossil_generation__kwh: - title: Fossil fuel electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Fossil fuel electricity per capita - numDecimalPlaces: 0 - per_capita_gas_generation__kwh: - title: Gas electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Gas electricity per capita - numDecimalPlaces: 0 - per_capita_hydro_generation__kwh: - title: Hydro electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Hydro electricity per capita - numDecimalPlaces: 0 - per_capita_low_carbon_generation__kwh: - title: Low-carbon electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Low-carbon electricity per capita - numDecimalPlaces: 0 - per_capita_nuclear_generation__kwh: - title: Nuclear electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Nuclear electricity per capita - numDecimalPlaces: 0 - per_capita_oil_generation__kwh: - title: Oil electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Oil electricity per capita - numDecimalPlaces: 0 - per_capita_other_renewables_excluding_bioenergy_generation__kwh: - title: Other renewable electricity excluding bioenergy per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Other renewable electricity excluding bioenergy per capita - numDecimalPlaces: 0 - per_capita_other_renewables_including_bioenergy_generation__kwh: - title: Other renewable electricity including bioenergy per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Other renewable electricity including bioenergy per capita - numDecimalPlaces: 0 - per_capita_renewable_generation__kwh: - title: Renewable electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Renewable electricity per capita - numDecimalPlaces: 0 - per_capita_solar_generation__kwh: - title: Solar electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Solar electricity per capita - numDecimalPlaces: 0 - per_capita_solar_and_wind_generation__kwh: - title: Solar and wind electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Solar and wind electricity per capita - numDecimalPlaces: 0 - per_capita_total_generation__kwh: - title: Per capita electricity (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - 
 name: Per capita electricity use - numDecimalPlaces: 0 - per_capita_wind_generation__kwh: - title: Wind electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Wind electricity per capita - numDecimalPlaces: 0 - population: - title: Population - short_unit: people - unit: people - display: - name: Population - primary_energy_consumption__twh: - title: Primary energy consumption (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Primary energy consumption - renewable_generation__twh: - title: Electricity from renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Renewables - renewable_share_of_electricity__pct: - title: Renewables (% electricity) - short_unit: '%' - unit: '%' - display: - name: Renewables - numDecimalPlaces: 2 - solar_generation__twh: - title: Electricity from solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - solar_and_wind_generation__twh: - title: Electricity from solar and wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar and wind - solar_share_of_electricity__pct: - title: Solar (% electricity) - short_unit: '%' - unit: '%' - display: - name: Solar - solar_and_wind_share_of_electricity__pct: - title: Solar and wind (% electricity) - short_unit: '%' - unit: '%' - display: - name: Solar and wind - total_demand__twh: - title: Electricity demand (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Electricity demand - total_electricity_share_of_primary_energy__pct: - title: Electricity as share of primary energy (%) - short_unit: '%' - unit: '%' - display: - name: Electricity as share of primary energy - total_emissions__mtco2: - title: Emissions (MtCO2) - short_unit: million t - unit: million tonnes CO2 equivalent - display: - name: Emissions - total_generation__twh: - title: Electricity generation (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Electricity generation - total_net_imports__twh: - title: Net imports (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Net imports - wind_generation__twh: - title: Electricity from wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - wind_share_of_electricity__pct: - title: Wind (% electricity) - short_unit: '%' - unit: '%' - display: - name: Wind diff --git a/etl/steps/archive/garden/energy/2023-02-20/electricity_mix.py b/etl/steps/archive/garden/energy/2023-02-20/electricity_mix.py deleted file mode 100644 index ee6fca8e12f..00000000000 --- a/etl/steps/archive/garden/energy/2023-02-20/electricity_mix.py +++ /dev/null @@ -1,309 +0,0 @@ -"""Garden step that combines BP's statistical review with Ember's combined electricity data (combination of the European -Electricity Review and the Yearly Electricity Data) to create the Electricity Mix (BP & Ember) dataset. - -""" - -from typing import Dict, List - -import pandas as pd -from owid import catalog -from owid.datautils.dataframes import combine_two_overlapping_dataframes -from shared import add_population - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 -# Megatonnes to grams. -MT_TO_G = 1e12 - - -def process_bp_data(table_bp: catalog.Table) -> pd.DataFrame: - """Load necessary columns from BP's Statistical Review dataset, and create some new variables (e.g.
electricity - generation from fossil fuels). - - Parameters - ---------- - table_bp : catalog.Table - BP's Statistical Review (already processed, with harmonized countries and region aggregates). - - Returns - ------- - df_bp : pd.DataFrame - Processed BP data. - - """ - # Columns to load from BP dataset. - columns = { - "electricity_generation": "total_generation__twh", - "primary_energy_consumption__twh": "primary_energy_consumption__twh", - "hydro_generation__twh": "hydro_generation__twh", - "nuclear_generation__twh": "nuclear_generation__twh", - "solar_generation__twh": "solar_generation__twh", - "wind_generation__twh": "wind_generation__twh", - "geo_biomass_other__twh": "other_renewables_including_bioenergy_generation__twh", - "elec_gen_from_oil": "oil_generation__twh", - "elec_gen_from_coal": "coal_generation__twh", - "elec_gen_from_gas": "gas_generation__twh", - } - table_bp = table_bp[list(columns)].rename(columns=columns, errors="raise") - # New columns to be created by summing other columns. - aggregates: Dict[str, List[str]] = { - "fossil_generation__twh": [ - "oil_generation__twh", - "coal_generation__twh", - "gas_generation__twh", - ], - "renewable_generation__twh": [ - "hydro_generation__twh", - "solar_generation__twh", - "wind_generation__twh", - "other_renewables_including_bioenergy_generation__twh", - ], - "low_carbon_generation__twh": [ - "renewable_generation__twh", - "nuclear_generation__twh", - ], - "solar_and_wind_generation__twh": [ - "solar_generation__twh", - "wind_generation__twh", - ], - } - - # Create a dataframe with a dummy index. - df_bp = pd.DataFrame(table_bp).reset_index() - - # Create new columns, by adding up other columns (and allowing for only one nan in each sum). - for new_column in aggregates: - df_bp[new_column] = df_bp[aggregates[new_column]].sum(axis=1, min_count=len(aggregates[new_column]) - 1) - - return df_bp - - -def process_ember_data(table_ember: catalog.Table) -> pd.DataFrame: - """Load necessary columns from the Combined Electricity dataset and prepare a dataframe with the required variables. - - Parameters - ---------- - table_ember : catalog.Table - Combined Electricity (combination of Ember's Yearly Electricity Data and European Electricity Review). - - Returns - ------- - df_ember : pd.DataFrame - Processed Combined Electricity data. - - """ - # Columns to load from Ember dataset. - columns = { - "generation__bioenergy__twh": "bioenergy_generation__twh", - "generation__gas__twh": "gas_generation__twh", - "generation__coal__twh": "coal_generation__twh", - "generation__other_fossil__twh": "oil_generation__twh", - "generation__renewables__twh": "renewable_generation__twh", - "generation__other_renewables__twh": "other_renewables_excluding_bioenergy_generation__twh", - "generation__clean__twh": "low_carbon_generation__twh", - "generation__hydro__twh": "hydro_generation__twh", - "generation__nuclear__twh": "nuclear_generation__twh", - "generation__solar__twh": "solar_generation__twh", - "generation__wind__twh": "wind_generation__twh", - "generation__fossil__twh": "fossil_generation__twh", - "generation__total_generation__twh": "total_generation__twh", - "demand__total_demand__twh": "total_demand__twh", - "emissions__total_emissions__mtco2": "total_emissions__mtco2", - "emissions__co2_intensity__gco2_kwh": "co2_intensity__gco2_kwh", - "imports__total_net_imports__twh": "total_net_imports__twh", - } - table_ember = table_ember[list(columns)].rename(columns=columns, errors="raise") - - # Create a dataframe with a dummy index. 
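`process_bp_data` above sums source columns with `min_count=len(cols) - 1`, i.e. a row's aggregate is computed as long as at most one input is missing, and becomes NaN otherwise. In isolation:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "oil_generation__twh": [1.0, 1.0, np.nan],
        "coal_generation__twh": [2.0, np.nan, np.nan],
        "gas_generation__twh": [3.0, 3.0, 3.0],
    }
)

cols = ["oil_generation__twh", "coal_generation__twh", "gas_generation__twh"]
# Allow at most one missing input per row; rows with two or more NaNs stay NaN.
df["fossil_generation__twh"] = df[cols].sum(axis=1, min_count=len(cols) - 1)
print(df["fossil_generation__twh"].tolist())  # [6.0, 4.0, nan]
```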
- df_ember = pd.DataFrame(table_ember).reset_index() - - # In BP data, there is a variable "Geo Biomass Other", which combines all other renewables. - # In Ember data, "other renewables" excludes bioenergy. - # To be able to combine both datasets, create a new variable for generation of other renewables including bioenergy. - df_ember["other_renewables_including_bioenergy_generation__twh"] = ( - df_ember["other_renewables_excluding_bioenergy_generation__twh"] + df_ember["bioenergy_generation__twh"] - ) - - # Create a new variable for solar and wind generation. - df_ember["solar_and_wind_generation__twh"] = df_ember["solar_generation__twh"] + df_ember["wind_generation__twh"] - - return df_ember - - -def add_per_capita_variables(combined: pd.DataFrame, population: pd.DataFrame) -> pd.DataFrame: - """Add per capita variables (in kWh per person) to the combined BP and Ember dataframe. - - The list of variables to make per capita is given in this function. The new variable names will be 'per_capita_' - followed by the original variable's name. - - Parameters - ---------- - combined : pd.DataFrame - Combination of BP's Statistical Review and Ember's Combined Electricity. - population: pd.DataFrame - Population data. - - Returns - ------- - combined : pd.DataFrame - Input dataframe after adding per capita variables. - - """ - combined = combined.copy() - - # Variables to make per capita. - per_capita_variables = [ - "bioenergy_generation__twh", - "coal_generation__twh", - "fossil_generation__twh", - "gas_generation__twh", - "hydro_generation__twh", - "low_carbon_generation__twh", - "nuclear_generation__twh", - "oil_generation__twh", - "other_renewables_excluding_bioenergy_generation__twh", - "other_renewables_including_bioenergy_generation__twh", - "renewable_generation__twh", - "solar_generation__twh", - "total_generation__twh", - "wind_generation__twh", - "solar_and_wind_generation__twh", - ] - # Add a column for population (only for harmonized countries). - combined = add_population(df=combined, population=population, warn_on_missing_countries=False) - - for variable in per_capita_variables: - assert "twh" in variable, f"Variables are assumed to be in TWh, but {variable} is not." - new_column = "per_capita_" + variable.replace("__twh", "__kwh") - combined[new_column] = combined[variable] * TWH_TO_KWH / combined["population"] - - return combined - - -def add_share_variables(combined: pd.DataFrame) -> pd.DataFrame: - """Add variables for the electricity generation as a share of the total electricity generation (as a percentage). - - The following new variables will be created: - * For each source (e.g. coal_generation__twh) in a list given in this function, a new variable will be created - (named, e.g. coal_share_of_electricity__pct). - * Total electricity generation as a share of primary energy consumption. - * Total net electricity imports as a share of total electricity demand. - - Parameters - ---------- - combined : pd.DataFrame - Combination of BP's Statistical Review and Ember's Combined Electricity. - - Returns - ------- - combined : pd.DataFrame - Input dataframe after adding share variables. - - """ - # Variables to make as share of electricity (new variable names will be the name of the original variable followed - # by '_share_of_electricity__pct').
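The per capita step above is pure unit arithmetic: 1 TWh equals 1e9 kWh, so kilowatt-hours per person are TWh * 1e9 / population. A standalone illustration with invented numbers (the real step attaches population via the shared add_population helper):

import pandas as pd

TWH_TO_KWH = 1e9  # 1 TWh = 1e9 kWh

df = pd.DataFrame(
    {
        "country": ["A", "B"],
        "solar_generation__twh": [10.0, 2.5],
        "population": [50_000_000, 5_000_000],
    }
)
df["per_capita_solar_generation__kwh"] = df["solar_generation__twh"] * TWH_TO_KWH / df["population"]
print(df["per_capita_solar_generation__kwh"].tolist())  # [200.0, 500.0]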
- share_variables = [ - "bioenergy_generation__twh", - "coal_generation__twh", - "fossil_generation__twh", - "gas_generation__twh", - "hydro_generation__twh", - "low_carbon_generation__twh", - "nuclear_generation__twh", - "oil_generation__twh", - "other_renewables_excluding_bioenergy_generation__twh", - "other_renewables_including_bioenergy_generation__twh", - "renewable_generation__twh", - "solar_generation__twh", - "total_generation__twh", - "wind_generation__twh", - "solar_and_wind_generation__twh", - ] - for variable in share_variables: - new_column = variable.replace("_generation__twh", "_share_of_electricity__pct") - combined[new_column] = 100 * combined[variable] / combined["total_generation__twh"] - - # Calculate the percentage of electricity as a share of primary energy. - combined["total_electricity_share_of_primary_energy__pct"] = ( - 100 * combined["total_generation__twh"] / combined["primary_energy_consumption__twh"] - ) - - # Calculate the percentage of electricity demand that is imported. - combined["net_imports_share_of_demand__pct"] = ( - 100 * combined["total_net_imports__twh"] / combined["total_demand__twh"] - ) - - # Sanity check. - error = "Total electricity share does not add up to 100%." - assert all(abs(combined["total_share_of_electricity__pct"].dropna() - 100) < 0.01), error - - # Remove unnecessary columns. - combined = combined.drop(columns=["total_share_of_electricity__pct"]) - - return combined - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load BP's statistical review dataset. - ds_bp: catalog.Dataset = paths.load_dependency("statistical_review") - # Select main table. - table_bp = ds_bp["statistical_review"] - # Create a convenient dataframe. - df_bp = pd.DataFrame(table_bp) - - # Idem for Ember's combined electricity. - ds_ember: catalog.Dataset = paths.load_dependency("combined_electricity") - table_ember = ds_ember["combined_electricity"] - df_ember = pd.DataFrame(table_ember) - - # Idem for population dataset. - ds_population: catalog.Dataset = paths.load_dependency("population") - # Get table from dataset. - tb_population = ds_population["population"] - # Make a dataframe out of the data in the table, with the required columns. - df_population = pd.DataFrame(tb_population) - - # - # Process data. - # - # Prepare BP and Ember data. - df_bp = process_bp_data(table_bp=table_bp) - df_ember = process_ember_data(table_ember=table_ember) - - # Combine both tables, giving priority to Ember data (on overlapping values). - combined = combine_two_overlapping_dataframes(df1=df_ember, df2=df_bp, index_columns=["country", "year"]) - - # Add carbon intensities. - # There is already a variable for this in the Ember dataset, but now that we have combined - # BP and Ember data, intensities should be recalculated for consistency. - combined["co2_intensity__gco2_kwh"] = (combined["total_emissions__mtco2"] * MT_TO_G) / ( - combined["total_generation__twh"] * TWH_TO_KWH - ) - - # Add per capita variables. - combined = add_per_capita_variables(combined=combined, population=df_population) - - # Add "share" variables. - combined = add_share_variables(combined=combined) - - # Set an appropriate index and sort rows and columns conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Convert dataframe into a table (with no metadata). - table = catalog.Table(combined, short_name="electricity_mix") - - # - # Save outputs. - # - # Create a new garden dataset. 
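combine_two_overlapping_dataframes is provided by owid.datautils; as a rough mental model (an approximation of its documented behaviour, not the library's implementation), it acts like an index-aligned combine_first, keeping the first dataframe's values wherever both have data:

import pandas as pd


def combine_with_priority(df1: pd.DataFrame, df2: pd.DataFrame, index_columns: list) -> pd.DataFrame:
    # Keep df1's values wherever both dataframes have data; fill the gaps from df2.
    return df1.set_index(index_columns).combine_first(df2.set_index(index_columns)).reset_index()


# Toy numbers, for illustration only.
ember = pd.DataFrame({"country": ["Spain"], "year": [2020], "total_generation__twh": [251.0]})
bp = pd.DataFrame({"country": ["Spain", "Spain"], "year": [2019, 2020], "total_generation__twh": [261.0, 250.0]})
print(combine_with_priority(ember, bp, ["country", "year"]))
# 2019 comes from BP alone; for 2020, Ember's 251.0 wins over BP's 250.0.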
- ds_garden = create_dataset(dest_dir=dest_dir, tables=[table]) - ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2023-02-20/fossil_fuel_production.meta.yml b/etl/steps/archive/garden/energy/2023-02-20/fossil_fuel_production.meta.yml deleted file mode 100644 index 019333dba95..00000000000 --- a/etl/steps/archive/garden/energy/2023-02-20/fossil_fuel_production.meta.yml +++ /dev/null @@ -1,135 +0,0 @@ -dataset: - namespace: energy - version: 2023-02-20 - title: Fossil fuel production (BP & Shift, 2023) - short_name: fossil_fuel_production - description: | - This dataset on fossil fuel production is generated by combining the latest data from [the BP Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html) and [The Shift Dataportal](https://www.theshiftdataportal.org/energy). - - BP provide fossil fuel production data from 1965 onwards (and crude prices from 1861 onwards). The Shift Dataportal provides long-term data from 1900, but only extends to 2016. - - To maintain consistency with the energy datasets on Our World in Data, we have taken BP data as preference - meaning if BP provides data for the given country and year, this is used. Where data is not available from BP for a given country, or pre-1965 we rely on data from Shift. - - We have converted primary production in exajoules to terawatt-hours using the conversion factor: 1,000,000 / 3,600 ~ 278. - - Production per capita has been calculated using a population dataset that is built and maintained by Our World in Data, based on [different sources](https://ourworldindata.org/population-sources). - sources: - - name: Our World in Data based on BP Statistical Review of World Energy (2022) - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - description: | - BP's region definitions sometimes differ from Our World in Data's definitions. For example, BP's North America includes only Canada, Mexico and United States, whereas Our World in Data's North America includes countries in Central America (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like "North America (BP)" to refer to BP's original data using their definition of the region, as well as "North America", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the contributions from the countries in the region. - - [BP's region definitions](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy/using-the-review/definitions-and-explanatory-notes.html#accordion_Regional%20definitions), denoted with "(BP)", are: - * "Asia Pacific (BP)": Brunei, Cambodia, China (Mainland), China Hong Kong SAR (Special Administrative Region), China Macau SAR (Special Administrative Region), Indonesia, Japan, Laos, Malaysia, Mongolia, North Korea, Philippines, Singapore, South Asia (Afghanistan, Bangladesh, India, Myanmar, Nepal, Pakistan and Sri Lanka), South Korea, Taiwan, Thailand, Vietnam, Australia, New Zealand, Papua New Guinea and Oceania. - * "Australasia (BP)": Australia, New Zealand. - * "CIS (BP)" - Commonwealth of Independent States: Armenia, Azerbaijan, Belarus, Kazakhstan, Kyrgyzstan, Moldova, Russian Federation, Tajikistan, Turkmenistan, Uzbekistan. 
- * "Caribbean (BP)": Atlantic islands between the US Gulf Coast and South America, including Puerto Rico, US Virgin Islands and Bermuda. - * "Central America (BP)": Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama - * "Eastern Africa (BP)": Territories on the east coast of Africa from Sudan to Mozambique. Also Madagascar, Malawi, Uganda, Zambia, Zimbabwe. - * "Europe (BP)": European members of the OECD plus Albania, Bosnia-Herzegovina, Bulgaria, Croatia, Cyprus, Georgia, Gibraltar, Latvia, Lithuania, Malta, Montenegro, North Macedonia, Romania, Serbia and Ukraine. - * "Middle Africa (BP)": Angola, Cameroon, Central African Republic, Chad, Democratic Republic of Congo, Republic of Congo, Equatorial Guinea, Gabon, Sao Tome & Principe. - * "Middle East (BP)": Arabian Peninsula, Iran, Iraq, Israel, Jordan, Lebanon, Syria. - * "Non-OECD (BP)" - Organization for Economic Co-operation and Development: All countries that are not members of the OECD. - * "North America (BP)": US (excluding US territories), Canada, Mexico - * "Northern Africa (BP)": Territories on the north coast of Africa from Egypt to Western Sahara. - * "OECD (BP)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, UK, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, US. - * "OPEC (BP)" - Organization of the Petroleum Exporting Countries: Iran, Iraq, Kuwait, Saudi Arabia, United Arab Emirates, Algeria, Libya, Angola, Equatorial Guinea, Gabon, Nigeria, Republic of Congo, Venezuela. - * "South and Central America (BP)": Caribbean (including Puerto Rico and US Virgin Islands), Bermuda, Central and South America. - * "Southern Africa (BP)": Botswana, Lesotho, Namibia, South Africa, Swaziland. - * "Western Africa (BP)": Territories on the west coast of Africa from Mauritania to Nigeria, including Burkina Faso, Cape Verde, Mali and Niger. - - Additionally, BP includes some regions that are not explicitly defined (e.g. "Other Europe", or "Other CIS"). We define our regions in the following way: - * "Africa" - All African countries + "Other Africa". - * "Asia" - All Asian countries + "Other Middle East" + "Other CIS" + "Other Asia Pacific". - * "Europe" - All European countries + "Other Europe". - * "North America" - All North American countries + "Other Caribbean" + "Other North America". - * "Oceania" - All Oceanian countries. - * "South America" - All South American countries + "Other South America". - Where the individual countries in each region are defined [in this map](https://ourworldindata.org/world-region-map-definitions). Additional BP regions are ignored, since they belong to other regions already included (e.g. the data for "Other Western Africa" is included in "Other Africa"). Finally, income groups are constructed following the definitions [in this map](https://ourworldindata.org/grapher/world-banks-income-groups). 
- - name: Our World in Data based on The Shift Dataportal (2022) - published_by: The Shift Dataportal - date_accessed: 2022-07-18 - url: https://www.theshiftdataportal.org/energy -tables: - fossil_fuel_production: - variables: - annual_change_in_coal_production__pct: - title: Annual change in coal production (%) - short_unit: '%' - unit: '%' - display: - name: Annual change in coal production - annual_change_in_coal_production__twh: - title: Annual change in coal production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Annual change in coal production - annual_change_in_gas_production__pct: - title: Annual change in gas production (%) - short_unit: '%' - unit: '%' - display: - name: Annual change in gas production - annual_change_in_gas_production__twh: - title: Annual change in gas production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Annual change in gas production - annual_change_in_oil_production__pct: - title: Annual change in oil production (%) - short_unit: '%' - unit: '%' - display: - name: Annual change in oil production - annual_change_in_oil_production__twh: - title: Annual change in oil production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Annual change in oil production - coal_production__twh: - title: Coal production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal production - numDecimalPlaces: 0 - coal_production_per_capita__kwh: - title: Coal production per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Coal production per capita - numDecimalPlaces: 0 - gas_production__twh: - title: Gas production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas production - numDecimalPlaces: 0 - gas_production_per_capita__kwh: - title: Gas production per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Gas production per capita - numDecimalPlaces: 0 - oil_production__twh: - title: Oil production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Oil production - numDecimalPlaces: 0 - oil_production_per_capita__kwh: - title: Oil production per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Oil production per capita - numDecimalPlaces: 0 diff --git a/etl/steps/archive/garden/energy/2023-02-20/fossil_fuel_production.py b/etl/steps/archive/garden/energy/2023-02-20/fossil_fuel_production.py deleted file mode 100644 index 5f8b03f29a4..00000000000 --- a/etl/steps/archive/garden/energy/2023-02-20/fossil_fuel_production.py +++ /dev/null @@ -1,264 +0,0 @@ -"""Garden step for Fossil fuel production dataset (part of the OWID Energy dataset), based on a combination of BP's -Statistical Review dataset and Shift data on fossil fuel production. - -""" - -import numpy as np -import pandas as pd -from owid.catalog import Dataset, Table -from owid.datautils import dataframes -from shared import add_population - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 - - -def prepare_bp_data(tb_bp: Table) -> Table: - """Prepare BP data. - - Parameters - ---------- - tb_bp : Table - BP data. - - Returns - ------- - tb_bp : Table - BP data as a table with metadata. 
- - """ - tb_bp = tb_bp.reset_index() - - bp_columns = { - "country": "country", - "year": "year", - "coal_production__twh": "Coal production (TWh)", - "gas_production__twh": "Gas production (TWh)", - "oil_production__twh": "Oil production (TWh)", - } - tb_bp = tb_bp[list(bp_columns)].rename(columns=bp_columns) - - return tb_bp - - -def prepare_shift_data(tb_shift: Table) -> Table: - """Prepare Shift data. - - Parameters - ---------- - tb_shift : Table - Shift data. - - Returns - ------- - shift_table : Table - Shift data as a table with metadata. - - """ - tb_shift = tb_shift.reset_index() - - shift_columns = { - "country": "country", - "year": "year", - "coal": "Coal production (TWh)", - "gas": "Gas production (TWh)", - "oil": "Oil production (TWh)", - } - tb_shift = tb_shift[list(shift_columns)].rename(columns=shift_columns) - - return tb_shift - - -def combine_bp_and_shift_data(tb_bp: Table, tb_shift: Table) -> pd.DataFrame: - """Combine BP and Shift data. - - Parameters - ---------- - tb_bp : Table - Processed BP table. - tb_shift : Table - Process Shift table. - - Returns - ------- - combined : pd.DataFrame - Combined data. - - """ - # Check that there are no duplicated rows in any of the two datasets. - assert tb_bp[tb_bp.duplicated(subset=["country", "year"])].empty, "Duplicated rows in BP data." - assert tb_shift[tb_shift.duplicated(subset=["country", "year"])].empty, "Duplicated rows in Shift data." - - # Combine Shift data (which goes further back in the past) with BP data (which is more up-to-date). - # On coincident rows, prioritise BP data. - index_columns = ["country", "year"] - combined = dataframes.combine_two_overlapping_dataframes(df1=tb_bp, df2=tb_shift, index_columns=index_columns) - - # Remove rows that only have nan. - combined = combined.dropna(subset=combined.drop(columns=["country", "year"]).columns, how="all") - - # Sort data appropriately. - combined = pd.DataFrame(combined).sort_values(index_columns).reset_index(drop=True) - - return combined - - -def add_annual_change(df: pd.DataFrame) -> pd.DataFrame: - """Add annual change variables to combined BP & Shift dataset. - - Parameters - ---------- - df : pd.DataFrame - Combined BP & Shift dataset. - - Returns - ------- - combined : pd.DataFrame - Combined BP & Shift dataset after adding annual change variables. - - """ - combined = df.copy() - - # Calculate annual change. - combined = combined.sort_values(["country", "year"]).reset_index(drop=True) - for cat in ("Coal", "Oil", "Gas"): - combined[f"Annual change in {cat.lower()} production (%)"] = ( - combined.groupby("country")[f"{cat} production (TWh)"].pct_change() * 100 - ) - combined[f"Annual change in {cat.lower()} production (TWh)"] = combined.groupby("country")[ - f"{cat} production (TWh)" - ].diff() - - return combined - - -def add_per_capita_variables(df: pd.DataFrame, population: pd.DataFrame) -> pd.DataFrame: - """Add per-capita variables to combined BP & Shift dataset. - - Parameters - ---------- - df : pd.DataFrame - Combined BP & Shift dataset. - population : pd.DataFrame - Population data. - - Returns - ------- - combined : pd.DataFrame - Combined BP & Shift dataset after adding per-capita variables. - - """ - df = df.copy() - - # List countries for which we expect to have no population. - # These are countries and regions defined by BP and Shift. - expected_countries_without_population = [ - country for country in df["country"].unique() if (("(BP)" in country) or ("(Shift)" in country)) - ] - # Add population to data. 
- combined = add_population( - df=df, - population=population, - country_col="country", - year_col="year", - population_col="population", - warn_on_missing_countries=False, - interpolate_missing_population=True, - expected_countries_without_population=expected_countries_without_population, - ) - - # Calculate production per capita. - for cat in ("Coal", "Oil", "Gas"): - combined[f"{cat} production per capita (kWh)"] = ( - combined[f"{cat} production (TWh)"] / combined["population"] * TWH_TO_KWH - ) - combined = combined.drop(errors="raise", columns=["population"]) - - return combined - - -def remove_spurious_values(df: pd.DataFrame) -> pd.DataFrame: - """Remove spurious infinity values. - - These values are generated when calculating the annual change of a variable that is zero or nan the previous year. - - Parameters - ---------- - df : pd.DataFrame - Data that may contain infinity values. - - Returns - ------- - df : pd.DataFrame - Corrected data. - - """ - # Replace any infinity value by nan. - df = df.replace([np.inf, -np.inf], np.nan) - - # Remove rows that only have nan. - df = df.dropna(subset=df.drop(columns=["country", "year"]).columns, how="all").reset_index(drop=True) - - return df - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load BP statistical review dataset. - ds_bp: Dataset = paths.load_dependency("statistical_review") - # Read main table from dataset. - tb_bp = ds_bp["statistical_review"] - - # Load Shift data. - ds_shift: Dataset = paths.load_dependency("fossil_fuel_production") - # Read main table from dataset. - tb_shift = ds_shift["fossil_fuel_production"] - - # Load population dataset from garden. - ds_population: Dataset = paths.load_dependency("population") - # Get table from dataset. - tb_population = ds_population["population"] - # Make a dataframe out of the data in the table, with the required columns. - df_population = pd.DataFrame(tb_population) - - # - # Process data. - # - # Prepare BP data. - tb_bp = prepare_bp_data(tb_bp=tb_bp) - - # Prepare Shift data on fossil fuel production. - tb_shift = prepare_shift_data(tb_shift=tb_shift) - - # Combine BP and Shift data. - df = combine_bp_and_shift_data(tb_bp=tb_bp, tb_shift=tb_shift) - - # Add annual change. - df = add_annual_change(df=df) - - # Add per-capita variables. - df = add_per_capita_variables(df=df, population=df_population) - - # Remove spurious values and rows that only have nans. - df = remove_spurious_values(df=df) - - # Create an appropriate index and sort conveniently. - df = df.set_index(["country", "year"], verify_integrity=True).sort_index() - - # Create new table. - table = Table(df, short_name="fossil_fuel_production") - - # - # Save outputs. - # - # Create a new dataset with the same metadata as in Meadow. 
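remove_spurious_values above exists because pct_change yields ±inf whenever the previous year's value is zero; the replace-then-drop pattern on its own, with toy data:

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "country": ["A", "A"],
        "year": [2000, 2001],
        "change_pct": [np.inf, 5.0],
        "level_twh": [np.nan, 7.0],
    }
)
# Replace any infinity value by nan, then drop rows whose data columns are all nan.
df = df.replace([np.inf, -np.inf], np.nan)
data_columns = df.drop(columns=["country", "year"]).columns
df = df.dropna(subset=data_columns, how="all").reset_index(drop=True)
print(df)  # Only the 2001 row survives.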
- ds_garden = create_dataset(dest_dir, tables=[table]) - ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2023-02-20/global_primary_energy.meta.yml b/etl/steps/archive/garden/energy/2023-02-20/global_primary_energy.meta.yml deleted file mode 100644 index c081843798e..00000000000 --- a/etl/steps/archive/garden/energy/2023-02-20/global_primary_energy.meta.yml +++ /dev/null @@ -1,272 +0,0 @@ -dataset: - namespace: energy - version: 2023-02-20 - title: Global Primary Energy (Smil & BP, 2023) - short_name: global_primary_energy - description: | - This dataset comprises a combination of data from Appendix A of Vaclav Smil's Updated and Revised Edition of his book, 'Energy Transitions: Global and National Perspectives' (2017) and BP's Statistical Review of World Energy (2022). - - All data prior to the year 1965 is sourced from Smil (2017). All data from 1965 onwards, with the exception of traditional biomass, is sourced from BP Statistical Review. Smil's estimates of traditional biomass are only available until 2015. For the years 2016 onwards, we have assumed a similar level of traditional biomass consumption. This is approximately in line with recent trends in traditional biomass from Smil's data. - - Our World in Data has normalized all BP fossil fuels data to terawatt-hours (TWh) using a conversion factor of 1,000,000 / 3,600 (~277.778) to convert from exajoules (EJ) to TWh. - - This dataset includes primary energy data using two methodologies: - (1) 'direct' primary energy, which does not take account of the inefficiencies in fossil fuel production. Fossil fuel data is compared to electricity generation (not in input equivalents) of nuclear and renewables. - (2) 'substitution' primary energy, which does take account of inefficiencies in fossil fuel production. This converts non-fossil energy sources to their 'input equivalents': the amount of primary energy that would be needed if they had the same inefficiencies as fossil fuels. This is the methodology adopted by BP when all data is compared in exajoules.
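The factor quoted above follows directly from the unit definitions: 1 EJ is 1e18 J, and 1 TWh is 1e12 Wh * 3,600 J/Wh = 3.6e15 J, so 1 EJ = 1e18 / 3.6e15 ≈ 277.778 TWh, i.e. 1,000,000 / 3,600. As a one-liner with a hypothetical input value:

# 1 EJ = 1e18 J; 1 TWh = 1e12 Wh * 3600 J/Wh = 3.6e15 J.
EJ_TO_TWH = 1e6 / 3600  # ~277.778 TWh per EJ

coal_ej = 160.1  # hypothetical consumption in exajoules
print(f"{coal_ej * EJ_TO_TWH:.0f} TWh")  # ~44472 TWh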
- -tables: - global_primary_energy: - variables: - biofuels__twh_direct_energy: - title: Biofuels (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Modern biofuels - biofuels__twh_substituted_energy: - title: Biofuels (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Modern biofuels - coal__twh_direct_energy: - title: Coal (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - coal__twh_substituted_energy: - title: Coal (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - # data_source - gas__twh_direct_energy: - title: Gas (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Natural gas - gas__twh_substituted_energy: - title: Gas (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Natural gas - hydropower__twh_direct_energy: - title: Hydropower (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydropower - hydropower__twh_substituted_energy: - title: Hydropower (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydropower - nuclear__twh_direct_energy: - title: Nuclear (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - nuclear__twh_substituted_energy: - title: Nuclear (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - oil__twh_direct_energy: - title: Oil (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Oil - oil__twh_substituted_energy: - title: Oil (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Oil - other_renewables__twh_direct_energy: - title: Other renewables (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables - other_renewables__twh_substituted_energy: - title: Other renewables (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables - solar__twh_direct_energy: - title: Solar (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - solar__twh_substituted_energy: - title: Solar (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - traditional_biomass__twh_direct_energy: - title: Traditional biomass (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Traditional biomass - traditional_biomass__twh_substituted_energy: - title: Traditional biomass (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Traditional biomass - wind__twh_direct_energy: - title: Wind (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - wind__twh_substituted_energy: - title: Wind (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - total_consumption__twh_direct_energy: - title: Total consumption (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Total consumption - total_consumption__twh_substituted_energy: - title: Total consumption (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Total consumption - biofuels__pct_of_direct_energy: - title: Biofuels (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Modern biofuels - biofuels__pct_of_substituted_energy: - title: Biofuels (%, substituted energy) - short_unit: "%" - unit: 
"%" - display: - name: Modern biofuels - coal__pct_of_direct_energy: - title: Coal (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Coal - coal__pct_of_substituted_energy: - title: Coal (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Coal - gas__pct_of_direct_energy: - title: Gas (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Natural gas - gas__pct_of_substituted_energy: - title: Gas (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Natural gas - hydropower__pct_of_direct_energy: - title: Hydropower (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Hydropower - hydropower__pct_of_substituted_energy: - title: Hydropower (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Hydropower - nuclear__pct_of_direct_energy: - title: Nuclear (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Nuclear - nuclear__pct_of_substituted_energy: - title: Nuclear (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Nuclear - oil__pct_of_direct_energy: - title: Oil (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Oil - oil__pct_of_substituted_energy: - title: Oil (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Oil - other_renewables__pct_of_direct_energy: - title: Other renewables (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Other renewables - other_renewables__pct_of_substituted_energy: - title: Other renewables (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Other renewables - solar__pct_of_direct_energy: - title: Solar (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Solar - solar__pct_of_substituted_energy: - title: Solar (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Solar - traditional_biomass__pct_of_direct_energy: - title: Traditional biomass (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Traditional biomass - traditional_biomass__pct_of_substituted_energy: - title: Traditional biomass (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Traditional biomass - wind__pct_of_direct_energy: - title: Wind (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Wind - wind__pct_of_substituted_energy: - title: Wind (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Wind diff --git a/etl/steps/archive/garden/energy/2023-02-20/global_primary_energy.py b/etl/steps/archive/garden/energy/2023-02-20/global_primary_energy.py deleted file mode 100644 index 7aeda794b0c..00000000000 --- a/etl/steps/archive/garden/energy/2023-02-20/global_primary_energy.py +++ /dev/null @@ -1,214 +0,0 @@ -"""Garden step that combines Vaclav Smil's Global Primary Energy with BP's Statistical Review of World Energy. - -""" - -import numpy as np -import pandas as pd -from owid.catalog import Dataset, Table -from owid.datautils.dataframes import combine_two_overlapping_dataframes -from shared import gather_sources_from_tables - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Exajoules to terawatt-hours. -EJ_TO_TWH = 1e6 / 3600 - -# Average efficiency factor assumed to convert direct energy to input-equivalent energy of Smil's data. -# This factor will be used for hydropower, nuclear, other renewables, solar and wind -# (for which there is data until 1960). 
-# In practice, it only affects hydropower, since all other non-fossil sources are zero prior to 1960. -# All other energy sources in Smil's data will not be affected by this factor. -EFFICIENCY_FACTOR = 0.36 - - -def prepare_bp_data(tb_bp: Table) -> pd.DataFrame: - df_bp = pd.DataFrame(tb_bp).reset_index() - - # BP gives generation of direct energy in TWh, and, for non-fossil sources of electricity, - # consumption of input-equivalent energy in EJ. - # The input-equivalent energy is the amount of energy that would be required to generate a given amount of (direct) - # electricity if non-fossil sources were as inefficient as a standard thermal power plant. - # Therefore, direct and substituted energies for Biofuels, Coal, Gas and Oil are identical. - # On the other hand, direct and substituted energy are different for non-fossil electricity sources, namely - # Hydropower, Nuclear, Solar, Other renewables, and Wind. - # The difference is of a factor of ~38%, which is roughly the efficiency of a standard power plant. - # More specifically, BP assumes (for Biofuels, Coal, Gas and Oil) an efficiency factor that grows from 36% - # (until year 2000) to 40.6% (in 2021), to better reflect changes in efficiency over time. - # In the case of biomass used in electricity (included in 'Other renewables'), - # BP assumes a constant factor of 32% for all years. - # For more details: - # https://www.bp.com/content/dam/bp/business-sites/en/global/corporate/pdfs/energy-economics/statistical-review/bp-stats-review-2022-methodology.pdf - bp_columns = { - "country": "country", - "year": "year", - # Fossil sources (direct energy). - "biofuels_consumption__twh__total": "biofuels__twh_direct_energy", - "coal_consumption__twh": "coal__twh_direct_energy", - "gas_consumption__twh": "gas__twh_direct_energy", - "oil_consumption__twh": "oil__twh_direct_energy", - # Non-fossil electricity sources (direct energy). - "geo_biomass_other__twh": "other_renewables__twh_direct_energy", - "hydro_generation__twh": "hydropower__twh_direct_energy", - "nuclear_generation__twh": "nuclear__twh_direct_energy", - "solar_generation__twh": "solar__twh_direct_energy", - "wind_generation__twh": "wind__twh_direct_energy", - # Non-fossil electricity sources (substituted energy). - "geo_biomass_other__ej": "other_renewables__ej_substituted_energy", - "hydro_consumption__ej": "hydropower__ej_substituted_energy", - "nuclear_consumption__ej": "nuclear__ej_substituted_energy", - "solar_consumption__ej": "solar__ej_substituted_energy", - "wind_consumption__ej": "wind__ej_substituted_energy", - } - df_bp = df_bp[list(bp_columns)].rename(columns=bp_columns) - # Convert all units to TWh. - for column in df_bp.columns: - if "_ej_" in column: - # Create a new column in TWh instead of EJ. - df_bp[column.replace("_ej_", "_twh_")] = df_bp[column] * EJ_TO_TWH - # Remove the column in EJ. - df_bp = df_bp.drop(columns=column) - # For completeness, create columns of substituted energy for fossil sources (even if they would coincide with - # direct energy). - for fossil_source in ["biofuels", "coal", "gas", "oil"]: - df_bp[f"{fossil_source}__twh_substituted_energy"] = df_bp[f"{fossil_source}__twh_direct_energy"] - - # Select only data for the World (which is the only region informed in Smil's data). - df_bp = df_bp[df_bp["country"] == "World"].reset_index(drop=True) - - return df_bp - - -def prepare_smil_data(tb_smil: Table) -> pd.DataFrame: - df_smil = pd.DataFrame(tb_smil).reset_index() - - # Create columns for input-equivalent energy. 
- # To do this, we follow a similar approach to BP: - # We create input-equivalent energy by dividing direct energy consumption of non-fossil electricity sources - # (hydropower, nuclear, other renewables, solar and wind) by a factor of 36% - # (called EFFICIENCY_FACTOR, defined above). - # This is the efficiency factor of a typical thermal plant assumed by BP between 1965 and 2000, and we assume this - # factor also applies for the period 1800 to 1965. - # For biomass power (included in other renewables), BP assumed a constant factor of 32%. - # However, since we cannot separate biomass from the rest of sources in 'other renewables', - # we use the same 36% factor as all other non-fossil sources. - for source in ["hydropower", "nuclear", "other_renewables", "solar", "wind"]: - df_smil[f"{source}__twh_substituted_energy"] = df_smil[f"{source}__twh_direct_energy"] / EFFICIENCY_FACTOR - # For fossil sources (including biofuels and traditional biomass), direct and substituted energy are the same. - for source in ["biofuels", "coal", "gas", "oil", "traditional_biomass"]: - df_smil[f"{source}__twh_substituted_energy"] = df_smil[f"{source}__twh_direct_energy"] - - return df_smil - - -def combine_bp_and_smil_data(df_bp: pd.DataFrame, df_smil: pd.DataFrame) -> pd.DataFrame: - df_bp = df_bp.copy() - df_smil = df_smil.copy() - - # Add a new column that informs of the source of the data. - df_bp["data_source"] = "BP" - df_smil["data_source"] = "Smil" - # Combine both dataframes, prioritizing BP's data on overlapping rows. - combined = combine_two_overlapping_dataframes( - df1=df_bp, df2=df_smil, index_columns=["country", "year"] - ).sort_values(["year"]) - - # Replace by numpy nans. - combined = combined.fillna(np.nan) - - # We do not have data for traditional biomass after 2015 (BP does not provide it). - # So, to be able to visualize the complete mix of global energy consumption, - # we extrapolate Smil's data for traditional biomass from 2015 onwards, by repeating its last value. - missing_years_mask = combined["year"] >= df_smil["year"].max() - combined.loc[missing_years_mask, "traditional_biomass__twh_direct_energy"] = combined[missing_years_mask][ - "traditional_biomass__twh_direct_energy" - ].ffill() - combined.loc[missing_years_mask, "traditional_biomass__twh_substituted_energy"] = combined[missing_years_mask][ - "traditional_biomass__twh_substituted_energy" - ].ffill() - - # Create an index and sort conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - return combined - - -def add_total_consumption_and_percentages(combined: pd.DataFrame) -> pd.DataFrame: - # Create a column with the total direct energy (ensuring there is at least one non-nan value). - combined["total_consumption__twh_direct_energy"] = combined[ - [column for column in combined.columns if "direct_energy" in column] - ].sum(axis=1, min_count=1) - # Create a column with the total substituted energy (ensuring there is at least one non-nan value). - combined["total_consumption__twh_substituted_energy"] = combined[ - [column for column in combined.columns if "substituted_energy" in column] - ].sum(axis=1, min_count=1) - # Add share variables. - sources = [ - "biofuels", - "coal", - "gas", - "hydropower", - "nuclear", - "oil", - "other_renewables", - "solar", - "traditional_biomass", - "wind", - ] - for source in sources: - # Add percentage of each source with respect to the total direct energy. 
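Once the efficiency factor is fixed, the substitution method above is a single division: non-fossil electricity is scaled up by 1 / efficiency to its input equivalent, while fossil sources keep their direct values. A compact sketch under the same 36% assumption:

import pandas as pd

EFFICIENCY_FACTOR = 0.36  # assumed thermal-plant efficiency

df = pd.DataFrame({"hydropower__twh_direct_energy": [1000.0], "coal__twh_direct_energy": [2000.0]})
# Non-fossil electricity: input-equivalent energy is direct energy / efficiency.
df["hydropower__twh_substituted_energy"] = df["hydropower__twh_direct_energy"] / EFFICIENCY_FACTOR
# Fossil sources: direct and substituted energy coincide.
df["coal__twh_substituted_energy"] = df["coal__twh_direct_energy"]
print(df.round(1).to_dict("records"))
# hydropower: 1000 TWh direct -> ~2777.8 TWh substituted; coal unchanged.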
- combined[f"{source}__pct_of_direct_energy"] = ( - 100 * combined[f"{source}__twh_direct_energy"] / combined["total_consumption__twh_direct_energy"] - ) - # Add percentage of each source with respect to the total substituted energy. - combined[f"{source}__pct_of_substituted_energy"] = ( - 100 * combined[f"{source}__twh_substituted_energy"] / combined["total_consumption__twh_substituted_energy"] - ) - - return combined - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load BP statistical review dataset. - ds_bp: Dataset = paths.load_dependency("statistical_review") - # Read main table from dataset. - tb_bp = ds_bp["statistical_review"] - - # Load Smil dataset. - ds_smil: Dataset = paths.load_dependency("global_primary_energy") - # Read main table from dataset. - tb_smil = ds_smil["global_primary_energy"] - - # - # Process data. - # - # Prepare BP data. - df_bp = prepare_bp_data(tb_bp=tb_bp) - - # Prepare Smil data. - df_smil = prepare_smil_data(tb_smil=tb_smil) - - # Combine BP and Smil data. - combined = combine_bp_and_smil_data(df_bp=df_bp, df_smil=df_smil) - - # Add variables for total consumption and variables of % share of each source. - combined = add_total_consumption_and_percentages(combined=combined) - - # Create a new table with combined data (and no metadata). - tb_combined = Table(combined, short_name="global_primary_energy") - - # - # Save outputs. - # - # Create a new empty garden dataset to gather metadata sources from all tables' original dataset sources. - ds_garden = Dataset.create_empty(dest_dir) - ds_garden.metadata.sources = gather_sources_from_tables(tables=[tb_bp, tb_smil]) - - # Save garden dataset. - ds_garden = create_dataset(dest_dir, tables=[tb_combined], default_metadata=ds_garden.metadata) - ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2023-02-20/owid_energy.meta.yml b/etl/steps/archive/garden/energy/2023-02-20/owid_energy.meta.yml deleted file mode 100644 index a684fa8a9c2..00000000000 --- a/etl/steps/archive/garden/energy/2023-02-20/owid_energy.meta.yml +++ /dev/null @@ -1,16 +0,0 @@ -dataset: - namespace: energy - version: 2023-02-20 - title: Energy dataset (OWID, 2023) - short_name: owid_energy - description: | - OWID Energy dataset. - - This dataset will be loaded by [the energy-data repository](https://github.com/owid/energy-data), to create a csv file of the dataset that can be downloaded in one click. - -# Dataset sources will be created in the step by combining all component datasets' sources. -# Also, table metadata will be built from the tables' metadata and the content of owid_energy_variable_mapping.csv. - -tables: - owid_energy: - variables: {} diff --git a/etl/steps/archive/garden/energy/2023-02-20/owid_energy.py b/etl/steps/archive/garden/energy/2023-02-20/owid_energy.py deleted file mode 100644 index 55edcf41321..00000000000 --- a/etl/steps/archive/garden/energy/2023-02-20/owid_energy.py +++ /dev/null @@ -1,180 +0,0 @@ -"""Garden step that combines various datasets related to energy and produces the OWID Energy dataset. - -Datasets combined: -* Energy mix from BP. -* Fossil fuel production (BP & Shift). -* Primary energy consumption (BP & EIA). -* Electricity mix (BP & Ember). - -Auxiliary datasets: -* Regions (OWID). -* Population (OWID based on various sources). -* GDP (GGDC Maddison). 
- -""" - -from typing import Dict, cast - -import numpy as np -import pandas as pd -from owid.catalog import Dataset, Table -from owid.catalog.meta import Source -from owid.datautils import dataframes -from shared import add_population, gather_sources_from_tables - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Path to file with mapping of variable names from one of the datasets to the final energy dataset. -VARIABLE_MAPPING_FILE = paths.directory / "owid_energy_variable_mapping.csv" - - -def combine_tables_data_and_metadata( - tables: Dict[str, Table], - population: Table, - countries_regions: Table, - gdp: pd.DataFrame, - variable_mapping: pd.DataFrame, -) -> Table: - """Combine data and metadata of a list of tables, map variable names and add variables metadata. - - Parameters - ---------- - tables : dict - Dictionary where the key is the short name of the table, and the value is the actual table, for all tables to be - combined. - population: Table - Population data. - countries_regions : Table - Main table from countries-regions dataset. - gdp: pd.DataFrame - GDP (from owid catalog, after converting into a dataframe, resetting index, and selecting country, year and gdp - columns). - variable_mapping : pd.DataFrame - Dataframe (with columns variable, source_variable, source_dataset, description, source) that specifies the names - of variables to take from each table, and their new name in the output table. It also gives a description of the - variable, and the sources of the table. - - Returns - ------- - tb_combined : Table - Combined table with metadata. - - """ - # Merge all tables as a dataframe (without metadata). - dfs = [pd.DataFrame(table) for table in tables.values()] - df_combined = dataframes.multi_merge(dfs, on=["country", "year"], how="outer") - - # Add ISO codes for countries (regions that are not in countries-regions dataset will have nan iso_code). - df_combined = pd.merge(df_combined, countries_regions, left_on="country", right_on="name", how="left") - - # Add population and gdp of countries (except for dataset-specific regions e.g. those ending in (BP) or (Shift)). - df_combined = add_population(df=df_combined, population=population, warn_on_missing_countries=False) - df_combined = pd.merge(df_combined, gdp, on=["country", "year"], how="left") - - # Check that there were no repetition in column names. - error = "Repeated columns in combined data." - assert len([column for column in set(df_combined.columns) if "_x" in column]) == 0, error - - # Create a table with combined data and no metadata. - tb_combined = Table(df_combined, short_name="owid_energy") - - # List the names of the variables described in the variable mapping file. - source_variables = variable_mapping.index.get_level_values(0).tolist() - - # Gather original metadata for each variable, add the descriptions and sources from the variable mapping file. - for source_variable in source_variables: - variable_metadata = variable_mapping.loc[source_variable] - source_dataset = variable_metadata["source_dataset"] - # Check that the variable indeed exists in the original dataset that the variable mapping says. - # Ignore columns "country", "year" (assigned to a dummy dataset 'various_datasets'), "population" (that comes - # from key_indicators) and "iso_alpha3" (that comes from countries_regions dataset). 
- if source_dataset not in [ - "various_datasets", - "countries_regions", - "key_indicators", - "maddison_gdp", - ]: - error = f"Variable {source_variable} not found in any of the original datasets." - assert source_variable in tables[source_dataset].columns, error - tb_combined[source_variable].metadata = tables[source_dataset][source_variable].metadata - - # Update metadata with the content of the variable mapping file. - tb_combined[source_variable].metadata.description = variable_metadata["description"] - tb_combined[source_variable].metadata.sources = [Source(name=variable_metadata["source"])] - - # Select only variables in the mapping file, and rename variables according to the mapping. - tb_combined = tb_combined[source_variables].rename(columns=variable_mapping.to_dict()["variable"]) - - # Remove rows that only have nan (ignoring if country, year, iso_code, population and gdp do have data). - columns_that_must_have_data = [ - column for column in tb_combined.columns if column not in ["country", "year", "iso_code", "population", "gdp"] - ] - tb_combined = tb_combined.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) - - # Sanity check. - columns_with_inf = [column for column in tb_combined.columns if len(tb_combined[tb_combined[column] == np.inf]) > 0] - assert len(columns_with_inf) == 0, f"Infinity values detected in columns: {columns_with_inf}" - - # Set index and sort conveniently. - tb_combined = tb_combined.set_index(["country", "year"], verify_integrity=True).sort_index() - - return cast(Table, tb_combined) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read all required datasets. - ds_energy_mix: Dataset = paths.load_dependency("energy_mix") - ds_fossil_fuels: Dataset = paths.load_dependency("fossil_fuel_production") - ds_primary_energy: Dataset = paths.load_dependency("primary_energy_consumption") - ds_electricity_mix: Dataset = paths.load_dependency("electricity_mix") - ds_population: Dataset = paths.load_dependency("population") - ds_ggdc: Dataset = paths.load_dependency("ggdc_maddison") - - # Gather all required tables from all datasets. - tb_energy_mix = ds_energy_mix["energy_mix"].reset_index() - tb_fossil_fuels = ds_fossil_fuels["fossil_fuel_production"].reset_index() - tb_primary_energy = ds_primary_energy["primary_energy_consumption"].reset_index() - tb_electricity_mix = ds_electricity_mix["electricity_mix"].reset_index() - tb_population = ds_population["population"].reset_index() - tb_regions = cast(Dataset, paths.load_dependency("regions"))["regions"] - tb_ggdc = ds_ggdc["maddison_gdp"].reset_index()[["country", "year", "gdp"]].dropna() - - # Load mapping from variable names in the component dataset to the final variable name in the output dataset. - variable_mapping = pd.read_csv(VARIABLE_MAPPING_FILE).set_index(["source_variable"]) - - # - # Process data. - # - # Combine all tables. - tables = { - "energy_mix": tb_energy_mix.drop(columns=["country_code"], errors="ignore"), - "fossil_fuel_production": tb_fossil_fuels, - "primary_energy_consumption": tb_primary_energy.drop(columns=["gdp", "population", "source"], errors="ignore"), - "electricity_mix": tb_electricity_mix.drop( - columns=["population", "primary_energy_consumption__twh"], errors="ignore" - ), - } - tb_combined = combine_tables_data_and_metadata( - tables=tables, - population=tb_population, - countries_regions=tb_regions, - gdp=tb_ggdc, - variable_mapping=variable_mapping, - ) - - # - # Save outputs. 
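At its core, the mapping-file pattern above reduces to a rename driven by an indexed dataframe: select the source variables listed in the CSV, then rename them to their public names. A toy version with an invented two-row mapping standing in for owid_energy_variable_mapping.csv:

import io

import pandas as pd

# Invented two-row stand-in for the real mapping file.
mapping_csv = """variable,source_variable,source_dataset,description,source
coal_production,coal_production__twh,fossil_fuel_production,Coal production (TWh),OWID
gas_production,gas_production__twh,fossil_fuel_production,Gas production (TWh),OWID
"""
variable_mapping = pd.read_csv(io.StringIO(mapping_csv)).set_index(["source_variable"])

data = pd.DataFrame({"coal_production__twh": [1.0], "gas_production__twh": [2.0], "unused": [3.0]})
source_variables = variable_mapping.index.get_level_values(0).tolist()
data = data[source_variables].rename(columns=variable_mapping.to_dict()["variable"])
print(data.columns.tolist())  # ['coal_production', 'gas_production']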
- # - # Gather metadata sources from all tables' original dataset sources. - ds_garden = Dataset.create_empty(dest_dir) - ds_garden.metadata.sources = gather_sources_from_tables(tables=list(tables.values())) - - # Create a new garden dataset. - ds_garden = create_dataset(dest_dir, tables=[tb_combined], default_metadata=ds_garden.metadata) - ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2023-02-20/owid_energy_variable_mapping.csv b/etl/steps/archive/garden/energy/2023-02-20/owid_energy_variable_mapping.csv deleted file mode 100644 index 8c6c44b25af..00000000000 --- a/etl/steps/archive/garden/energy/2023-02-20/owid_energy_variable_mapping.csv +++ /dev/null @@ -1,130 +0,0 @@ -variable,source_variable,source_dataset,description,source -country,country,various_datasets,Geographic location,Our World in Data -year,year,various_datasets,Year of observation,Our World in Data -iso_code,iso_alpha3,countries_regions,ISO 3166-1 alpha-3 three-letter country codes,International Organization for Standardization -population,population,key_indicators,"Population","Calculated by Our World in Data based on different sources (https://ourworldindata.org/population-sources)" -gdp,gdp,maddison_gdp,"Total real gross domestic product, inflation-adjusted",Maddison Project Database -biofuel_cons_change_pct,biofuels__pct_growth,energy_mix,Annual percentage change in biofuel consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -biofuel_cons_change_twh,biofuels__twh_growth,energy_mix,"Annual change in biofuel consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -biofuel_cons_per_capita,biofuels_per_capita__kwh,energy_mix,"Per capita primary energy consumption from biofuels, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -biofuel_consumption,biofuels__twh,energy_mix,"Primary energy consumption from biofuels, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -biofuel_elec_per_capita,per_capita_bioenergy_generation__kwh,electricity_mix,"Per capita electricity generation from biofuels, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -biofuel_electricity,bioenergy_generation__twh,electricity_mix,"Electricity generation from biofuels, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -biofuel_share_elec,bioenergy_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from biofuels,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -biofuel_share_energy,biofuels__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from biofuels,Calculated by Our World in Data based on BP Statistical Review of World Energy -carbon_intensity_elec,co2_intensity__gco2_kwh,electricity_mix,"Carbon intensity of electricity production, measured in grams of carbon dioxide emitted per kilowatt-hour",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -coal_cons_change_pct,coal__pct_growth,energy_mix,Annual percentage change in coal consumption,Calculated by Our World in Data based on BP 
Statistical Review of World Energy -coal_cons_change_twh,coal__twh_growth,energy_mix,"Annual change in coal consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -coal_cons_per_capita,coal_per_capita__kwh,energy_mix,"Per capita primary energy consumption from coal, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -coal_consumption,coal__twh,energy_mix,"Primary energy consumption from coal, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -coal_elec_per_capita,per_capita_coal_generation__kwh,electricity_mix,"Per capita electricity generation from coal, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -coal_electricity,coal_generation__twh,electricity_mix,"Electricity generation from coal, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -coal_prod_change_pct,annual_change_in_coal_production__pct,fossil_fuel_production,Annual percentage change in coal production,Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -coal_prod_change_twh,annual_change_in_coal_production__twh,fossil_fuel_production,"Annual change in coal production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -coal_prod_per_capita,coal_production_per_capita__kwh,fossil_fuel_production,"Per capita coal production, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -coal_production,coal_production__twh,fossil_fuel_production,"Coal production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -coal_share_elec,coal_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from coal,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -coal_share_energy,coal__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from coal,Calculated by Our World in Data based on BP Statistical Review of World Energy -electricity_demand,total_demand__twh,electricity_mix,"Electricity demand, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -electricity_generation,total_generation__twh,electricity_mix,"Electricity generation, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -electricity_share_energy,total_electricity_share_of_primary_energy__pct,electricity_mix,"Electricity generation as a share of primary energy",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -energy_cons_change_pct,annual_change_in_primary_energy_consumption__pct,primary_energy_consumption,Annual percentage change in primary energy consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy and EIA International Energy Data 
-energy_cons_change_twh,annual_change_in_primary_energy_consumption__twh,primary_energy_consumption,"Annual change in primary energy consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and EIA International Energy Data -energy_per_capita,primary_energy_consumption_per_capita__kwh,primary_energy_consumption,"Primary energy consumption per capita, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and EIA International Energy Data -energy_per_gdp,primary_energy_consumption_per_gdp__kwh_per_dollar,primary_energy_consumption,Energy consumption per unit of GDP. This is measured in kilowatt-hours per 2011 international-$.,"Calculated by Our World in Data based on BP Statistical Review of World Energy, EIA International Energy Data and Maddison Project Database" -fossil_cons_change_pct,fossil_fuels__pct_growth,energy_mix,Annual percentage change in fossil fuel consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -fossil_cons_change_twh,fossil_fuels__twh_growth,energy_mix,"Annual change in fossil fuel consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -fossil_elec_per_capita,per_capita_fossil_generation__kwh,electricity_mix,"Per capita electricity generation from fossil fuels, measured in kilowatt-hours. This is the sum of electricity generated from coal, oil and gas.",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -fossil_electricity,fossil_generation__twh,electricity_mix,"Electricity generation from fossil fuels, measured in terawatt-hours. This is the sum of electricity generation from coal, oil and gas.",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -fossil_energy_per_capita,fossil_fuels_per_capita__kwh,energy_mix,"Per capita fossil fuel consumption, measured in kilowatt-hours. This is the sum of primary energy from coal, oil and gas.",Calculated by Our World in Data based on BP Statistical Review of World Energy -fossil_fuel_consumption,fossil_fuels__twh,energy_mix,"Fossil fuel consumption, measured in terawatt-hours. 
This is the sum of primary energy from coal, oil and gas.",Calculated by Our World in Data based on BP Statistical Review of World Energy -fossil_share_elec,fossil_share_of_electricity__pct,electricity_mix,"Share of electricity generation that comes from fossil fuels (coal, oil and gas combined)",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -fossil_share_energy,fossil_fuels__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from fossil fuels,Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_cons_change_pct,gas__pct_growth,energy_mix,Annual percentage change in gas consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_cons_change_twh,gas__twh_growth,energy_mix,"Annual change in gas consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_consumption,gas__twh,energy_mix,"Primary energy consumption from gas, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_elec_per_capita,per_capita_gas_generation__kwh,electricity_mix,"Per capita electricity generation from gas, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -gas_electricity,gas_generation__twh,electricity_mix,"Electricity generation from gas, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -gas_energy_per_capita,gas_per_capita__kwh,energy_mix,"Per capita primary energy consumption from gas, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_prod_change_pct,annual_change_in_gas_production__pct,fossil_fuel_production,Annual percentage change in gas production,Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -gas_prod_change_twh,annual_change_in_gas_production__twh,fossil_fuel_production,"Annual change in gas production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -gas_prod_per_capita,gas_production_per_capita__kwh,fossil_fuel_production,"Per capita gas production, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -gas_production,gas_production__twh,fossil_fuel_production,"Gas production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -gas_share_elec,gas_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from gas,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -gas_share_energy,gas__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from gas,Calculated by Our World in Data based on BP Statistical Review of World Energy -greenhouse_gas_emissions,total_emissions__mtco2,electricity_mix,"Greenhouse-gas emissions produced in the generation of electricity, measured in million tonnes of CO2 equivalent",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and 
European Electricity Review -hydro_cons_change_pct,hydro__pct_growth,energy_mix,Annual percentage change in hydropower consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -hydro_cons_change_twh,hydro__twh_growth__equivalent,energy_mix,"Annual change in hydropower consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -hydro_consumption,hydro__twh__equivalent,energy_mix,"Primary energy consumption from hydropower, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -hydro_elec_per_capita,per_capita_hydro_generation__kwh,electricity_mix,"Per capita electricity generation from hydropower, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -hydro_electricity,hydro_generation__twh,electricity_mix,"Electricity generation from hydropower, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -hydro_energy_per_capita,hydro_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from hydropower, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -hydro_share_elec,hydro_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from hydropower,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -hydro_share_energy,hydro__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from hydropower,Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_cons_change_pct,low_carbon_energy__pct_growth,energy_mix,Annual percentage change in low-carbon energy consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_cons_change_twh,low_carbon_energy__twh_growth__equivalent,energy_mix,"Annual change in low-carbon energy consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_consumption,low_carbon_energy__twh__equivalent,energy_mix,"Primary energy consumption from low-carbon sources, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_elec_per_capita,per_capita_low_carbon_generation__kwh,electricity_mix,"Per capita electricity generation from low-carbon sources, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -low_carbon_electricity,low_carbon_generation__twh,electricity_mix,"Electricity generation from low-carbon sources, measured in terawatt-hours. 
This is the sum of electricity generation from renewables and nuclear power",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -low_carbon_energy_per_capita,low_carbon_energy_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from low-carbon sources, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_share_elec,low_carbon_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from low-carbon sources. This is the sum of electricity from renewables and nuclear,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -low_carbon_share_energy,low_carbon_energy__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from low-carbon sources. This is the sum of primary energy from renewables and nuclear,Calculated by Our World in Data based on BP Statistical Review of World Energy -net_elec_imports,total_net_imports__twh,electricity_mix,"Net electricity imports, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -net_elec_imports_share_demand,net_imports_share_of_demand__pct,electricity_mix,Net electricity imports as a share of electricity demand,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -nuclear_cons_change_pct,nuclear__pct_growth,energy_mix,Annual percentage change in nuclear consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -nuclear_cons_change_twh,nuclear__twh_growth__equivalent,energy_mix,"Annual change in nuclear consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -nuclear_consumption,nuclear__twh__equivalent,energy_mix,"Primary energy consumption from nuclear power, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -nuclear_elec_per_capita,per_capita_nuclear_generation__kwh,electricity_mix,"Per capita electricity generation from nuclear power, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -nuclear_electricity,nuclear_generation__twh,electricity_mix,"Electricity generation from nuclear power, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -nuclear_energy_per_capita,nuclear_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from nuclear, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -nuclear_share_elec,nuclear_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from nuclear power,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -nuclear_share_energy,nuclear__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from nuclear power,Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_cons_change_pct,oil__pct_growth,energy_mix,Annual percentage change in oil 
consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_cons_change_twh,oil__twh_growth,energy_mix,"Annual change in oil consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_consumption,oil__twh,energy_mix,"Primary energy consumption from oil, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_elec_per_capita,per_capita_oil_generation__kwh,electricity_mix,"Per capita electricity generation from oil, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -oil_electricity,oil_generation__twh,electricity_mix,"Electricity generation from oil, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -oil_energy_per_capita,oil_per_capita__kwh,energy_mix,"Per capita primary energy consumption from oil, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_prod_change_pct,annual_change_in_oil_production__pct,fossil_fuel_production,Annual percentage change in oil production,Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -oil_prod_change_twh,annual_change_in_oil_production__twh,fossil_fuel_production,"Annual change in oil production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -oil_prod_per_capita,oil_production_per_capita__kwh,fossil_fuel_production,"Per capita oil production, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -oil_production,oil_production__twh,fossil_fuel_production,"Oil production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -oil_share_elec,oil_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from oil,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -oil_share_energy,oil__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from oil,Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewable_consumption,other_renewables__twh__equivalent,energy_mix,"Primary energy consumption from other renewables, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewable_electricity,other_renewables_including_bioenergy_generation__twh,electricity_mix,"Electricity generation from other renewable sources including biofuels, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewable_exc_biofuel_electricity,other_renewables_excluding_bioenergy_generation__twh,electricity_mix,"Electricity generation from other renewable sources excluding biofuels, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_cons_change_pct,other_renewables__pct_growth,energy_mix,Annual percentage change in energy 
consumption from other renewables,Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewables_cons_change_twh,other_renewables__twh_growth__equivalent,energy_mix,"Annual change in other renewable consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewables_elec_per_capita,per_capita_other_renewables_including_bioenergy_generation__kwh,electricity_mix,"Per capita electricity generation from other renewables including biofuels, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_elec_per_capita_exc_biofuel,per_capita_other_renewables_excluding_bioenergy_generation__kwh,electricity_mix,"Per capita electricity generation from other renewables excluding biofuels, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_energy_per_capita,other_renewables_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from other renewables, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewables_share_elec,other_renewables_including_bioenergy_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from other renewables including biofuels,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_share_elec_exc_biofuel,other_renewables_excluding_bioenergy_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from other renewables excluding biofuels,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_share_energy,other_renewables__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from other renewables,Calculated by Our World in Data based on BP Statistical Review of World Energy -per_capita_electricity,per_capita_total_generation__kwh,electricity_mix,"Electricity generation per capita, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -primary_energy_consumption,primary_energy_consumption__twh,primary_energy_consumption,"Primary energy consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and EIA International Energy Data -renewables_cons_change_pct,renewables__pct_growth,energy_mix,Annual percentage change in renewable energy consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -renewables_cons_change_twh,renewables__twh_growth__equivalent,energy_mix,"Annual change in renewable energy consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -renewables_consumption,renewables__twh__equivalent,energy_mix,"Primary energy consumption from renewables, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -renewables_elec_per_capita,per_capita_renewable_generation__kwh,electricity_mix,"Per capita electricity generation from renewables, measured in kilowatt-hours",Calculated by Our 
World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -renewables_electricity,renewable_generation__twh,electricity_mix,"Electricity generation from renewables, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -renewables_energy_per_capita,renewables_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from renewables, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -renewables_share_elec,renewable_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from renewables,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -renewables_share_energy,renewables__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from renewables,Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_cons_change_pct,solar__pct_growth,energy_mix,Annual percentage change in solar consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_cons_change_twh,solar__twh_growth__equivalent,energy_mix,"Annual change in solar consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_consumption,solar__twh__equivalent,energy_mix,"Primary energy consumption from solar, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_elec_per_capita,per_capita_solar_generation__kwh,electricity_mix,"Per capita electricity generation from solar, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -solar_electricity,solar_generation__twh,electricity_mix,"Electricity generation from solar, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -solar_energy_per_capita,solar_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from solar, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_share_elec,solar_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from solar,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -solar_share_energy,solar__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from solar,Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_cons_change_pct,wind__pct_growth,energy_mix,Annual percentage change in wind consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_cons_change_twh,wind__twh_growth__equivalent,energy_mix,"Annual change in wind consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_consumption,wind__twh__equivalent,energy_mix,"Primary energy consumption from wind, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_elec_per_capita,per_capita_wind_generation__kwh,electricity_mix,"Per capita 
electricity generation from wind, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -wind_electricity,wind_generation__twh,electricity_mix,"Electricity generation from wind, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -wind_energy_per_capita,wind_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from wind, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_share_elec,wind_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from wind,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -wind_share_energy,wind__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from wind,Calculated by Our World in Data based on BP Statistical Review of World Energy diff --git a/etl/steps/archive/garden/energy/2023-02-20/primary_energy_consumption.meta.yml b/etl/steps/archive/garden/energy/2023-02-20/primary_energy_consumption.meta.yml deleted file mode 100644 index b32d357b973..00000000000 --- a/etl/steps/archive/garden/energy/2023-02-20/primary_energy_consumption.meta.yml +++ /dev/null @@ -1,117 +0,0 @@ -dataset: - namespace: energy - version: 2023-02-20 - title: Primary energy consumption (BP & EIA, 2023) - short_name: primary_energy_consumption - description: | - Primary energy consumption data was compiled by Our World in Data based on two key data sources: - 1. [BP Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html). - 2. [International energy data from the U.S. Energy Information Administration (EIA)](https://www.eia.gov/international/data/world/total-energy/more-total-energy-data). - - BP provides the longest and most up-to-date time-series of primary energy. However, it does not provide data for all countries. We have therefore supplemented this dataset with energy data from the EIA. Where BP provides data for a given country, this data is adopted; for countries where this data is missing, we rely on EIA energy figures. - - Per capita figures have been calculated using a population dataset that is built and maintained by Our World in Data, based on [different sources](https://ourworldindata.org/population-sources). - - To calculate energy per unit of GDP, we use total real GDP figures from [the Maddison Project Database, version 2020](https://www.rug.nl/ggdc/historicaldevelopment/maddison/releases/maddison-project-database-2020). - This dataset is based on Bolt, Jutta and Jan Luiten van Zanden (2020), “Maddison style estimates of the evolution of the world economy. A new 2020 update ”. GDP is measured in 2011$ which are PPP-adjusted. - sources: - - name: Our World in Data based on BP Statistical Review of World Energy (2022) - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - description: | - BP's region definitions sometimes differ from Our World in Data's definitions. 
For example, BP's North America includes only Canada, Mexico and United States, whereas Our World in Data's North America includes countries in Central America (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like "North America (BP)" to refer to BP's original data using their definition of the region, as well as "North America", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the contributions from the countries in the region. - - [BP's region definitions](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy/using-the-review/definitions-and-explanatory-notes.html#accordion_Regional%20definitions), denoted with "(BP)", are: - * "Asia Pacific (BP)": Brunei, Cambodia, China (Mainland), China Hong Kong SAR (Special Administrative Region), China Macau SAR (Special Administrative Region), Indonesia, Japan, Laos, Malaysia, Mongolia, North Korea, Philippines, Singapore, South Asia (Afghanistan, Bangladesh, India, Myanmar, Nepal, Pakistan and Sri Lanka), South Korea, Taiwan, Thailand, Vietnam, Australia, New Zealand, Papua New Guinea and Oceania. - * "Australasia (BP)": Australia, New Zealand. - * "CIS (BP)" - Commonwealth of Independent States: Armenia, Azerbaijan, Belarus, Kazakhstan, Kyrgyzstan, Moldova, Russian Federation, Tajikistan, Turkmenistan, Uzbekistan. - * "Caribbean (BP)": Atlantic islands between the US Gulf Coast and South America, including Puerto Rico, US Virgin Islands and Bermuda. - * "Central America (BP)": Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama - * "Eastern Africa (BP)": Territories on the east coast of Africa from Sudan to Mozambique. Also Madagascar, Malawi, Uganda, Zambia, Zimbabwe. - * "Europe (BP)": European members of the OECD plus Albania, Bosnia-Herzegovina, Bulgaria, Croatia, Cyprus, Georgia, Gibraltar, Latvia, Lithuania, Malta, Montenegro, North Macedonia, Romania, Serbia and Ukraine. - * "Middle Africa (BP)": Angola, Cameroon, Central African Republic, Chad, Democratic Republic of Congo, Republic of Congo, Equatorial Guinea, Gabon, Sao Tome & Principe. - * "Middle East (BP)": Arabian Peninsula, Iran, Iraq, Israel, Jordan, Lebanon, Syria. - * "Non-OECD (BP)" - Organization for Economic Co-operation and Development: All countries that are not members of the OECD. - * "North America (BP)": US (excluding US territories), Canada, Mexico - * "Northern Africa (BP)": Territories on the north coast of Africa from Egypt to Western Sahara. - * "OECD (BP)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, UK, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, US. - * "OPEC (BP)" - Organization of the Petroleum Exporting Countries: Iran, Iraq, Kuwait, Saudi Arabia, United Arab Emirates, Algeria, Libya, Angola, Equatorial Guinea, Gabon, Nigeria, Republic of Congo, Venezuela. - * "South and Central America (BP)": Caribbean (including Puerto Rico and US Virgin Islands), Bermuda, Central and South America. - * "Southern Africa (BP)": Botswana, Lesotho, Namibia, South Africa, Swaziland. 
- * "Western Africa (BP)": Territories on the west coast of Africa from Mauritania to Nigeria, including Burkina Faso, Cape Verde, Mali and Niger. - - Additionally, BP includes some regions that are not explicitly defined (e.g. "Other Europe", or "Other CIS"). We define our regions in the following way: - * "Africa" - All African countries + "Other Africa". - * "Asia" - All Asian countries + "Other Middle East" + "Other CIS" + "Other Asia Pacific". - * "Europe" - All European countries + "Other Europe". - * "North America" - All North American countries + "Other Caribbean" + "Other North America". - * "Oceania" - All Oceanian countries. - * "South America" - All South American countries + "Other South America". - Where the individual countries in each region are defined [in this map](https://ourworldindata.org/world-region-map-definitions). Additional BP regions are ignored, since they belong to other regions already included (e.g. the data for "Other Western Africa" is included in "Other Africa"). Finally, income groups are constructed following the definitions [in this map](https://ourworldindata.org/grapher/world-banks-income-groups). - - name: Our World in Data based on EIA International energy data (2022) - published_by: U.S. Energy Information Administration (EIA) - date_accessed: 2022-07-27 - url: https://www.eia.gov/opendata/bulkfiles.php - description: | - Total energy consumption, extracted from EIA's international energy data from the EIA, downloaded using their [Bulk Download Facility](https://www.eia.gov/opendata/bulkfiles.php). - - EIA's region definitions sometimes differ from Our World in Data's definitions. For example, in EIA's data, Russia is not included in Europe, whereas Our World in Data includes Russia in Europe (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like "Europe (EIA)" to refer to EIA's original data using their definition of the region, as well as "Europe", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the contributions from the countries in the region. - - name: Maddison Project Database 2020 (Bolt and van Zanden, 2020) - published_by: Bolt, Jutta and Jan Luiten van Zanden (2020), “Maddison style estimates of the evolution of the world economy. A new 2020 update“. - date_accessed: 2022-04-12 - url: https://www.rug.nl/ggdc/historicaldevelopment/maddison/releases/maddison-project-database-2020 -tables: - primary_energy_consumption: - variables: - annual_change_in_primary_energy_consumption__pct: - title: Annual change in primary energy consumption (%) - short_unit: '%' - unit: '%' - display: - name: Annual change in primary energy consumption - annual_change_in_primary_energy_consumption__twh: - title: Annual change in primary energy consumption (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Annual change in primary energy consumption - gdp: - title: GDP - short_unit: $ - unit: 2011 int-$ - description: >- - Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over - time (inflation) and price differences between countries. Calculated by multiplying GDP per capita with population. 
- display: - numDecimalPlaces: 0 - population: - title: Population - unit: people - primary_energy_consumption__twh: - title: Primary energy consumption (TWh) - short_unit: TWh - unit: terawatt-hours - description: Primary energy consumption, measured in terawatt-hours per year. - display: - name: Primary energy consumption - numDecimalPlaces: 0 - primary_energy_consumption_per_gdp__kwh_per_dollar: - title: Primary energy consumption per GDP (kWh/$) - short_unit: kWh - unit: kilowatt-hours per $ - description: Primary energy consumption per unit of gross domestic product, measured in kilowatt-hours per international-$. - display: - name: Energy consumption per dollar - primary_energy_consumption_per_capita__kwh: - title: Primary energy consumption per capita (kWh/person) - short_unit: kWh - unit: kilowatt-hours per capita - description: Primary energy consumption per capita, measured in kilowatt-hours per person per year. - display: - name: Per capita energy consumption - numDecimalPlaces: 0 - source: - title: Source of data - short_unit: source - unit: source diff --git a/etl/steps/archive/garden/energy/2023-02-20/primary_energy_consumption.py b/etl/steps/archive/garden/energy/2023-02-20/primary_energy_consumption.py deleted file mode 100644 index 0eba6c00f47..00000000000 --- a/etl/steps/archive/garden/energy/2023-02-20/primary_energy_consumption.py +++ /dev/null @@ -1,330 +0,0 @@ -"""Garden step for Primary energy consumption dataset (part of the OWID Energy dataset), based on a combination of BP's -Statistical Review dataset and EIA data on energy consumption. - -""" - -from typing import cast - -import numpy as np -import pandas as pd -from owid.catalog import Dataset, Table -from shared import add_population - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 - -# Countries whose data have to be removed since they were identified as outliers. -OUTLIERS = ["Gibraltar"] - - -def prepare_bp_data(tb_bp: Table) -> Table: - """Prepare BP data. - - Parameters - ---------- - tb_bp : Table - BP data. - - Returns - ------- - tb_bp : Table - BP data as a table with metadata. - - """ - tb_bp = tb_bp.reset_index() - - bp_columns = { - "country": "country", - "year": "year", - "primary_energy_consumption__twh": "Primary energy consumption (TWh)", - } - tb_bp = tb_bp[list(bp_columns)].rename(columns=bp_columns) - - # Drop rows with missing values. - tb_bp = tb_bp.dropna(how="any").reset_index(drop=True) - - return cast(Table, tb_bp) - - -def prepare_eia_data(tb_eia: Table) -> Table: - """Prepare EIA data. - - Parameters - ---------- - tb_eia : Table - EIA data. - - Returns - ------- - eia_table : Table - EIA data as a table with metadata. - - """ - tb_eia = tb_eia.reset_index() - - eia_columns = { - "country": "country", - "year": "year", - "energy_consumption": "Primary energy consumption (TWh)", - } - tb_eia = tb_eia[list(eia_columns)].rename(columns=eia_columns) - - # Drop rows with missing values. - tb_eia = tb_eia.dropna(how="any").reset_index(drop=True) - - return cast(Table, tb_eia) - - -def prepare_ggdc_data(tb_ggdc: Table) -> Table: - """Prepare GGDC data. - - Parameters - ---------- - tb_ggdc : Table - GGDC data. - - Returns - ------- - ggdc_table : Table - GGDC data as a table with metadata. 
- - """ - tb_ggdc = tb_ggdc.reset_index() - - ggdc_columns = { - "country": "country", - "year": "year", - "gdp": "GDP", - } - tb_ggdc = tb_ggdc[list(ggdc_columns)].rename(columns=ggdc_columns) - - # Drop rows with missing values. - tb_ggdc = tb_ggdc.dropna(how="any").reset_index(drop=True) - - return cast(Table, tb_ggdc) - - -def combine_bp_and_eia_data(tb_bp: Table, tb_eia: Table) -> pd.DataFrame: - """Combine BP and EIA data. - - Parameters - ---------- - tb_bp : Table - Table from BP Statistical Review dataset. - tb_eia : Table - Table from EIA energy consumption dataset. - - Returns - ------- - combined : pd.DataFrame - Combined data. - - """ - # Check that there are no duplicated rows in any of the two datasets. - assert tb_bp[tb_bp.duplicated(subset=["country", "year"])].empty, "Duplicated rows in BP data." - assert tb_eia[tb_eia.duplicated(subset=["country", "year"])].empty, "Duplicated rows in EIA data." - - tb_bp["source"] = "bp" - tb_eia["source"] = "eia" - # Combine EIA data (which goes further back in the past) with BP data (which is more up-to-date). - # On coincident rows, prioritise BP data. - index_columns = ["country", "year"] - combined = cast(pd.DataFrame, pd.concat([tb_eia, tb_bp], ignore_index=True)).drop_duplicates( - subset=index_columns, keep="last" - ) - - # Convert to conventional dataframe, and sort conveniently. - combined = pd.DataFrame(combined).sort_values(index_columns).reset_index(drop=True) - - return cast(pd.DataFrame, combined) - - -def add_annual_change(df: pd.DataFrame) -> pd.DataFrame: - """Add annual change variables to combined BP & EIA dataset. - - Parameters - ---------- - df : pd.DataFrame - Combined BP & EIA dataset. - - Returns - ------- - combined : pd.DataFrame - Combined BP & EIA dataset after adding annual change variables. - - """ - combined = df.copy() - - # Calculate annual change. - combined = combined.sort_values(["country", "year"]).reset_index(drop=True) - combined["Annual change in primary energy consumption (%)"] = ( - combined.groupby("country")["Primary energy consumption (TWh)"].pct_change() * 100 - ) - combined["Annual change in primary energy consumption (TWh)"] = combined.groupby("country")[ - "Primary energy consumption (TWh)" - ].diff() - - return combined - - -def add_per_capita_variables(df: pd.DataFrame, population: pd.DataFrame) -> pd.DataFrame: - """Add a population column and add per-capita variables. - - Parameters - ---------- - df : pd.DataFrame - Data. - population : pd.DataFrame - Population data. - - Returns - ------- - df : pd.DataFrame - Data after adding population and per-capita variables. - - """ - df = df.copy() - - # Add population to data. - df = add_population( - df=df, - population=population, - country_col="country", - year_col="year", - population_col="Population", - warn_on_missing_countries=False, - ) - - # Calculate consumption per capita. - df["Primary energy consumption per capita (kWh)"] = ( - df["Primary energy consumption (TWh)"] / df["Population"] * TWH_TO_KWH - ) - - return df - - -def add_per_gdp_variables(df: pd.DataFrame, ggdc_table: Table) -> pd.DataFrame: - """Add a GDP column and add per-gdp variables. - - Parameters - ---------- - df : pd.DataFrame - Data. - ggdc_table : Table - GDP data from the GGDC Maddison dataset. - - Returns - ------- - df : pd.DataFrame - Data after adding GDP and per-gdp variables. - - """ - df = df.copy() - - # Add population to data. - df = pd.merge(df, ggdc_table, on=["country", "year"], how="left") - - # Calculate consumption per GDP. 
- df["Primary energy consumption per GDP (kWh per $)"] = ( - df["Primary energy consumption (TWh)"] / df["GDP"] * TWH_TO_KWH - ) - - return df - - -def remove_outliers(df: pd.DataFrame) -> pd.DataFrame: - """Remove infinity values and data that has been identified as spurious outliers. - - Parameters - ---------- - df : pd.DataFrame - Data. - - Returns - ------- - df : pd.DataFrame - Data after removing spurious data. - - """ - df = df.copy() - - # Remove spurious values. - df = df.replace(np.inf, np.nan) - - # Remove indexes of outliers from data. - df = df[~df["country"].isin(OUTLIERS)].reset_index(drop=True) - - return df - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load BP statistical review dataset. - ds_bp: Dataset = paths.load_dependency("statistical_review") - # Read main table from dataset. - tb_bp = ds_bp["statistical_review"] - - # Load EIA dataset on energy consumption. - ds_eia: Dataset = paths.load_dependency("energy_consumption") - # Read main table from dataset. - tb_eia = ds_eia["energy_consumption"] - - # Load GGDC Maddison data on GDP. - ds_ggdc: Dataset = paths.load_dependency("ggdc_maddison") - # Read main table from dataset. - tb_ggdc = ds_ggdc["maddison_gdp"] - - # Load population dataset from garden. - ds_population: Dataset = paths.load_dependency("population") - # Get table from dataset. - tb_population = ds_population["population"] - # Make a dataframe out of the data in the table, with the required columns. - df_population = pd.DataFrame(tb_population) - - # - # Process data. - # - # Prepare BP data. - tb_bp = prepare_bp_data(tb_bp=tb_bp) - - # Prepare EIA data. - tb_eia = prepare_eia_data(tb_eia=tb_eia) - - # Prepare GGDC data. - tb_ggdc = prepare_ggdc_data(tb_ggdc=tb_ggdc) - - # Combine BP and EIA data. - df = combine_bp_and_eia_data(tb_bp=tb_bp, tb_eia=tb_eia) - - # Add annual change. - df = add_annual_change(df=df) - - # Add per-capita variables. - df = add_per_capita_variables(df=df, population=df_population) - - # Add per-GDP variables. - df = add_per_gdp_variables(df=df, ggdc_table=tb_ggdc) - - # Remove outliers. - df = remove_outliers(df=df) - - # Create an appropriate index and sort conveniently. - df = df.set_index(["country", "year"], verify_integrity=True).sort_index() - - # Create new table. - table = Table(df, short_name="primary_energy_consumption") - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = create_dataset(dest_dir, tables=[table]) - ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2023-02-20/shared.py b/etl/steps/archive/garden/energy/2023-02-20/shared.py deleted file mode 100644 index b0e82b979d1..00000000000 --- a/etl/steps/archive/garden/energy/2023-02-20/shared.py +++ /dev/null @@ -1,134 +0,0 @@ -from typing import List - -import pandas as pd -from owid import catalog - -from etl.data_helpers import geo - - -def gather_sources_from_tables( - tables: List[catalog.Table], -) -> List[catalog.meta.Source]: - """Gather unique sources from the metadata.dataset of each table in a list of tables. - - Note: To check if a source is already listed, only the name of the source is considered (not the description or any - other field in the source). - - Parameters - ---------- - tables : list - List of tables with metadata. - - Returns - ------- - known_sources : list - List of unique sources from all tables. - - """ - # Initialise list that will gather all unique metadata sources from the tables. 
- known_sources: List[catalog.meta.Source] = [] - for table in tables: - # Get list of sources of the dataset of current table. - table_sources = table.metadata.dataset.sources - # Go source by source of current table, and check if its name is not already in the list of known_sources. - for source in table_sources: - # Check if this source's name is different to all known_sources. - if all([source.name != known_source.name for known_source in known_sources]): - # Add the new source to the list. - known_sources.append(source) - - return known_sources - - -def add_population( - df: pd.DataFrame, - population: pd.DataFrame, - country_col: str = "country", - year_col: str = "year", - population_col: str = "population", - interpolate_missing_population: bool = False, - warn_on_missing_countries: bool = True, - show_full_warning: bool = True, - expected_countries_without_population: List[str] = [], -) -> pd.DataFrame: - """Add a column of OWID population to the countries in the data, including population of historical regions. - - Parameters - ---------- - df : pd.DataFrame - Data without a column for population (after harmonizing elements, items and country names). - population : pd.DataFrame - Population data. - country_col : str - Name of country column in data. - year_col : str - Name of year column in data. - population_col : str - Name for new population column in data. - interpolate_missing_population : bool - True to linearly interpolate population on years that are presented in df, but for which we do not have - population data; otherwise False to keep missing population data as nans. - For example, if interpolate_missing_population is True and df has data for all years between 1900 and 1910, - but population is only given for 1900 and 1910, population will be linearly interpolated between those years. - warn_on_missing_countries : bool - True to warn if population is not found for any of the countries in the data. - show_full_warning : bool - True to show affected countries if the previous warning is raised. - expected_countries_without_population : list - Countries that are expected to not have population (that should be ignored if warnings are activated). - - Returns - ------- - df_with_population : pd.DataFrame - Data after adding a column for population for all countries in the data. - - """ - - # Prepare population dataset. - population = population.reset_index().rename( - columns={ - "country": country_col, - "year": year_col, - "population": population_col, - } - )[[country_col, year_col, population_col]] - - # Check if there is any missing country. - missing_countries = set(df[country_col]) - set(population[country_col]) - if len(missing_countries) > 0: - if warn_on_missing_countries: - geo.warn_on_list_of_entities( - list_of_entities=missing_countries, - warning_message=( - f"{len(missing_countries)} countries not found in population" - " dataset. They will remain in the dataset, but have nan" - " population." - ), - show_list=show_full_warning, - ) - - if interpolate_missing_population: - # For some countries we have population data only on certain years, e.g. 1900, 1910, etc. - # Optionally fill missing years linearly.
- countries_in_data = df[country_col].unique() - years_in_data = df[year_col].unique() - - population = population.set_index([country_col, year_col]).reindex( - pd.MultiIndex.from_product([countries_in_data, years_in_data], names=[country_col, year_col]) - ) - - population = population.groupby(country_col).transform( - lambda x: x.interpolate(method="linear", limit_direction="both") - ) - - error = "List of countries without population data differs from the list of expected countries without population data." - assert set(population[population[population_col].isnull()].reset_index()[country_col]) == set( - expected_countries_without_population - ), error - - # Add population to original dataframe. - df_with_population = pd.merge(df, population, on=[country_col, year_col], how="left") - - return df_with_population diff --git a/etl/steps/archive/garden/energy/2023-02-20/uk_historical_electricity.meta.yml b/etl/steps/archive/garden/energy/2023-02-20/uk_historical_electricity.meta.yml deleted file mode 100644 index ce5838830bc..00000000000 --- a/etl/steps/archive/garden/energy/2023-02-20/uk_historical_electricity.meta.yml +++ /dev/null @@ -1,91 +0,0 @@ -dataset: - namespace: energy - version: 2023-02-20 - title: UK historical electricity (DUKES, 2023) - short_name: uk_historical_electricity - description: | - All data prior to 1985 (and prior to 1965 in the case of renewables) is sourced from [the Digest of UK Energy Statistics (DUKES), published by the UK's Department for Business, Energy & Industrial Strategy](https://www.gov.uk/government/statistics/electricity-chapter-5-digest-of-united-kingdom-energy-statistics-dukes). - - All other data is sourced from [BP's Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html) and [Ember's Yearly Electricity Data](https://ember-climate.org/data-catalogue/yearly-electricity-data/). Where data from BP is available for a given year, we rely on it as the primary source. We then supplement this with data from Ember where data from BP is not available.
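To make the precedence rule just described concrete, here is a minimal, hypothetical pandas sketch (illustrative only, not code from this diff): it assumes two frames indexed by (country, year) with identical columns, and lets the preferred source win wherever both report a value.

import pandas as pd

def combine_preferring(primary: pd.DataFrame, secondary: pd.DataFrame) -> pd.DataFrame:
    # Keep the preferred source's value wherever it is not missing; fill the gaps
    # (cells, years or countries absent from `primary`) from `secondary`.
    return primary.combine_first(secondary).sort_index()

# Hypothetical usage: BP wins over Ember on overlapping (country, year) cells.
# combined = combine_preferring(df_bp, df_ember)

The deleted step itself relies on owid.datautils.dataframes.combine_two_overlapping_dataframes (see uk_historical_electricity.py below), which applies the same "first source wins" rule.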
- sources: - - name: Digest of UK Energy Statistics - published_by: UK's Department for Business, Energy & Industrial Strategy - date_accessed: 2022-09-21 - url: https://www.gov.uk/government/statistical-data-sets/historical-electricity-data - - name: BP Statistical Review of World Energy - published_by: BP Statistical Review of World Energy - date_accessed: 2022-07-08 - url: https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html - - name: Ember's Yearly Electricity Data - published_by: Ember - publication_year: 2023 - date_accessed: 2023-02-20 - url: https://ember-climate.org/data-catalogue/yearly-electricity-data/ - - name: Ember's European Electricity Review - published_by: Ember - publication_year: 2022 - date_accessed: 2022-08-01 - url: https://ember-climate.org/insights/research/european-electricity-review-2022/ -tables: - uk_historical_electricity: - variables: - coal_generation: - title: Electricity generation from coal - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - oil_generation: - title: Electricity generation from oil - short_unit: TWh - unit: terawatt-hours - display: - name: Oil - gas_generation: - title: Electricity generation from gas - short_unit: TWh - unit: terawatt-hours - display: - name: Natural gas - nuclear_generation: - title: Electricity generation from nuclear - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - hydro_generation: - title: Electricity generation from hydropower - short_unit: TWh - unit: terawatt-hours - display: - name: Hydropower - solar_generation: - title: Electricity generation from solar - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - wind_generation: - title: Electricity generation from wind - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - other_renewables_generation: - title: Electricity generation from other renewables - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables - total_generation: - title: Total electricity generation - short_unit: TWh - unit: terawatt-hours - display: - name: Total electricity generation - net_imports: - title: Net electricity imports - short_unit: TWh - unit: terawatt-hours - display: - name: Net electricity imports diff --git a/etl/steps/archive/garden/energy/2023-02-20/uk_historical_electricity.py b/etl/steps/archive/garden/energy/2023-02-20/uk_historical_electricity.py deleted file mode 100644 index 571e0a9f0fe..00000000000 --- a/etl/steps/archive/garden/energy/2023-02-20/uk_historical_electricity.py +++ /dev/null @@ -1,199 +0,0 @@ -"""Combine UK BEIS' historical electricity with our electricity mix dataset (by BP & Ember) to obtain a long-run -electricity mix in the UK. - -""" - -from typing import cast - -import numpy as np -import pandas as pd -from owid.catalog import Dataset, Table -from owid.datautils import dataframes - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def prepare_electricity_mix_data(df_elec: pd.DataFrame) -> pd.DataFrame: - """Select necessary columns from the electricity mix, and select rows corresponding to the UK. - - Parameters - ---------- - df_elec : pd.DataFrame - Data from the main table of the electricity mix dataset. - - Returns - ------- - df_elec : pd.DataFrame - Selected columns and rows from the electricity mix data. - - """ - df_elec = df_elec.copy() - - # Select columns and rename them conveniently. 
- elec_columns = { - "country": "country", - "year": "year", - "coal_generation__twh": "coal_generation", - "gas_generation__twh": "gas_generation", - "oil_generation__twh": "oil_generation", - "hydro_generation__twh": "hydro_generation", - "nuclear_generation__twh": "nuclear_generation", - "other_renewables_including_bioenergy_generation__twh": "other_renewables_generation", - "solar_generation__twh": "solar_generation", - "total_generation__twh": "total_generation", - "wind_generation__twh": "wind_generation", - "total_net_imports__twh": "net_imports", - } - - # Select necessary columns from electricity mix dataset. - df_elec = df_elec[list(elec_columns)].rename(columns=elec_columns) - - # Select UK data from Ember dataset. - df_elec = df_elec[df_elec["country"] == "United Kingdom"].reset_index(drop=True) - - return df_elec - - -def prepare_beis_data(df_beis: pd.DataFrame) -> pd.DataFrame: - """Select (and rename) columns from the UK historical electricity data from BEIS. - - Parameters - ---------- - df_beis : pd.DataFrame - Combined data for UK historical electricity data from BEIS. - - Returns - ------- - df_beis : pd.DataFrame - Selected columns from the UK historical electricity data. - - """ - df_beis = df_beis.copy() - - # Select columns and rename them conveniently. - beis_columns = { - "country": "country", - "year": "year", - "coal": "coal_generation", - "oil": "oil_generation", - "electricity_generation": "total_generation", - "gas": "gas_generation", - "hydro": "hydro_generation", - "nuclear": "nuclear_generation", - "net_imports": "net_imports", - "implied_efficiency": "implied_efficiency", - "wind_and_solar": "wind_and_solar_generation", - } - df_beis = df_beis[list(beis_columns)].rename(columns=beis_columns) - - return df_beis - - -def combine_beis_and_electricity_mix_data(df_beis: pd.DataFrame, df_elec: pd.DataFrame) -> pd.DataFrame: - """Combine BEIS data on UK historical electricity with the electricity mix data (after having selected rows for only - the UK). - - There are different processing steps done to the data, see comments below in the code. - - Parameters - ---------- - df_beis : pd.DataFrame - Selected data from BEIS on UK historical electricity. - df_elec : pd.DataFrame - Selected data from the electricity mix (after having selected rows for the UK). - - Returns - ------- - df_combined : pd.DataFrame - Combined and processed data with a verified index. - - """ - # In the BEIS dataset, wind and solar are given as one joined variable. - # Check if we can ignore it (since it's better to have the two sources separately). - # Find the earliest year informed in the electricity mix for solar or wind generation. - solar_or_wind_first_year = df_elec[df_elec["wind_generation"].notnull() | df_elec["solar_generation"].notnull()][ - "year" - ].min() - # Now check that, prior to that year, all generation from solar and wind was zero. - assert df_beis[df_beis["year"] < solar_or_wind_first_year]["wind_and_solar_generation"].fillna(0).max() == 0 - # Therefore, since wind and solar is always zero (prior to the beginning of the electricity mix data) - # we can ignore this column from the BEIS dataset. - df_beis = df_beis.drop(columns=["wind_and_solar_generation"]) - # And create two columns of zeros for wind and solar. 
- df_beis["solar_generation"] = 0 - df_beis["wind_generation"] = 0 - # Similarly, given that in the BEIS dataset there is no data about other renewable sources (apart from hydro, solar - # and wind), we can assume that the contribution from other renewables is zero. - df_beis["other_renewables_generation"] = 0 - # And ensure these new columns do not have any values after the electricity mix data begins. - df_beis.loc[ - df_beis["year"] >= solar_or_wind_first_year, - ["solar_generation", "wind_generation", "other_renewables_generation"], - ] = np.nan - - # BEIS data on fuel input gives raw energy, but we want electricity generation (which is less, given the - # inefficiencies of the process of burning fossil fuels). - # They also include a variable on "implied efficiency", which they obtain by dividing the input energy by the total - # electricity generation. - # We multiply the raw energy by the efficiency to have an estimate of the electricity generated by each fossil fuel. - # This only affects data prior to the beginning of the electricity mix's data (which is 1965 for renewables and - # nuclear, and 1985 for the rest). - for source in ["coal", "oil", "gas"]: - df_beis[f"{source}_generation"] *= df_beis["implied_efficiency"] - - # Drop other unnecessary columns. - df_beis = df_beis.drop(columns=["implied_efficiency"]) - - # Combine BEIS and electricity mix data. - df_combined = dataframes.combine_two_overlapping_dataframes( - df1=df_elec, df2=df_beis, index_columns=["country", "year"] - ) - - # Add an index and sort conveniently. - df_combined = df_combined.set_index(["country", "year"]).sort_index().sort_index(axis=1) - - return cast(pd.DataFrame, df_combined) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load BEIS dataset. - ds_beis: Dataset = paths.load_dependency("uk_historical_electricity") - # Read main table from dataset. - tb_beis = ds_beis["uk_historical_electricity"] - # Create a convenient dataframe out of the table. - df_beis = pd.DataFrame(tb_beis).reset_index() - - # Load electricity mix dataset. - ds_elec: Dataset = paths.load_dependency("electricity_mix") - # Read main table from dataset. - tb_elec = ds_elec["electricity_mix"] - # Create a convenient dataframe out of the table. - df_elec = pd.DataFrame(tb_elec).reset_index() - - # - # Process data. - # - # Prepare electricity mix data. - df_elec = prepare_electricity_mix_data(df_elec=df_elec) - - # Prepare BEIS data. - df_beis = prepare_beis_data(df_beis=df_beis) - - # Combine BEIS and electricity mix data. - df_combined = combine_beis_and_electricity_mix_data(df_beis=df_beis, df_elec=df_elec) - - # Create a new table with combined data (and no metadata). - tb_combined = Table(df_combined, short_name="uk_historical_electricity") - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = create_dataset(dest_dir, tables=[tb_combined]) - ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2023-06-01/electricity_mix.meta.yml b/etl/steps/archive/garden/energy/2023-06-01/electricity_mix.meta.yml deleted file mode 100644 index 927e7973624..00000000000 --- a/etl/steps/archive/garden/energy/2023-06-01/electricity_mix.meta.yml +++ /dev/null @@ -1,391 +0,0 @@ -dataset: - title: Electricity mix (BP & Ember, 2023b) - description: | - Data is compiled by Our World in Data based on three main sources: - - [BP Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html). 
- - [Ember Yearly Electricity Data (2023)](https://ember-climate.org/data-catalogue/yearly-electricity-data/). - - [Ember European Electricity Review (2022)](https://ember-climate.org/insights/research/european-electricity-review-2022/). - - Ember compile their global dataset from various sources including: - - Eurostat: Annual European generation and import data, and monthly data in some cases where better sources are not available. - - ENTSO-E: Monthly European generation and import data. - - EIA: Annual global generation and import data. - - UN: Monthly global generation data in some cases. - - GEM: Annual global coal and gas capacity data. - - IRENA: Annual global capacity data for all non-fossil fuel types, and for Other Fossil where available. - - WRI: Annual global capacity data for Other Fossil where other sources are not available. - - European carbon intensities rely on data from the European Environment Agency (EEA). - - A complete list of data sources for each individual country in Ember's Yearly Electricity Data can be found [here](https://ember-climate.org/app/uploads/2022/07/Ember-Electricity-Data-Methodology.pdf). - - A complete list of data sources for each individual country in Ember's European Electricity Review can be found [here](https://ember-climate.org/app/uploads/2022/02/EER-Methodology.pdf). - - We rely on Ember as the primary source of electricity consumption data. While BP provides primary energy (not just electricity) consumption data and it provides a longer time-series (dating back to 1965) than Ember (which only dates back to 1990), BP does not provide data for all countries or for all sources of electricity (for example, only Ember provides data on electricity from bioenergy). So, where data from Ember is available for a given country and year, we rely on it as the primary source. We then supplement this with data from BP where data from Ember is not available. - - Our World in Data has converted absolute electricity production by source to the share in the mix by dividing each by total electricity production. - - BP's region definitions sometimes differ from Our World in Data's definitions. For example, BP's North America includes only Canada, Mexico and United States, whereas Our World in Data's North America includes countries in Central America (see a map with [our region definitions](https://ourworldindata.org/world-region-map-definitions)). For this reason, we include in the dataset regions like "North America (BP)" to refer to BP's original data using their definition of the region, as well as "North America", which is data aggregated by Our World in Data using our definition. These aggregates are constructed by adding up (when possible) the contributions from the countries in the region. - - [BP's region definitions](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy/using-the-review/definitions-and-explanatory-notes.html#accordion_Regional%20definitions), denoted with "(BP)", are: - * "Asia Pacific (BP)": Brunei, Cambodia, China (Mainland), China Hong Kong SAR (Special Administrative Region), China Macau SAR (Special Administrative Region), Indonesia, Japan, Laos, Malaysia, Mongolia, North Korea, Philippines, Singapore, South Asia (Afghanistan, Bangladesh, India, Myanmar, Nepal, Pakistan and Sri Lanka), South Korea, Taiwan, Thailand, Vietnam, Australia, New Zealand, Papua New Guinea and Oceania. - * "Australasia (BP)": Australia, New Zealand. 
- * "CIS (BP)" - Commonwealth of Independent States: Armenia, Azerbaijan, Belarus, Kazakhstan, Kyrgyzstan, Moldova, Russian Federation, Tajikistan, Turkmenistan, Uzbekistan. - * "Caribbean (BP)": Atlantic islands between the US Gulf Coast and South America, including Puerto Rico, US Virgin Islands and Bermuda. - * "Central America (BP)": Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama - * "Eastern Africa (BP)": Territories on the east coast of Africa from Sudan to Mozambique. Also Madagascar, Malawi, Uganda, Zambia, Zimbabwe. - * "Europe (BP)": European members of the OECD plus Albania, Bosnia-Herzegovina, Bulgaria, Croatia, Cyprus, Georgia, Gibraltar, Latvia, Lithuania, Malta, Montenegro, North Macedonia, Romania, Serbia and Ukraine. - * "Middle Africa (BP)": Angola, Cameroon, Central African Republic, Chad, Democratic Republic of Congo, Republic of Congo, Equatorial Guinea, Gabon, Sao Tome & Principe. - * "Middle East (BP)": Arabian Peninsula, Iran, Iraq, Israel, Jordan, Lebanon, Syria. - * "Non-OECD (BP)" - Organization for Economic Co-operation and Development: All countries that are not members of the OECD. - * "North America (BP)": US (excluding US territories), Canada, Mexico - * "Northern Africa (BP)": Territories on the north coast of Africa from Egypt to Western Sahara. - * "OECD (BP)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, UK, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, US. - * "OPEC (BP)" - Organization of the Petroleum Exporting Countries: Iran, Iraq, Kuwait, Saudi Arabia, United Arab Emirates, Algeria, Libya, Angola, Equatorial Guinea, Gabon, Nigeria, Republic of Congo, Venezuela. - * "South and Central America (BP)": Caribbean (including Puerto Rico and US Virgin Islands), Bermuda, Central and South America. - * "Southern Africa (BP)": Botswana, Lesotho, Namibia, South Africa, Swaziland. - * "Western Africa (BP)": Territories on the west coast of Africa from Mauritania to Nigeria, including Burkina Faso, Cape Verde, Mali and Niger. - - Additionally, BP includes some regions that are not explicitly defined (e.g. "Other Europe", or "Other CIS"). We define our regions in the following way: - * "Africa" - All African countries + "Other Africa". - * "Asia" - All Asian countries + "Other Middle East" + "Other CIS" + "Other Asia Pacific". - * "Europe" - All European countries + "Other Europe". - * "North America" - All North American countries + "Other Caribbean" + "Other North America". - * "Oceania" - All Oceanian countries. - * "South America" - All South American countries + "Other South America". - Where the individual countries in each region are defined [in this map](https://ourworldindata.org/world-region-map-definitions). Additional BP regions are ignored, since they belong to other regions already included (e.g. the data for "Other Western Africa" is included in "Other Africa"). Finally, income groups are constructed following the definitions [in this map](https://ourworldindata.org/grapher/world-banks-income-groups). 
- - [Ember's region definitions](https://ember-climate.org/countries-and-regions/), denoted with "(Ember)", are: - * "G20 (Ember)" - Group of Twenty: Argentina, Australia, Brazil, Canada, China, France, Germany, India, Indonesia, Italy, Japan, Mexico, Russia, Saudi Arabia, South Africa, South Korea, Turkey, United Kingdom, United States and the 27 members of the European Union. - * "G7 (Ember)" - Group of Seven: Canada, France, Germany, Italy, Japan, United Kingdom and United States. - * "Latin America and Caribbean (Ember)": Antigua and Barbuda, Argentina, Bahamas, Barbados, Belize, Bolivia, Brazil, Chile, Colombia, Costa Rica, Cuba, Dominica, Dominican Republic, Ecuador, El Salvador, Grenada, Guatemala, Guyana, Haiti, Honduras, Jamaica, Mexico, Nicaragua, Panama, Paraguay, Peru, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Suriname, Trinidad and Tobago, Uruguay, Venezuela, Aruba, British Virgin Islands, Cayman Islands, Falkland Islands, French Guiana, Guadeloupe, Martinique, Montserrat, Puerto Rico, Turks and Caicos Islands and United States Virgin Islands. - * "Middle East (Ember)": Bahrain, Iran, Iraq, Israel, Jordan, Kuwait, Lebanon, Oman, Palestine, Qatar, Saudi Arabia, Syria, United Arab Emirates and Yemen. - * "OECD (Ember)" - Organization For Economic Co-operation and Development: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Iceland, Ireland, Italy, Latvia, Lithuania, Luxembourg, Netherlands, Norway, Poland, Portugal, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, United Kingdom, Australia, Canada, Chile, Colombia, Israel, Japan, Mexico, New Zealand, South Korea, and United States. -tables: - electricity_mix: - variables: - bioenergy_generation__twh: - title: Electricity from bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Bioenergy - bioenergy_share_of_electricity__pct: - title: Bioenergy (% electricity) - short_unit: '%' - unit: '%' - display: - name: Bioenergy - co2_intensity__gco2_kwh: - title: Carbon intensity of electricity (gCO2/kWh) - short_unit: "gCO₂" - unit: "grams of CO₂ equivalent per kilowatt-hour" - display: - name: Carbon intensity of electricity per kilowatt-hour - coal_generation__twh: - title: Electricity from coal (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - coal_share_of_electricity__pct: - title: Coal (% electricity) - short_unit: '%' - unit: '%' - display: - name: Coal - fossil_generation__twh: - title: Electricity from fossil fuels (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Fossil fuels - fossil_share_of_electricity__pct: - title: Fossil fuels (% electricity) - short_unit: '%' - unit: '%' - display: - name: Fossil fuels - gas_generation__twh: - title: Electricity from gas (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas - gas_share_of_electricity__pct: - title: Gas (% electricity) - short_unit: '%' - unit: '%' - display: - name: Gas - hydro_generation__twh: - title: Electricity from hydro (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydropower - hydro_share_of_electricity__pct: - title: Hydro (% electricity) - short_unit: '%' - unit: '%' - display: - name: Hydropower - low_carbon_generation__twh: - title: Low-carbon electricity (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Low-carbon electricity - low_carbon_share_of_electricity__pct: - title: Low-carbon electricity (% electricity) - short_unit: '%' - unit: '%' - display: - name: Share 
of electricity from low-carbon sources - net_imports_share_of_demand__pct: - title: Net electricity imports as a share of demand (%) - short_unit: '%' - unit: '%' - display: - name: Net electricity imports as a share of demand - nuclear_generation__twh: - title: Electricity from nuclear (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - nuclear_share_of_electricity__pct: - title: Nuclear (% electricity) - short_unit: '%' - unit: '%' - display: - name: Nuclear - oil_generation__twh: - title: Electricity from oil (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Oil - oil_share_of_electricity__pct: - title: Oil (% electricity) - short_unit: '%' - unit: '%' - display: - name: Oil - other_renewables_excluding_bioenergy_generation__twh: - title: Other renewables excluding bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables, excluding bioenergy - other_renewables_excluding_bioenergy_share_of_electricity__pct: - title: Other renewables excluding bioenergy (% electricity) - short_unit: '%' - unit: '%' - display: - name: Other renewables, excluding bioenergy - other_renewables_including_bioenergy_generation__twh: - title: Other renewables including bioenergy (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables, including bioenergy - other_renewables_including_bioenergy_share_of_electricity__pct: - title: Other renewables including bioenergy (% electricity) - short_unit: '%' - unit: '%' - display: - name: Other renewables, including bioenergy - per_capita_bioenergy_generation__kwh: - title: Bioenergy electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Bioenergy electricity per capita - numDecimalPlaces: 0 - per_capita_coal_generation__kwh: - title: Coal electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Coal electricity per capita - numDecimalPlaces: 0 - per_capita_fossil_generation__kwh: - title: Fossil fuel electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Fossil fuel electricity per capita - numDecimalPlaces: 0 - per_capita_gas_generation__kwh: - title: Gas electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Gas electricity per capita - numDecimalPlaces: 0 - per_capita_hydro_generation__kwh: - title: Hydro electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Hydro electricity per capita - numDecimalPlaces: 0 - per_capita_low_carbon_generation__kwh: - title: Low-carbon electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Low-carbon electricity per capita - numDecimalPlaces: 0 - per_capita_nuclear_generation__kwh: - title: Nuclear electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Nuclear electricity per capita - numDecimalPlaces: 0 - per_capita_oil_generation__kwh: - title: Oil electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Oil electricity per capita - numDecimalPlaces: 0 - per_capita_other_renewables_excluding_bioenergy_generation__kwh: - title: Other renewable electricity excluding bioenergy per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Other renewable electricity excluding bioenergy per capita - numDecimalPlaces: 0 - per_capita_other_renewables_including_bioenergy_generation__kwh: - title: Other renewable electricity including bioenergy per capita (kWh) - 
short_unit: kWh - unit: kilowatt-hours - display: - name: Other renewable electricity including bioenergy per capita - numDecimalPlaces: 0 - per_capita_renewable_generation__kwh: - title: Renewable electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Renewable electricity per capita - numDecimalPlaces: 0 - per_capita_solar_generation__kwh: - title: Solar electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Solar electricity per capita - numDecimalPlaces: 0 - per_capita_solar_and_wind_generation__kwh: - title: Solar and wind electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Solar and wind electricity per capita - numDecimalPlaces: 0 - per_capita_total_generation__kwh: - title: Per capita electricity (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Per capita electricity use - numDecimalPlaces: 0 - per_capita_wind_generation__kwh: - title: Wind electricity per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Wind electricity per capita - numDecimalPlaces: 0 - population: - title: Population - short_unit: people - unit: people - display: - name: Population - primary_energy_consumption__twh: - title: Electricity from primary energy consumption (twh) (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Primary energy consumption - renewable_generation__twh: - title: Electricity from renewables (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Renewables - renewable_share_of_electricity__pct: - title: Renewables (% electricity) - short_unit: '%' - unit: '%' - display: - name: Renewables - numDecimalPlaces: 2 - solar_generation__twh: - title: Electricity from solar (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - solar_and_wind_generation__twh: - title: Electricity from solar and wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar and wind - solar_share_of_electricity__pct: - title: Solar (% electricity) - short_unit: '%' - unit: '%' - display: - name: Solar - solar_and_wind_share_of_electricity__pct: - title: Solar and wind (% electricity) - short_unit: '%' - unit: '%' - display: - name: Solar and wind - total_demand__twh: - title: Electricity demand (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Electricity demand - total_electricity_share_of_primary_energy__pct: - title: Electricity as share of primary energy (%) - short_unit: '%' - unit: '%' - display: - name: Electricity as share of primary energy - total_emissions__mtco2: - title: Emissions (MtCO2) - short_unit: million t - unit: million tonnes CO2 equivalent - display: - name: Emissions - total_generation__twh: - title: Electricity generation (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Electricity generation - total_net_imports__twh: - title: Net imports (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Net imports - wind_generation__twh: - title: Electricity from wind (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - wind_share_of_electricity__pct: - title: Wind (% electricity) - short_unit: '%' - unit: '%' - display: - name: Wind diff --git a/etl/steps/archive/garden/energy/2023-06-01/electricity_mix.py b/etl/steps/archive/garden/energy/2023-06-01/electricity_mix.py deleted file mode 100644 index ee7b83d435d..00000000000 --- a/etl/steps/archive/garden/energy/2023-06-01/electricity_mix.py +++ /dev/null @@ -1,300 +0,0 @@ -"""Garden 
step that combines BP's statistical review with Ember's combined electricity data (combination of the European -Electricity Review and the Yearly Electricity Data) to create the Electricity Mix (BP & Ember) dataset. - -""" - -from typing import Dict, List - -from owid.catalog import Dataset, Table -from owid.datautils.dataframes import combine_two_overlapping_dataframes - -from etl.data_helpers.geo import add_population_to_dataframe -from etl.helpers import PathFinder, create_dataset_with_combined_metadata - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 -# Megatonnes to grams. -MT_TO_G = 1e12 - - -def process_bp_data(table_bp: Table) -> Table: - """Load necessary columns from BP's Statistical Review dataset, and create some new variables (e.g. electricity - generation from fossil fuels). - - Parameters - ---------- - table_bp : Table - BP's Statistical Review (already processed, with harmonized countries and region aggregates). - - Returns - ------- - tb_bp : Table - Processed BP data. - - """ - # Columns to load from BP dataset. - columns = { - "electricity_generation": "total_generation__twh", - "primary_energy_consumption__twh": "primary_energy_consumption__twh", - "hydro_generation__twh": "hydro_generation__twh", - "nuclear_generation__twh": "nuclear_generation__twh", - "solar_generation__twh": "solar_generation__twh", - "wind_generation__twh": "wind_generation__twh", - "geo_biomass_other__twh": "other_renewables_including_bioenergy_generation__twh", - "elec_gen_from_oil": "oil_generation__twh", - "elec_gen_from_coal": "coal_generation__twh", - "elec_gen_from_gas": "gas_generation__twh", - } - table_bp = table_bp[list(columns)].rename(columns=columns, errors="raise") - # New columns to be created by summing other columns. - aggregates: Dict[str, List[str]] = { - "fossil_generation__twh": [ - "oil_generation__twh", - "coal_generation__twh", - "gas_generation__twh", - ], - "renewable_generation__twh": [ - "hydro_generation__twh", - "solar_generation__twh", - "wind_generation__twh", - "other_renewables_including_bioenergy_generation__twh", - ], - "low_carbon_generation__twh": [ - "renewable_generation__twh", - "nuclear_generation__twh", - ], - "solar_and_wind_generation__twh": [ - "solar_generation__twh", - "wind_generation__twh", - ], - } - - # Create a table with a dummy index. - tb_bp = table_bp.reset_index() - - # Create new columns, by adding up other columns (and allowing for only one nan in each sum). - for new_column in aggregates: - tb_bp[new_column] = tb_bp[aggregates[new_column]].sum(axis=1, min_count=len(aggregates[new_column]) - 1) - - return tb_bp - - -def process_ember_data(table_ember: Table) -> Table: - """Load necessary columns from the Combined Electricity dataset and prepare a table with the required variables. - - Parameters - ---------- - table_ember : Table - Combined Electricity (combination of Ember's Yearly Electricity Data and European Electricity Review). - - Returns - ------- - tb_ember : Table - Processed Combined Electricity data. - - """ - # Columns to load from Ember dataset.
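The `min_count` argument in the aggregation loop of `process_bp_data` above is what tolerates a single missing component per aggregate. A small demonstration of the pattern with toy numbers:

```python
import pandas as pd

df = pd.DataFrame({"oil": [1.0, 2.0, None], "coal": [3.0, None, None], "gas": [5.0, 6.0, 7.0]})

# Require at least len(components) - 1 = 2 non-missing values, i.e. allow at most one NaN per row.
df["fossil"] = df[["oil", "coal", "gas"]].sum(axis=1, min_count=2)
print(df["fossil"].tolist())  # [9.0, 8.0, nan]
```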
- columns = { - "generation__bioenergy__twh": "bioenergy_generation__twh", - "generation__gas__twh": "gas_generation__twh", - "generation__coal__twh": "coal_generation__twh", - "generation__other_fossil__twh": "oil_generation__twh", - "generation__renewables__twh": "renewable_generation__twh", - "generation__other_renewables__twh": "other_renewables_excluding_bioenergy_generation__twh", - "generation__clean__twh": "low_carbon_generation__twh", - "generation__hydro__twh": "hydro_generation__twh", - "generation__nuclear__twh": "nuclear_generation__twh", - "generation__solar__twh": "solar_generation__twh", - "generation__wind__twh": "wind_generation__twh", - "generation__fossil__twh": "fossil_generation__twh", - "generation__total_generation__twh": "total_generation__twh", - "demand__total_demand__twh": "total_demand__twh", - "emissions__total_emissions__mtco2": "total_emissions__mtco2", - "emissions__co2_intensity__gco2_kwh": "co2_intensity__gco2_kwh", - "imports__total_net_imports__twh": "total_net_imports__twh", - } - table_ember = table_ember[list(columns)].rename(columns=columns, errors="raise") - - # Create a table with a dummy index. - tb_ember = table_ember.reset_index() - - # In BP data, there is a variable "Geo Biomass Other", which combines all other renewables. - # In Ember data, "other renewables" excludes bioenergy. - # To be able to combine both datasets, create a new variable for generation of other renewables including bioenergy. - tb_ember["other_renewables_including_bioenergy_generation__twh"] = ( - tb_ember["other_renewables_excluding_bioenergy_generation__twh"] + tb_ember["bioenergy_generation__twh"] - ) - - # Create a new variable for solar and wind generation. - tb_ember["solar_and_wind_generation__twh"] = tb_ember["solar_generation__twh"] + tb_ember["wind_generation__twh"] - - return tb_ember - - -def add_per_capita_variables(combined: Table, ds_population: Dataset) -> Table: - """Add per capita variables (in kWh per person) to the combined BP and Ember dataframe. - - The list of variables to make per capita is given in this function. The new variable names will be 'per_capita_' - followed by the original variable's name. - - Parameters - ---------- - combined : Table - Combination of BP's Statistical Review and Ember's Combined Electricity. - ds_population: Dataset - Population dataset. - - Returns - ------- - combined : Table - Input dataframe after adding per capita variables. - - """ - combined = combined.copy() - - # Variables to make per capita. - per_capita_variables = [ - "bioenergy_generation__twh", - "coal_generation__twh", - "fossil_generation__twh", - "gas_generation__twh", - "hydro_generation__twh", - "low_carbon_generation__twh", - "nuclear_generation__twh", - "oil_generation__twh", - "other_renewables_excluding_bioenergy_generation__twh", - "other_renewables_including_bioenergy_generation__twh", - "renewable_generation__twh", - "solar_generation__twh", - "total_generation__twh", - "wind_generation__twh", - "solar_and_wind_generation__twh", - ] - # Add a column for population (only for harmonized countries). - combined = add_population_to_dataframe(df=combined, ds_population=ds_population, warn_on_missing_countries=False) - - for variable in per_capita_variables: - assert "twh" in variable, f"Variables are assumed to be in TWh, but {variable} is not."
- new_column = "per_capita_" + variable.replace("__twh", "__kwh") - combined[new_column] = combined[variable] * TWH_TO_KWH / combined["population"] - - return combined - - -def add_share_variables(combined: Table) -> Table: - """Add variables for the electricity generation as a share of the total electricity generation (as a percentage). - - The following new variables will be created: - * For each source (e.g. coal_generation__twh) in a list given in this function, a new variable will be created - (named, e.g. coal_share_of_electricity__pct). - * Total electricity generation as a share of primary energy consumption. - * Total net electricity imports as a share of total electricity demand. - - Parameters - ---------- - combined : Table - Combination of BP's Statistical Review and Ember's Combined Electricity. - - Returns - ------- - combined : Table - Input dataframe after adding share variables. - - """ - # Variables to make as share of electricity (new variable names will be the name of the original variable followed - # by '_share_of_electricity__pct'). - share_variables = [ - "bioenergy_generation__twh", - "coal_generation__twh", - "fossil_generation__twh", - "gas_generation__twh", - "hydro_generation__twh", - "low_carbon_generation__twh", - "nuclear_generation__twh", - "oil_generation__twh", - "other_renewables_excluding_bioenergy_generation__twh", - "other_renewables_including_bioenergy_generation__twh", - "renewable_generation__twh", - "solar_generation__twh", - "total_generation__twh", - "wind_generation__twh", - "solar_and_wind_generation__twh", - ] - for variable in share_variables: - new_column = variable.replace("_generation__twh", "_share_of_electricity__pct") - combined[new_column] = 100 * combined[variable] / combined["total_generation__twh"] - - # Calculate the percentage of electricity as a share of primary energy. - combined["total_electricity_share_of_primary_energy__pct"] = ( - 100 * combined["total_generation__twh"] / combined["primary_energy_consumption__twh"] - ) - - # Calculate the percentage of electricity demand that is imported. - combined["net_imports_share_of_demand__pct"] = ( - 100 * combined["total_net_imports__twh"] / combined["total_demand__twh"] - ) - - # Sanity check. - error = "Total electricity share does not add up to 100%." - assert all(abs(combined["total_share_of_electricity__pct"].dropna() - 100) < 0.01), error - - # Remove unnecessary columns. - combined = combined.drop(columns=["total_share_of_electricity__pct"]) - - return combined - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load BP's statistical review dataset and read its main table. - ds_bp: Dataset = paths.load_dependency("statistical_review") - table_bp = ds_bp["statistical_review"] - - # Load Ember's combined electricity dataset and read its main table. - ds_ember: Dataset = paths.load_dependency("combined_electricity") - table_ember = ds_ember["combined_electricity"] - - # Load population dataset. - ds_population: Dataset = paths.load_dependency("population") - - # - # Process data. - # - # Prepare BP and Ember data. - tb_bp = process_bp_data(table_bp=table_bp) - tb_ember = process_ember_data(table_ember=table_ember) - - # Combine both tables, giving priority to Ember data (on overlapping values). - combined = combine_two_overlapping_dataframes(df1=tb_ember, df2=tb_bp, index_columns=["country", "year"]) - - # Add carbon intensities. 
- # There is already a variable for this in the Ember dataset, but now that we have combined - # BP and Ember data, intensities should be recalculated for consistency. - combined["co2_intensity__gco2_kwh"] = (combined["total_emissions__mtco2"] * MT_TO_G) / ( - combined["total_generation__twh"] * TWH_TO_KWH - ) - - # Add per capita variables. - combined = add_per_capita_variables(combined=combined, ds_population=ds_population) - - # Add "share" variables. - combined = add_share_variables(combined=combined) - - # Set an appropriate index and sort rows and columns conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Set a table name. - combined.metadata.short_name = paths.short_name - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = create_dataset_with_combined_metadata(dest_dir=dest_dir, datasets=[ds_bp, ds_ember], tables=[combined]) - ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2023-06-01/fossil_fuel_production.meta.yml b/etl/steps/archive/garden/energy/2023-06-01/fossil_fuel_production.meta.yml deleted file mode 100644 index 1da62f05000..00000000000 --- a/etl/steps/archive/garden/energy/2023-06-01/fossil_fuel_production.meta.yml +++ /dev/null @@ -1,93 +0,0 @@ -dataset: - title: Fossil fuel production (BP & Shift, 2023b) - description: | - This dataset on fossil fuel production is generated by combining the latest data from [the BP Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html) and [The Shift Dataportal](https://www.theshiftdataportal.org/energy). - - BP provide fossil fuel production data from 1965 onwards (and crude prices from 1861 onwards). The Shift Dataportal provides long-term data from 1900, but only extends to 2016. - - To maintain consistency with the energy datasets on Our World in Data, we have taken BP data in preference, meaning that if BP provides data for a given country and year, this is used. Where data is not available from BP for a given country, or pre-1965, we rely on data from Shift. - - We have converted primary production in exajoules to terawatt-hours using the conversion factor: 1,000,000 / 3,600 ~ 278. - - Production per capita has been calculated using a population dataset that is built and maintained by Our World in Data, based on [different sources](https://ourworldindata.org/population-sources).
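The conversion factor quoted in the description above follows directly from the unit definitions (1 EJ = 1e18 J and 1 TWh = 3.6e15 J); a quick sanity check in Python:

```python
EJ_TO_TWH = 1e6 / 3600  # 1e18 J per EJ divided by 3.6e15 J per TWh

print(round(EJ_TO_TWH, 3))       # 277.778
print(round(50 * EJ_TO_TWH, 1))  # e.g. 50 EJ of production is ~13888.9 TWh
```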
-tables: - fossil_fuel_production: - variables: - annual_change_in_coal_production__pct: - title: Annual change in coal production (%) - short_unit: '%' - unit: '%' - display: - name: Annual change in coal production - annual_change_in_coal_production__twh: - title: Annual change in coal production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Annual change in coal production - annual_change_in_gas_production__pct: - title: Annual change in gas production (%) - short_unit: '%' - unit: '%' - display: - name: Annual change in gas production - annual_change_in_gas_production__twh: - title: Annual change in gas production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Annual change in gas production - annual_change_in_oil_production__pct: - title: Annual change in oil production (%) - short_unit: '%' - unit: '%' - display: - name: Annual change in oil production - annual_change_in_oil_production__twh: - title: Annual change in oil production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Annual change in oil production - coal_production__twh: - title: Coal production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal production - numDecimalPlaces: 0 - coal_production_per_capita__kwh: - title: Coal production per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Coal production per capita - numDecimalPlaces: 0 - gas_production__twh: - title: Gas production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Gas production - numDecimalPlaces: 0 - gas_production_per_capita__kwh: - title: Gas production per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Gas production per capita - numDecimalPlaces: 0 - oil_production__twh: - title: Oil production (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Oil production - numDecimalPlaces: 0 - oil_production_per_capita__kwh: - title: Oil production per capita (kWh) - short_unit: kWh - unit: kilowatt-hours - display: - name: Oil production per capita - numDecimalPlaces: 0 diff --git a/etl/steps/archive/garden/energy/2023-06-01/fossil_fuel_production.py b/etl/steps/archive/garden/energy/2023-06-01/fossil_fuel_production.py deleted file mode 100644 index 47716612e77..00000000000 --- a/etl/steps/archive/garden/energy/2023-06-01/fossil_fuel_production.py +++ /dev/null @@ -1,257 +0,0 @@ -"""Garden step for Fossil fuel production dataset (part of the OWID Energy dataset), based on a combination of BP's -Statistical Review dataset and Shift data on fossil fuel production. - -""" - -import numpy as np -from owid.catalog import Dataset, Table -from owid.datautils import dataframes - -from etl.data_helpers.geo import add_population_to_dataframe -from etl.helpers import PathFinder, create_dataset_with_combined_metadata - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 - - -def prepare_bp_data(tb_bp: Table) -> Table: - """Prepare BP data. - - Parameters - ---------- - tb_bp : Table - BP data. - - Returns - ------- - tb_bp : Table - BP data as a table with metadata. 
- - """ - tb_bp = tb_bp.reset_index() - - bp_columns = { - "country": "country", - "year": "year", - "coal_production__twh": "Coal production (TWh)", - "gas_production__twh": "Gas production (TWh)", - "oil_production__twh": "Oil production (TWh)", - } - tb_bp = tb_bp[list(bp_columns)].rename(columns=bp_columns) - - return tb_bp - - -def prepare_shift_data(tb_shift: Table) -> Table: - """Prepare Shift data. - - Parameters - ---------- - tb_shift : Table - Shift data. - - Returns - ------- - shift_table : Table - Shift data as a table with metadata. - - """ - tb_shift = tb_shift.reset_index() - - shift_columns = { - "country": "country", - "year": "year", - "coal": "Coal production (TWh)", - "gas": "Gas production (TWh)", - "oil": "Oil production (TWh)", - } - tb_shift = tb_shift[list(shift_columns)].rename(columns=shift_columns) - - return tb_shift - - -def combine_bp_and_shift_data(tb_bp: Table, tb_shift: Table) -> Table: - """Combine BP and Shift data. - - Parameters - ---------- - tb_bp : Table - Processed BP table. - tb_shift : Table - Process Shift table. - - Returns - ------- - combined : Table - Combined data. - - """ - # Check that there are no duplicated rows in any of the two datasets. - assert tb_bp[tb_bp.duplicated(subset=["country", "year"])].empty, "Duplicated rows in BP data." - assert tb_shift[tb_shift.duplicated(subset=["country", "year"])].empty, "Duplicated rows in Shift data." - - # Combine Shift data (which goes further back in the past) with BP data (which is more up-to-date). - # On coincident rows, prioritise BP data. - index_columns = ["country", "year"] - combined = dataframes.combine_two_overlapping_dataframes(df1=tb_bp, df2=tb_shift, index_columns=index_columns) - - # Remove rows that only have nan. - combined = combined.dropna(subset=combined.drop(columns=["country", "year"]).columns, how="all") - - # Sort data appropriately. - combined = combined.sort_values(index_columns).reset_index(drop=True) - - return combined - - -def add_annual_change(tb: Table) -> Table: - """Add annual change variables to combined BP & Shift dataset. - - Parameters - ---------- - tb : Table - Combined BP & Shift dataset. - - Returns - ------- - combined : Table - Combined BP & Shift dataset after adding annual change variables. - - """ - combined = tb.copy() - - # Calculate annual change. - combined = combined.sort_values(["country", "year"]).reset_index(drop=True) - for cat in ("Coal", "Oil", "Gas"): - combined[f"Annual change in {cat.lower()} production (%)"] = ( - combined.groupby("country")[f"{cat} production (TWh)"].pct_change() * 100 - ) - combined[f"Annual change in {cat.lower()} production (TWh)"] = combined.groupby("country")[ - f"{cat} production (TWh)" - ].diff() - - return combined - - -def add_per_capita_variables(tb: Table, ds_population: Dataset) -> Table: - """Add per-capita variables to combined BP & Shift dataset. - - Parameters - ---------- - tb : Table - Combined BP & Shift dataset. - ds_population : Dataset - Population dataset. - - Returns - ------- - combined : Table - Combined BP & Shift dataset after adding per-capita variables. - - """ - tb = tb.copy() - - # List countries for which we expect to have no population. - # These are countries and regions defined by BP and Shift. - expected_countries_without_population = [ - country for country in tb["country"].unique() if (("(BP)" in country) or ("(Shift)" in country)) - ] - # Add population to data. 
- combined = add_population_to_dataframe( - df=tb, - ds_population=ds_population, - country_col="country", - year_col="year", - population_col="population", - warn_on_missing_countries=False, - interpolate_missing_population=True, - expected_countries_without_population=expected_countries_without_population, - ) - - # Calculate production per capita. - for cat in ("Coal", "Oil", "Gas"): - combined[f"{cat} production per capita (kWh)"] = ( - combined[f"{cat} production (TWh)"] / combined["population"] * TWH_TO_KWH - ) - combined = combined.drop(errors="raise", columns=["population"]) - - return combined - - -def remove_spurious_values(tb: Table) -> Table: - """Remove spurious infinity values. - - These values are generated when calculating the annual change of a variable that was zero or nan in the previous year. - - Parameters - ---------- - tb : Table - Data that may contain infinity values. - - Returns - ------- - tb : Table - Corrected data. - - """ - # Replace any infinity value by nan. - tb = tb.replace([np.inf, -np.inf], np.nan) - - # Remove rows that only have nan. - tb = tb.dropna(subset=tb.drop(columns=["country", "year"]).columns, how="all").reset_index(drop=True) - - return tb - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load BP statistical review dataset and read its main table. - ds_bp: Dataset = paths.load_dependency("statistical_review") - tb_bp = ds_bp["statistical_review"] - - # Load Shift dataset and read its main table. - ds_shift: Dataset = paths.load_dependency("fossil_fuel_production") - tb_shift = ds_shift["fossil_fuel_production"] - - # Load population dataset. - ds_population: Dataset = paths.load_dependency("population") - - # - # Process data. - # - # Prepare BP data. - tb_bp = prepare_bp_data(tb_bp=tb_bp) - - # Prepare Shift data on fossil fuel production. - tb_shift = prepare_shift_data(tb_shift=tb_shift) - - # Combine BP and Shift data. - tb = combine_bp_and_shift_data(tb_bp=tb_bp, tb_shift=tb_shift) - - # Add annual change. - tb = add_annual_change(tb=tb) - - # Add per-capita variables. - tb = add_per_capita_variables(tb=tb, ds_population=ds_population) - - # Remove spurious values and rows that only have nans. - tb = remove_spurious_values(tb=tb) - - # Create an appropriate index and sort conveniently. - tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index() - - # Update table name. - tb.metadata.short_name = paths.short_name - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = create_dataset_with_combined_metadata(dest_dir, datasets=[ds_bp, ds_shift], tables=[tb]) - ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2023-06-01/global_primary_energy.meta.yml b/etl/steps/archive/garden/energy/2023-06-01/global_primary_energy.meta.yml deleted file mode 100644 index 7a396d1893f..00000000000 --- a/etl/steps/archive/garden/energy/2023-06-01/global_primary_energy.meta.yml +++ /dev/null @@ -1,268 +0,0 @@ -dataset: - title: Global Primary Energy (Smil & BP, 2023b) - description: | - This dataset comprises a combination of data from Appendix A of Vaclav Smil's Updated and Revised Edition of his book, 'Energy Transitions: Global and National Perspectives' (2017) and BP's Statistical Review of World Energy (2022). - - All data prior to the year 1965 is sourced from Smil (2017). All data from 1965 onwards, with the exception of traditional biomass, is sourced from BP Statistical Review. Smil's estimates of traditional biomass are only available until 2015.
For the years 2016 onwards, we have assumed a similar level of traditional biomass consumption. This is approximately in line with recent trends in traditional biomass from Smil's data. - - Our World in Data has normalized all BP fossil fuels data to terawatt-hours (TWh) using a conversion factor of 1,000,000 / 3,600 (~277.778) to convert from exajoules (EJ) to TWh. - - This dataset includes primary energy data using two methodologies: - (1) 'direct' primary energy, which does not take account of the inefficiencies in fossil fuel production. Fossil fuel data is compared to electricity generation (not in input equivalents) of nuclear and renewables. - (2) 'substitution' primary energy, which does take account of inefficiencies in fossil fuel production. This converts non-fossil energy to their 'input equivalents': The amount of primary energy that would be needed if they had the same inefficiencies as fossil fuels. This is the methodology adopted by BP when all data is compared in exajoules. - -tables: - global_primary_energy: - variables: - biofuels__twh_direct_energy: - title: Biofuels (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Modern biofuels - biofuels__twh_substituted_energy: - title: Biofuels (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Modern biofuels - coal__twh_direct_energy: - title: Coal (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - coal__twh_substituted_energy: - title: Coal (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Coal - gas__twh_direct_energy: - title: Gas (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Natural gas - gas__twh_substituted_energy: - title: Gas (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Natural gas - hydropower__twh_direct_energy: - title: Hydropower (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydropower - hydropower__twh_substituted_energy: - title: Hydropower (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Hydropower - nuclear__twh_direct_energy: - title: Nuclear (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - nuclear__twh_substituted_energy: - title: Nuclear (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Nuclear - oil__twh_direct_energy: - title: Oil (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Oil - oil__twh_substituted_energy: - title: Oil (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Oil - other_renewables__twh_direct_energy: - title: Other renewables (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables - other_renewables__twh_substituted_energy: - title: Other renewables (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Other renewables - solar__twh_direct_energy: - title: Solar (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - solar__twh_substituted_energy: - title: Solar (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Solar - traditional_biomass__twh_direct_energy: - title: Traditional biomass (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Traditional biomass - 
traditional_biomass__twh_substituted_energy: - title: Traditional biomass (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Traditional biomass - wind__twh_direct_energy: - title: Wind (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - wind__twh_substituted_energy: - title: Wind (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Wind - total_consumption__twh_direct_energy: - title: Total consumption (TWh, direct energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Total consumption - total_consumption__twh_substituted_energy: - title: Total consumption (TWh, substituted energy) - short_unit: TWh - unit: terawatt-hours - display: - name: Total consumption - biofuels__pct_of_direct_energy: - title: Biofuels (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Modern biofuels - biofuels__pct_of_substituted_energy: - title: Biofuels (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Modern biofuels - coal__pct_of_direct_energy: - title: Coal (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Coal - coal__pct_of_substituted_energy: - title: Coal (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Coal - gas__pct_of_direct_energy: - title: Gas (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Natural gas - gas__pct_of_substituted_energy: - title: Gas (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Natural gas - hydropower__pct_of_direct_energy: - title: Hydropower (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Hydropower - hydropower__pct_of_substituted_energy: - title: Hydropower (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Hydropower - nuclear__pct_of_direct_energy: - title: Nuclear (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Nuclear - nuclear__pct_of_substituted_energy: - title: Nuclear (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Nuclear - oil__pct_of_direct_energy: - title: Oil (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Oil - oil__pct_of_substituted_energy: - title: Oil (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Oil - other_renewables__pct_of_direct_energy: - title: Other renewables (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Other renewables - other_renewables__pct_of_substituted_energy: - title: Other renewables (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Other renewables - solar__pct_of_direct_energy: - title: Solar (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Solar - solar__pct_of_substituted_energy: - title: Solar (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Solar - traditional_biomass__pct_of_direct_energy: - title: Traditional biomass (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Traditional biomass - traditional_biomass__pct_of_substituted_energy: - title: Traditional biomass (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Traditional biomass - wind__pct_of_direct_energy: - title: Wind (%, direct energy) - short_unit: "%" - unit: "%" - display: - name: Wind - wind__pct_of_substituted_energy: - title: Wind (%, substituted energy) - short_unit: "%" - unit: "%" - display: - name: Wind diff --git 
a/etl/steps/archive/garden/energy/2023-06-01/global_primary_energy.py b/etl/steps/archive/garden/energy/2023-06-01/global_primary_energy.py deleted file mode 100644 index 36e89dd0410..00000000000 --- a/etl/steps/archive/garden/energy/2023-06-01/global_primary_energy.py +++ /dev/null @@ -1,210 +0,0 @@ -"""Garden step that combines Vaclav Smil's Global Primary Energy with BP's Statistical Review of World Energy. - -""" - -import numpy as np -from owid.catalog import Dataset, Table -from owid.datautils.dataframes import combine_two_overlapping_dataframes - -from etl.helpers import PathFinder, create_dataset_with_combined_metadata - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Exajoules to terawatt-hours. -EJ_TO_TWH = 1e6 / 3600 - -# Average efficiency factor assumed to convert direct energy to input-equivalent energy of Smil's data. -# This factor will be used for hydropower, nuclear, other renewables, solar and wind -# (for which there is data until 1960). -# In practice, it only affects hydropower, since all other non-fossil sources are zero prior to 1960. -# All other energy sources in Smil's data will not be affected by this factor. -EFFICIENCY_FACTOR = 0.36 - - -def prepare_bp_data(tb_bp: Table) -> Table: - tb_bp = tb_bp.reset_index() - - # BP gives generation of direct energy in TWh, and, for non-fossil sources of electricity, - # consumption of input-equivalent energy in EJ. - # The input-equivalent energy is the amount of energy that would be required to generate a given amount of (direct) - # electricity if non-fossil sources were as inefficient as a standard thermal power plant. - # Therefore, direct and substituted energies for Biofuels, Coal, Gas and Oil are identical. - # On the other hand, direct and substituted energy are different for non-fossil electricity sources, namely - # Hydropower, Nuclear, Solar, Other renewables, and Wind. - # The difference is of a factor of ~38%, which is roughly the efficiency of a standard power plant. - # More specifically, BP assumes (for Biofuels, Coal, Gas and Oil) an efficiency factor that grows from 36% - # (until year 2000) to 40.6% (in 2021), to better reflect changes in efficiency over time. - # In the case of biomass used in electricity (included in 'Other renewables'), - # BP assumes a constant factor of 32% for all years. - # For more details: - # https://www.bp.com/content/dam/bp/business-sites/en/global/corporate/pdfs/energy-economics/statistical-review/bp-stats-review-2022-methodology.pdf - bp_columns = { - "country": "country", - "year": "year", - # Fossil sources (direct energy). - "biofuels_consumption__twh__total": "biofuels__twh_direct_energy", - "coal_consumption__twh": "coal__twh_direct_energy", - "gas_consumption__twh": "gas__twh_direct_energy", - "oil_consumption__twh": "oil__twh_direct_energy", - # Non-fossil electricity sources (direct energy). - "geo_biomass_other__twh": "other_renewables__twh_direct_energy", - "hydro_generation__twh": "hydropower__twh_direct_energy", - "nuclear_generation__twh": "nuclear__twh_direct_energy", - "solar_generation__twh": "solar__twh_direct_energy", - "wind_generation__twh": "wind__twh_direct_energy", - # Non-fossil electricity sources (substituted energy). 
- "geo_biomass_other__ej": "other_renewables__ej_substituted_energy", - "hydro_consumption__ej": "hydropower__ej_substituted_energy", - "nuclear_consumption__ej": "nuclear__ej_substituted_energy", - "solar_consumption__ej": "solar__ej_substituted_energy", - "wind_consumption__ej": "wind__ej_substituted_energy", - } - tb_bp = tb_bp[list(bp_columns)].rename(columns=bp_columns) - # Convert all units to TWh. - for column in tb_bp.columns: - if "_ej_" in column: - # Create a new column in TWh instead of EJ. - tb_bp[column.replace("_ej_", "_twh_")] = tb_bp[column] * EJ_TO_TWH - # Remove the column in EJ. - tb_bp = tb_bp.drop(columns=column) - # For completeness, create columns of substituted energy for fossil sources (even if they would coincide with - # direct energy). - for fossil_source in ["biofuels", "coal", "gas", "oil"]: - tb_bp[f"{fossil_source}__twh_substituted_energy"] = tb_bp[f"{fossil_source}__twh_direct_energy"] - - # Select only data for the World (which is the only region informed in Smil's data). - tb_bp = tb_bp[tb_bp["country"] == "World"].reset_index(drop=True) - - return tb_bp - - -def prepare_smil_data(tb_smil: Table) -> Table: - tb_smil = tb_smil.reset_index() - - # Create columns for input-equivalent energy. - # To do this, we follow a similar approach to BP: - # We create input-equivalent energy by dividing direct energy consumption of non-fossil electricity sources - # (hydropower, nuclear, other renewables, solar and wind) by a factor of 36% - # (called EFFICIENCY_FACTOR, defined above). - # This is the efficiency factor of a typical thermal plant assumed by BP between 1965 and 2000, and we assume this - # factor also applies for the period 1800 to 1965. - # For biomass power (included in other renewables), BP assumed a constant factor of 32%. - # However, since we cannot separate biomass from the rest of sources in 'other renewables', - # we use the same 36% factor as all other non-fossil sources. - for source in ["hydropower", "nuclear", "other_renewables", "solar", "wind"]: - tb_smil[f"{source}__twh_substituted_energy"] = tb_smil[f"{source}__twh_direct_energy"] / EFFICIENCY_FACTOR - # For fossil sources (including biofuels and traditional biomass), direct and substituted energy are the same. - for source in ["biofuels", "coal", "gas", "oil", "traditional_biomass"]: - tb_smil[f"{source}__twh_substituted_energy"] = tb_smil[f"{source}__twh_direct_energy"] - - return tb_smil - - -def combine_bp_and_smil_data(tb_bp: Table, tb_smil: Table) -> Table: - tb_bp = tb_bp.copy() - tb_smil = tb_smil.copy() - - # Add a new column that informs of the source of the data. - tb_bp["data_source"] = "BP" - tb_smil["data_source"] = "Smil" - # Combine both tables, prioritizing BP's data on overlapping rows. - combined = combine_two_overlapping_dataframes( - df1=tb_bp, df2=tb_smil, index_columns=["country", "year"] - ).sort_values(["year"]) - - # Replace by numpy nans. - combined = combined.fillna(np.nan) - - # We do not have data for traditional biomass after 2015 (BP does not provide it). - # So, to be able to visualize the complete mix of global energy consumption, - # we extrapolate Smil's data for traditional biomass from 2015 onwards, by repeating its last value. 
- missing_years_mask = combined["year"] >= tb_smil["year"].max() - combined.loc[missing_years_mask, "traditional_biomass__twh_direct_energy"] = combined[missing_years_mask][ - "traditional_biomass__twh_direct_energy" - ].ffill() - combined.loc[missing_years_mask, "traditional_biomass__twh_substituted_energy"] = combined[missing_years_mask][ - "traditional_biomass__twh_substituted_energy" - ].ffill() - for source in ["hydropower", "nuclear", "other_renewables", "solar", "wind"]: - combined[ - f"{source}__twh_substituted_energy" - ].metadata.description = 'Figures are based on gross generation and do not account for cross-border electricity supply. "Input-equivalent" energy is the amount of fuel that would be required by thermal power stations to generate the reported electricity output.' - - # Create an index and sort conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - return combined - - -def add_total_consumption_and_percentages(combined: Table) -> Table: - # Create a column with the total direct energy (ensuring there is at least one non-nan value). - combined["total_consumption__twh_direct_energy"] = combined[ - [column for column in combined.columns if "direct_energy" in column] - ].sum(axis=1, min_count=1) - # Create a column with the total substituted energy (ensuring there is at least one non-nan value). - combined["total_consumption__twh_substituted_energy"] = combined[ - [column for column in combined.columns if "substituted_energy" in column] - ].sum(axis=1, min_count=1) - # Add share variables. - sources = [ - "biofuels", - "coal", - "gas", - "hydropower", - "nuclear", - "oil", - "other_renewables", - "solar", - "traditional_biomass", - "wind", - ] - for source in sources: - # Add percentage of each source with respect to the total direct energy. - combined[f"{source}__pct_of_direct_energy"] = ( - 100 * combined[f"{source}__twh_direct_energy"] / combined["total_consumption__twh_direct_energy"] - ) - # Add percentage of each source with respect to the total substituted energy. - combined[f"{source}__pct_of_substituted_energy"] = ( - 100 * combined[f"{source}__twh_substituted_energy"] / combined["total_consumption__twh_substituted_energy"] - ) - - return combined - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load BP statistical review dataset and read its main table. - ds_bp: Dataset = paths.load_dependency("statistical_review") - tb_bp = ds_bp["statistical_review"] - - # Load Smil dataset and read its main table. - ds_smil: Dataset = paths.load_dependency("global_primary_energy") - tb_smil = ds_smil["global_primary_energy"] - - # - # Process data. - # - # Prepare BP data. - tb_bp = prepare_bp_data(tb_bp=tb_bp) - - # Prepare Smil data. - tb_smil = prepare_smil_data(tb_smil=tb_smil) - - # Combine BP and Smil data. - combined = combine_bp_and_smil_data(tb_bp=tb_bp, tb_smil=tb_smil) - - # Add variables for total consumption and variables of % share of each source. - combined = add_total_consumption_and_percentages(combined=combined) - - # Update table name. - combined.metadata.short_name = paths.short_name - - # - # Save outputs. - # - # Save garden dataset. 
- ds_garden = create_dataset_with_combined_metadata(dest_dir, datasets=[ds_bp, ds_smil], tables=[combined]) - ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2023-06-01/owid_energy.meta.yml b/etl/steps/archive/garden/energy/2023-06-01/owid_energy.meta.yml deleted file mode 100644 index 46dba0e979d..00000000000 --- a/etl/steps/archive/garden/energy/2023-06-01/owid_energy.meta.yml +++ /dev/null @@ -1,13 +0,0 @@ -dataset: - title: Energy dataset (OWID, 2023) - description: | - OWID Energy dataset. - - This dataset will be loaded by [the energy-data repository](https://github.com/owid/energy-data), to create a csv file of the dataset that can be downloaded in one click. - -# Dataset sources will be created in the step by combining all component datasets' sources. -# Also, table metadata will be built from the tables' metadata and the content of owid_energy_variable_mapping.csv. - -tables: - owid_energy: - variables: {} diff --git a/etl/steps/archive/garden/energy/2023-06-01/owid_energy.py b/etl/steps/archive/garden/energy/2023-06-01/owid_energy.py deleted file mode 100644 index 4f5229652c3..00000000000 --- a/etl/steps/archive/garden/energy/2023-06-01/owid_energy.py +++ /dev/null @@ -1,213 +0,0 @@ -"""Garden step that combines various datasets related to energy and produces the OWID Energy dataset. - -Datasets combined: -* Energy mix from BP. -* Fossil fuel production (BP & Shift). -* Primary energy consumption (BP & EIA). -* Electricity mix (BP & Ember). - -Auxiliary datasets: -* Regions (OWID). -* Population (OWID based on various sources). -* GDP (GGDC Maddison). - -""" - -from typing import Dict, List, cast - -import numpy as np -import pandas as pd -from owid.catalog import Dataset, Table -from owid.catalog.meta import Source -from owid.datautils import dataframes - -from etl.data_helpers.geo import add_population_to_dataframe -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Path to file with mapping of variable names from one of the datasets to the final energy dataset. -VARIABLE_MAPPING_FILE = paths.directory / "owid_energy_variable_mapping.csv" - - -def gather_sources_from_tables(tables: List[Table]) -> List[Source]: - """Gather unique sources from the metadata.dataset of each table in a list of tables. - - Note: To check if a source is already listed, only the name of the source is considered (not the description or any - other field in the source). - - Parameters - ---------- - tables : list - List of tables with metadata. - - Returns - ------- - known_sources : list - List of unique sources from all tables. - - """ - # Initialise list that will gather all unique metadata sources from the tables. - known_sources: List[Source] = [] - for table in tables: - # Get list of sources of the dataset of current table. - table_sources = table.metadata.dataset.sources - # Go source by source of current table, and check if its name is not already in the list of known_sources. - for source in table_sources: - # Check if this source's name is different to all known_sources. - if all([source.name != known_source.name for known_source in known_sources]): - # Add the new source to the list. 
-                known_sources.append(source)
-
-    return known_sources
-
-
-def combine_tables_data_and_metadata(
-    tables: Dict[str, Table],
-    ds_population: Dataset,
-    countries_regions: Table,
-    gdp: Table,
-    variable_mapping: pd.DataFrame,
-) -> Table:
-    """Combine data and metadata of a list of tables, map variable names, and add variable metadata.
-
-    Parameters
-    ----------
-    tables : dict
-        Dictionary where the key is the short name of the table, and the value is the actual table, for all tables to be
-        combined.
-    ds_population : Dataset
-        Population dataset.
-    countries_regions : Table
-        Main table from countries-regions dataset.
-    gdp : Table
-        GDP (from owid catalog, after resetting index, and selecting country, year and gdp
-        columns).
-    variable_mapping : pd.DataFrame
-        Dataframe (with columns variable, source_variable, source_dataset, description, source) that specifies the names
-        of variables to take from each table, and their new name in the output table. It also gives a description of the
-        variable, and the sources of the table.
-
-    Returns
-    -------
-    tb_combined : Table
-        Combined table with metadata.
-
-    """
-    # Merge all tables as a dataframe (without metadata).
-    dfs = [pd.DataFrame(table) for table in tables.values()]
-    df_combined = dataframes.multi_merge(dfs, on=["country", "year"], how="outer")
-
-    # Add ISO codes for countries (regions that are not in countries-regions dataset will have nan iso_code).
-    df_combined = pd.merge(df_combined, countries_regions, left_on="country", right_on="name", how="left")
-
-    # Add population and gdp of countries (except for dataset-specific regions e.g. those ending in (BP) or (Shift)).
-    df_combined = add_population_to_dataframe(
-        df=df_combined, ds_population=ds_population, warn_on_missing_countries=False
-    )
-    df_combined = pd.merge(df_combined, gdp, on=["country", "year"], how="left")
-
-    # Check that there are no repeated column names after the merges.
-    error = "Repeated columns in combined data."
-    assert len([column for column in set(df_combined.columns) if "_x" in column]) == 0, error
-
-    # Create a table with combined data and no metadata.
-    tb_combined = Table(df_combined, short_name="owid_energy")
-
-    # List the names of the variables described in the variable mapping file.
-    source_variables = variable_mapping.index.get_level_values(0).tolist()
-
-    # Gather original metadata for each variable, add the descriptions and sources from the variable mapping file.
-    for source_variable in source_variables:
-        variable_metadata = variable_mapping.loc[source_variable]
-        source_dataset = variable_metadata["source_dataset"]
-        # Check that the variable indeed exists in the original dataset indicated by the variable mapping.
-        # Ignore columns "country", "year" (assigned to a dummy dataset 'various_datasets'), "population" (that comes
-        # from key_indicators) and "iso_alpha3" (that comes from countries_regions dataset).
-        if source_dataset not in [
-            "various_datasets",
-            "countries_regions",
-            "key_indicators",
-            "maddison_gdp",
-        ]:
-            error = f"Variable {source_variable} not found in any of the original datasets."
-            assert source_variable in tables[source_dataset].columns, error
-            tb_combined[source_variable].metadata = tables[source_dataset][source_variable].metadata
-
-        # Update metadata with the content of the variable mapping file.
- tb_combined[source_variable].metadata.description = variable_metadata["description"] - tb_combined[source_variable].metadata.sources = [Source(name=variable_metadata["source"])] - - # Select only variables in the mapping file, and rename variables according to the mapping. - tb_combined = tb_combined[source_variables].rename(columns=variable_mapping.to_dict()["variable"]) - - # Remove rows that only have nan (ignoring if country, year, iso_code, population and gdp do have data). - columns_that_must_have_data = [ - column for column in tb_combined.columns if column not in ["country", "year", "iso_code", "population", "gdp"] - ] - tb_combined = tb_combined.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) - - # Sanity check. - columns_with_inf = [column for column in tb_combined.columns if len(tb_combined[tb_combined[column] == np.inf]) > 0] - assert len(columns_with_inf) == 0, f"Infinity values detected in columns: {columns_with_inf}" - - # Set index and sort conveniently. - tb_combined = tb_combined.set_index(["country", "year"], verify_integrity=True).sort_index() - - return cast(Table, tb_combined) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read all required datasets. - ds_energy_mix: Dataset = paths.load_dependency("energy_mix") - ds_fossil_fuels: Dataset = paths.load_dependency("fossil_fuel_production") - ds_primary_energy: Dataset = paths.load_dependency("primary_energy_consumption") - ds_electricity_mix: Dataset = paths.load_dependency("electricity_mix") - ds_population: Dataset = paths.load_dependency("population") - ds_ggdc: Dataset = paths.load_dependency("ggdc_maddison") - - # Gather all required tables from all datasets. - tb_energy_mix = ds_energy_mix["energy_mix"].reset_index() - tb_fossil_fuels = ds_fossil_fuels["fossil_fuel_production"].reset_index() - tb_primary_energy = ds_primary_energy["primary_energy_consumption"].reset_index() - tb_electricity_mix = ds_electricity_mix["electricity_mix"].reset_index() - tb_regions = cast(Dataset, paths.load_dependency("regions"))["regions"] - tb_ggdc = ds_ggdc["maddison_gdp"].reset_index()[["country", "year", "gdp"]].dropna() - - # Load mapping from variable names in the component dataset to the final variable name in the output dataset. - variable_mapping = pd.read_csv(VARIABLE_MAPPING_FILE).set_index(["source_variable"]) - - # - # Process data. - # - # Combine all tables. - tables = { - "energy_mix": tb_energy_mix.drop(columns=["country_code"], errors="ignore"), - "fossil_fuel_production": tb_fossil_fuels, - "primary_energy_consumption": tb_primary_energy.drop(columns=["gdp", "population", "source"], errors="ignore"), - "electricity_mix": tb_electricity_mix.drop( - columns=["population", "primary_energy_consumption__twh"], errors="ignore" - ), - } - tb_combined = combine_tables_data_and_metadata( - tables=tables, - ds_population=ds_population, - countries_regions=tb_regions, - gdp=tb_ggdc, - variable_mapping=variable_mapping, - ) - - # - # Save outputs. - # - # Gather metadata sources from all tables' original dataset sources. - ds_garden = Dataset.create_empty(dest_dir) - ds_garden.metadata.sources = gather_sources_from_tables(tables=list(tables.values())) - - # Create a new garden dataset. 
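Before the dataset is created below, note that its sources are assembled by `gather_sources_from_tables`, which de-duplicates on the source name alone. A small self-contained sketch of that behaviour (using a stand-in class rather than the real `owid.catalog.meta.Source`):

```python
from dataclasses import dataclass
from typing import List


@dataclass
class Source:  # stand-in for owid.catalog.meta.Source
    name: str
    description: str = ""


tables_sources = [
    [Source("BP"), Source("Ember")],
    [Source("BP", description="same name, different description")],
]
known_sources: List[Source] = []
for table_sources in tables_sources:
    for source in table_sources:
        # Only the name is compared, so the second "BP" is treated as a
        # duplicate even though its description differs.
        if all(source.name != known.name for known in known_sources):
            known_sources.append(source)

print([source.name for source in known_sources])  # ['BP', 'Ember']
```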
- ds_garden = create_dataset(dest_dir, tables=[tb_combined], default_metadata=ds_garden.metadata) - ds_garden.save() diff --git a/etl/steps/archive/garden/energy/2023-06-01/owid_energy_variable_mapping.csv b/etl/steps/archive/garden/energy/2023-06-01/owid_energy_variable_mapping.csv deleted file mode 100644 index 8c6c44b25af..00000000000 --- a/etl/steps/archive/garden/energy/2023-06-01/owid_energy_variable_mapping.csv +++ /dev/null @@ -1,130 +0,0 @@ -variable,source_variable,source_dataset,description,source -country,country,various_datasets,Geographic location,Our World in Data -year,year,various_datasets,Year of observation,Our World in Data -iso_code,iso_alpha3,countries_regions,ISO 3166-1 alpha-3 three-letter country codes,International Organization for Standardization -population,population,key_indicators,"Population","Calculated by Our World in Data based on different sources (https://ourworldindata.org/population-sources)" -gdp,gdp,maddison_gdp,"Total real gross domestic product, inflation-adjusted",Maddison Project Database -biofuel_cons_change_pct,biofuels__pct_growth,energy_mix,Annual percentage change in biofuel consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -biofuel_cons_change_twh,biofuels__twh_growth,energy_mix,"Annual change in biofuel consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -biofuel_cons_per_capita,biofuels_per_capita__kwh,energy_mix,"Per capita primary energy consumption from biofuels, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -biofuel_consumption,biofuels__twh,energy_mix,"Primary energy consumption from biofuels, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -biofuel_elec_per_capita,per_capita_bioenergy_generation__kwh,electricity_mix,"Per capita electricity generation from biofuels, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -biofuel_electricity,bioenergy_generation__twh,electricity_mix,"Electricity generation from biofuels, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -biofuel_share_elec,bioenergy_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from biofuels,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -biofuel_share_energy,biofuels__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from biofuels,Calculated by Our World in Data based on BP Statistical Review of World Energy -carbon_intensity_elec,co2_intensity__gco2_kwh,electricity_mix,"Carbon intensity of electricity production, measured in grams of carbon dioxide emitted per kilowatt-hour",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -coal_cons_change_pct,coal__pct_growth,energy_mix,Annual percentage change in coal consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -coal_cons_change_twh,coal__twh_growth,energy_mix,"Annual change in coal consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy 
-coal_cons_per_capita,coal_per_capita__kwh,energy_mix,"Per capita primary energy consumption from coal, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -coal_consumption,coal__twh,energy_mix,"Primary energy consumption from coal, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -coal_elec_per_capita,per_capita_coal_generation__kwh,electricity_mix,"Per capita electricity generation from coal, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -coal_electricity,coal_generation__twh,electricity_mix,"Electricity generation from coal, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -coal_prod_change_pct,annual_change_in_coal_production__pct,fossil_fuel_production,Annual percentage change in coal production,Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -coal_prod_change_twh,annual_change_in_coal_production__twh,fossil_fuel_production,"Annual change in coal production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -coal_prod_per_capita,coal_production_per_capita__kwh,fossil_fuel_production,"Per capita coal production, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -coal_production,coal_production__twh,fossil_fuel_production,"Coal production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -coal_share_elec,coal_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from coal,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -coal_share_energy,coal__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from coal,Calculated by Our World in Data based on BP Statistical Review of World Energy -electricity_demand,total_demand__twh,electricity_mix,"Electricity demand, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -electricity_generation,total_generation__twh,electricity_mix,"Electricity generation, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -electricity_share_energy,total_electricity_share_of_primary_energy__pct,electricity_mix,"Electricity generation as a share of primary energy",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -energy_cons_change_pct,annual_change_in_primary_energy_consumption__pct,primary_energy_consumption,Annual percentage change in primary energy consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy and EIA International Energy Data -energy_cons_change_twh,annual_change_in_primary_energy_consumption__twh,primary_energy_consumption,"Annual change in primary energy consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World 
Energy and EIA International Energy Data -energy_per_capita,primary_energy_consumption_per_capita__kwh,primary_energy_consumption,"Primary energy consumption per capita, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and EIA International Energy Data -energy_per_gdp,primary_energy_consumption_per_gdp__kwh_per_dollar,primary_energy_consumption,Energy consumption per unit of GDP. This is measured in kilowatt-hours per 2011 international-$.,"Calculated by Our World in Data based on BP Statistical Review of World Energy, EIA International Energy Data and Maddison Project Database" -fossil_cons_change_pct,fossil_fuels__pct_growth,energy_mix,Annual percentage change in fossil fuel consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -fossil_cons_change_twh,fossil_fuels__twh_growth,energy_mix,"Annual change in fossil fuel consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -fossil_elec_per_capita,per_capita_fossil_generation__kwh,electricity_mix,"Per capita electricity generation from fossil fuels, measured in kilowatt-hours. This is the sum of electricity generated from coal, oil and gas.",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -fossil_electricity,fossil_generation__twh,electricity_mix,"Electricity generation from fossil fuels, measured in terawatt-hours. This is the sum of electricity generation from coal, oil and gas.",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -fossil_energy_per_capita,fossil_fuels_per_capita__kwh,energy_mix,"Per capita fossil fuel consumption, measured in kilowatt-hours. This is the sum of primary energy from coal, oil and gas.",Calculated by Our World in Data based on BP Statistical Review of World Energy -fossil_fuel_consumption,fossil_fuels__twh,energy_mix,"Fossil fuel consumption, measured in terawatt-hours. 
This is the sum of primary energy from coal, oil and gas.",Calculated by Our World in Data based on BP Statistical Review of World Energy -fossil_share_elec,fossil_share_of_electricity__pct,electricity_mix,"Share of electricity generation that comes from fossil fuels (coal, oil and gas combined)",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -fossil_share_energy,fossil_fuels__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from fossil fuels,Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_cons_change_pct,gas__pct_growth,energy_mix,Annual percentage change in gas consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_cons_change_twh,gas__twh_growth,energy_mix,"Annual change in gas consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_consumption,gas__twh,energy_mix,"Primary energy consumption from gas, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_elec_per_capita,per_capita_gas_generation__kwh,electricity_mix,"Per capita electricity generation from gas, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -gas_electricity,gas_generation__twh,electricity_mix,"Electricity generation from gas, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -gas_energy_per_capita,gas_per_capita__kwh,energy_mix,"Per capita primary energy consumption from gas, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -gas_prod_change_pct,annual_change_in_gas_production__pct,fossil_fuel_production,Annual percentage change in gas production,Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -gas_prod_change_twh,annual_change_in_gas_production__twh,fossil_fuel_production,"Annual change in gas production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -gas_prod_per_capita,gas_production_per_capita__kwh,fossil_fuel_production,"Per capita gas production, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -gas_production,gas_production__twh,fossil_fuel_production,"Gas production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -gas_share_elec,gas_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from gas,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -gas_share_energy,gas__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from gas,Calculated by Our World in Data based on BP Statistical Review of World Energy -greenhouse_gas_emissions,total_emissions__mtco2,electricity_mix,"Greenhouse-gas emissions produced in the generation of electricity, measured in million tonnes of CO2 equivalent",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and 
European Electricity Review -hydro_cons_change_pct,hydro__pct_growth,energy_mix,Annual percentage change in hydropower consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -hydro_cons_change_twh,hydro__twh_growth__equivalent,energy_mix,"Annual change in hydropower consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -hydro_consumption,hydro__twh__equivalent,energy_mix,"Primary energy consumption from hydropower, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -hydro_elec_per_capita,per_capita_hydro_generation__kwh,electricity_mix,"Per capita electricity generation from hydropower, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -hydro_electricity,hydro_generation__twh,electricity_mix,"Electricity generation from hydropower, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -hydro_energy_per_capita,hydro_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from hydropower, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -hydro_share_elec,hydro_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from hydropower,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -hydro_share_energy,hydro__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from hydropower,Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_cons_change_pct,low_carbon_energy__pct_growth,energy_mix,Annual percentage change in low-carbon energy consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_cons_change_twh,low_carbon_energy__twh_growth__equivalent,energy_mix,"Annual change in low-carbon energy consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_consumption,low_carbon_energy__twh__equivalent,energy_mix,"Primary energy consumption from low-carbon sources, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_elec_per_capita,per_capita_low_carbon_generation__kwh,electricity_mix,"Per capita electricity generation from low-carbon sources, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -low_carbon_electricity,low_carbon_generation__twh,electricity_mix,"Electricity generation from low-carbon sources, measured in terawatt-hours. 
This is the sum of electricity generation from renewables and nuclear power",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -low_carbon_energy_per_capita,low_carbon_energy_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from low-carbon sources, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -low_carbon_share_elec,low_carbon_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from low-carbon sources. This is the sum of electricity from renewables and nuclear,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -low_carbon_share_energy,low_carbon_energy__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from low-carbon sources. This is the sum of primary energy from renewables and nuclear,Calculated by Our World in Data based on BP Statistical Review of World Energy -net_elec_imports,total_net_imports__twh,electricity_mix,"Net electricity imports, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -net_elec_imports_share_demand,net_imports_share_of_demand__pct,electricity_mix,Net electricity imports as a share of electricity demand,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -nuclear_cons_change_pct,nuclear__pct_growth,energy_mix,Annual percentage change in nuclear consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -nuclear_cons_change_twh,nuclear__twh_growth__equivalent,energy_mix,"Annual change in nuclear consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -nuclear_consumption,nuclear__twh__equivalent,energy_mix,"Primary energy consumption from nuclear power, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -nuclear_elec_per_capita,per_capita_nuclear_generation__kwh,electricity_mix,"Per capita electricity generation from nuclear power, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -nuclear_electricity,nuclear_generation__twh,electricity_mix,"Electricity generation from nuclear power, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -nuclear_energy_per_capita,nuclear_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from nuclear, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -nuclear_share_elec,nuclear_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from nuclear power,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -nuclear_share_energy,nuclear__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from nuclear power,Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_cons_change_pct,oil__pct_growth,energy_mix,Annual percentage change in oil 
consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_cons_change_twh,oil__twh_growth,energy_mix,"Annual change in oil consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_consumption,oil__twh,energy_mix,"Primary energy consumption from oil, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_elec_per_capita,per_capita_oil_generation__kwh,electricity_mix,"Per capita electricity generation from oil, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -oil_electricity,oil_generation__twh,electricity_mix,"Electricity generation from oil, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -oil_energy_per_capita,oil_per_capita__kwh,energy_mix,"Per capita primary energy consumption from oil, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -oil_prod_change_pct,annual_change_in_oil_production__pct,fossil_fuel_production,Annual percentage change in oil production,Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -oil_prod_change_twh,annual_change_in_oil_production__twh,fossil_fuel_production,"Annual change in oil production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -oil_prod_per_capita,oil_production_per_capita__kwh,fossil_fuel_production,"Per capita oil production, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -oil_production,oil_production__twh,fossil_fuel_production,"Oil production, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and The Shift Dataportal -oil_share_elec,oil_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from oil,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -oil_share_energy,oil__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from oil,Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewable_consumption,other_renewables__twh__equivalent,energy_mix,"Primary energy consumption from other renewables, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewable_electricity,other_renewables_including_bioenergy_generation__twh,electricity_mix,"Electricity generation from other renewable sources including biofuels, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewable_exc_biofuel_electricity,other_renewables_excluding_bioenergy_generation__twh,electricity_mix,"Electricity generation from other renewable sources excluding biofuels, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_cons_change_pct,other_renewables__pct_growth,energy_mix,Annual percentage change in energy 
consumption from other renewables,Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewables_cons_change_twh,other_renewables__twh_growth__equivalent,energy_mix,"Annual change in other renewable consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewables_elec_per_capita,per_capita_other_renewables_including_bioenergy_generation__kwh,electricity_mix,"Per capita electricity generation from other renewables including biofuels, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_elec_per_capita_exc_biofuel,per_capita_other_renewables_excluding_bioenergy_generation__kwh,electricity_mix,"Per capita electricity generation from other renewables excluding biofuels, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_energy_per_capita,other_renewables_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from other renewables, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -other_renewables_share_elec,other_renewables_including_bioenergy_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from other renewables including biofuels,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_share_elec_exc_biofuel,other_renewables_excluding_bioenergy_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from other renewables excluding biofuels,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -other_renewables_share_energy,other_renewables__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from other renewables,Calculated by Our World in Data based on BP Statistical Review of World Energy -per_capita_electricity,per_capita_total_generation__kwh,electricity_mix,"Electricity generation per capita, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -primary_energy_consumption,primary_energy_consumption__twh,primary_energy_consumption,"Primary energy consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and EIA International Energy Data -renewables_cons_change_pct,renewables__pct_growth,energy_mix,Annual percentage change in renewable energy consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -renewables_cons_change_twh,renewables__twh_growth__equivalent,energy_mix,"Annual change in renewable energy consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -renewables_consumption,renewables__twh__equivalent,energy_mix,"Primary energy consumption from renewables, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -renewables_elec_per_capita,per_capita_renewable_generation__kwh,electricity_mix,"Per capita electricity generation from renewables, measured in kilowatt-hours",Calculated by Our 
World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -renewables_electricity,renewable_generation__twh,electricity_mix,"Electricity generation from renewables, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -renewables_energy_per_capita,renewables_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from renewables, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -renewables_share_elec,renewable_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from renewables,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -renewables_share_energy,renewables__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from renewables,Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_cons_change_pct,solar__pct_growth,energy_mix,Annual percentage change in solar consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_cons_change_twh,solar__twh_growth__equivalent,energy_mix,"Annual change in solar consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_consumption,solar__twh__equivalent,energy_mix,"Primary energy consumption from solar, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_elec_per_capita,per_capita_solar_generation__kwh,electricity_mix,"Per capita electricity generation from solar, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -solar_electricity,solar_generation__twh,electricity_mix,"Electricity generation from solar, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -solar_energy_per_capita,solar_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from solar, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -solar_share_elec,solar_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from solar,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -solar_share_energy,solar__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from solar,Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_cons_change_pct,wind__pct_growth,energy_mix,Annual percentage change in wind consumption,Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_cons_change_twh,wind__twh_growth__equivalent,energy_mix,"Annual change in wind consumption, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_consumption,wind__twh__equivalent,energy_mix,"Primary energy consumption from wind, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_elec_per_capita,per_capita_wind_generation__kwh,electricity_mix,"Per capita 
electricity generation from wind, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -wind_electricity,wind_generation__twh,electricity_mix,"Electricity generation from wind, measured in terawatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -wind_energy_per_capita,wind_per_capita__kwh__equivalent,energy_mix,"Per capita primary energy consumption from wind, measured in kilowatt-hours",Calculated by Our World in Data based on BP Statistical Review of World Energy -wind_share_elec,wind_share_of_electricity__pct,electricity_mix,Share of electricity generation that comes from wind,Calculated by Our World in Data based on BP Statistical Review of World Energy and Ember Global and European Electricity Review -wind_share_energy,wind__pct_equivalent_primary_energy,energy_mix,Share of primary energy consumption that comes from wind,Calculated by Our World in Data based on BP Statistical Review of World Energy diff --git a/etl/steps/archive/garden/energy/2023-06-01/primary_energy_consumption.meta.yml b/etl/steps/archive/garden/energy/2023-06-01/primary_energy_consumption.meta.yml deleted file mode 100644 index 94f7d7ad975..00000000000 --- a/etl/steps/archive/garden/energy/2023-06-01/primary_energy_consumption.meta.yml +++ /dev/null @@ -1,67 +0,0 @@ -dataset: - title: Primary energy consumption (BP & EIA, 2023b) - description: | - Primary energy consumption data was compiled by Our World in Data based on two key data sources: - 1. [BP Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html). - 2. [International energy data from the U.S. Energy Information Administration (EIA)](https://www.eia.gov/international/data/world/total-energy/more-total-energy-data). - - BP provides the longest and most up-to-date time-series of primary energy. However, it does not provide data for all countries. We have therefore supplemented this dataset with energy data from the EIA. Where BP provides data for a given country, this data is adopted; for countries where this data is missing, we rely on EIA energy figures. - - Per capita figures have been calculated using a population dataset that is built and maintained by Our World in Data, based on [different sources](https://ourworldindata.org/population-sources). - - To calculate energy per unit of GDP, we use total real GDP figures from [the Maddison Project Database, version 2020](https://www.rug.nl/ggdc/historicaldevelopment/maddison/releases/maddison-project-database-2020). - This dataset is based on Bolt, Jutta and Jan Luiten van Zanden (2020), “Maddison style estimates of the evolution of the world economy. A new 2020 update ”. GDP is measured in 2011$ which are PPP-adjusted. 
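The precedence rule described here (BP where available, EIA as fallback) is implemented further down in `combine_bp_and_eia_data` with a concat-then-deduplicate pattern. A minimal sketch with toy frames (not the real tables):

```python
import pandas as pd

tb_eia = pd.DataFrame({"country": ["Spain", "Spain"], "year": [1980, 1990], "consumption": [1.0, 2.0]})
tb_bp = pd.DataFrame({"country": ["Spain"], "year": [1990], "consumption": [2.5]})

# EIA rows come first and BP rows last, so keep="last" lets BP win on
# overlapping (country, year) pairs while EIA covers the remaining years.
combined = pd.concat([tb_eia, tb_bp], ignore_index=True).drop_duplicates(
    subset=["country", "year"], keep="last"
)
print(combined.sort_values("year").to_dict("records"))
# [{'country': 'Spain', 'year': 1980, 'consumption': 1.0},
#  {'country': 'Spain', 'year': 1990, 'consumption': 2.5}]
```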
-tables: - primary_energy_consumption: - variables: - annual_change_in_primary_energy_consumption__pct: - title: Annual change in primary energy consumption (%) - short_unit: '%' - unit: '%' - display: - name: Annual change in primary energy consumption - annual_change_in_primary_energy_consumption__twh: - title: Annual change in primary energy consumption (TWh) - short_unit: TWh - unit: terawatt-hours - display: - name: Annual change in primary energy consumption - gdp: - title: GDP - short_unit: $ - unit: 2011 int-$ - description: >- - Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over - time (inflation) and price differences between countries. Calculated by multiplying GDP per capita with population. - display: - numDecimalPlaces: 0 - population: - title: Population - unit: people - primary_energy_consumption__twh: - title: Primary energy consumption (TWh) - short_unit: TWh - unit: terawatt-hours - description: Primary energy consumption, measured in terawatt-hours per year. - display: - name: Primary energy consumption - numDecimalPlaces: 0 - primary_energy_consumption_per_gdp__kwh_per_dollar: - title: Primary energy consumption per GDP (kWh/$) - short_unit: kWh - unit: kilowatt-hours per $ - description: Primary energy consumption per unit of gross domestic product, measured in kilowatt-hours per international-$. - display: - name: Energy consumption per dollar - primary_energy_consumption_per_capita__kwh: - title: Primary energy consumption per capita (kWh/person) - short_unit: kWh - unit: kilowatt-hours per capita - description: Primary energy consumption per capita, measured in kilowatt-hours per person per year. - display: - name: Per capita energy consumption - numDecimalPlaces: 0 - source: - title: Source of data - short_unit: source - unit: source diff --git a/etl/steps/archive/garden/energy/2023-06-01/primary_energy_consumption.py b/etl/steps/archive/garden/energy/2023-06-01/primary_energy_consumption.py deleted file mode 100644 index 63f8e0dfe75..00000000000 --- a/etl/steps/archive/garden/energy/2023-06-01/primary_energy_consumption.py +++ /dev/null @@ -1,321 +0,0 @@ -"""Garden step for Primary energy consumption dataset (part of the OWID Energy dataset), based on a combination of BP's -Statistical Review dataset and EIA data on energy consumption. - -""" - -from typing import cast - -import numpy as np -import pandas as pd -from owid.catalog import Dataset, Table - -from etl.data_helpers.geo import add_population_to_dataframe -from etl.helpers import PathFinder, create_dataset_with_combined_metadata - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Conversion factors. -# Terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 - -# Countries whose data have to be removed since they were identified as outliers. -OUTLIERS = ["Gibraltar"] - - -def prepare_bp_data(tb_bp: Table) -> Table: - """Prepare BP data. - - Parameters - ---------- - tb_bp : Table - BP data. - - Returns - ------- - tb_bp : Table - BP data as a table with metadata. - - """ - tb_bp = tb_bp.reset_index() - - bp_columns = { - "country": "country", - "year": "year", - "primary_energy_consumption__twh": "Primary energy consumption (TWh)", - } - tb_bp = tb_bp[list(bp_columns)].rename(columns=bp_columns) - - # Drop rows with missing values. - tb_bp = tb_bp.dropna(how="any").reset_index(drop=True) - - return cast(Table, tb_bp) - - -def prepare_eia_data(tb_eia: Table) -> Table: - """Prepare EIA data. 
-
-    Parameters
-    ----------
-    tb_eia : Table
-        EIA data.
-
-    Returns
-    -------
-    tb_eia : Table
-        EIA data as a table with metadata.
-
-    """
-    tb_eia = tb_eia.reset_index()
-
-    eia_columns = {
-        "country": "country",
-        "year": "year",
-        "energy_consumption": "Primary energy consumption (TWh)",
-    }
-    tb_eia = tb_eia[list(eia_columns)].rename(columns=eia_columns)
-
-    # Drop rows with missing values.
-    tb_eia = tb_eia.dropna(how="any").reset_index(drop=True)
-
-    return cast(Table, tb_eia)
-
-
-def prepare_ggdc_data(tb_ggdc: Table) -> Table:
-    """Prepare GGDC data.
-
-    Parameters
-    ----------
-    tb_ggdc : Table
-        GGDC data.
-
-    Returns
-    -------
-    tb_ggdc : Table
-        GGDC data as a table with metadata.
-
-    """
-    tb_ggdc = tb_ggdc.reset_index()
-
-    ggdc_columns = {
-        "country": "country",
-        "year": "year",
-        "gdp": "GDP",
-    }
-    tb_ggdc = tb_ggdc[list(ggdc_columns)].rename(columns=ggdc_columns)
-
-    # Drop rows with missing values.
-    tb_ggdc = tb_ggdc.dropna(how="any").reset_index(drop=True)
-
-    return cast(Table, tb_ggdc)
-
-
-def combine_bp_and_eia_data(tb_bp: Table, tb_eia: Table) -> Table:
-    """Combine BP and EIA data.
-
-    Parameters
-    ----------
-    tb_bp : Table
-        Table from BP Statistical Review dataset.
-    tb_eia : Table
-        Table from EIA energy consumption dataset.
-
-    Returns
-    -------
-    combined : Table
-        Combined data.
-
-    """
-    # Check that there are no duplicated rows in either of the two datasets.
-    assert tb_bp[tb_bp.duplicated(subset=["country", "year"])].empty, "Duplicated rows in BP data."
-    assert tb_eia[tb_eia.duplicated(subset=["country", "year"])].empty, "Duplicated rows in EIA data."
-
-    tb_bp["source"] = "bp"
-    tb_eia["source"] = "eia"
-    # Combine EIA data (which goes further back in the past) with BP data (which is more up-to-date).
-    # On coincident rows, prioritise BP data.
-    index_columns = ["country", "year"]
-    combined = Table(pd.concat([tb_eia, tb_bp], ignore_index=True)).drop_duplicates(subset=index_columns, keep="last")
-
-    # Sort conveniently.
-    combined = combined.sort_values(index_columns).reset_index(drop=True)
-
-    return combined
-
-
-def add_annual_change(tb: Table) -> Table:
-    """Add annual change variables to combined BP & EIA data.
-
-    Parameters
-    ----------
-    tb : Table
-        Combined BP & EIA data.
-
-    Returns
-    -------
-    combined : Table
-        Combined BP & EIA data after adding annual change variables.
-
-    """
-    combined = tb.copy()
-
-    # Calculate annual change.
-    combined = combined.sort_values(["country", "year"]).reset_index(drop=True)
-    combined["Annual change in primary energy consumption (%)"] = (
-        combined.groupby("country")["Primary energy consumption (TWh)"].pct_change() * 100
-    )
-    combined["Annual change in primary energy consumption (TWh)"] = combined.groupby("country")[
-        "Primary energy consumption (TWh)"
-    ].diff()
-
-    return combined
-
-
-def add_per_capita_variables(tb: Table, ds_population: Dataset) -> Table:
-    """Add a population column and add per-capita variables.
-
-    Parameters
-    ----------
-    tb : Table
-        Data.
-    ds_population : Dataset
-        Population dataset.
-
-    Returns
-    -------
-    tb : Table
-        Data after adding population and per-capita variables.
-
-    """
-    tb = tb.copy()
-
-    # Add population to data.
-    tb = add_population_to_dataframe(
-        df=tb,
-        ds_population=ds_population,
-        country_col="country",
-        year_col="year",
-        population_col="Population",
-        warn_on_missing_countries=False,
-    )
-
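One subtlety in `add_annual_change` above deserves a quick illustration: grouping by country keeps `pct_change` and `diff` within each country's series, so the first year of one country is never compared against the last year of another. A toy sketch (hypothetical values):

```python
import pandas as pd

tb = pd.DataFrame({
    "country": ["France", "France", "Spain", "Spain"],
    "year": [2000, 2001, 2000, 2001],
    "energy": [100.0, 110.0, 50.0, 45.0],
})

# Within each country, the first year has no previous value, so it stays NaN
# instead of borrowing the previous country's last row.
tb["pct"] = tb.groupby("country")["energy"].pct_change() * 100
tb["twh"] = tb.groupby("country")["energy"].diff()
print(tb.round(1).to_dict("records"))
# France 2001 -> pct 10.0, twh 10.0; Spain 2000 -> NaN; Spain 2001 -> pct -10.0, twh -5.0
```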
- tb["Primary energy consumption per capita (kWh)"] = ( - tb["Primary energy consumption (TWh)"] / tb["Population"] * TWH_TO_KWH - ) - - return tb - - -def add_per_gdp_variables(tb: Table, ggdc_table: Table) -> Table: - """Add a GDP column and add per-gdp variables. - - Parameters - ---------- - tb : Table - Data. - ggdc_table : Table - GDP data from the GGDC Maddison dataset. - - Returns - ------- - tb : Table - Data after adding GDP and per-gdp variables. - - """ - tb = tb.copy() - - # Add population to data. - tb = pd.merge(tb, ggdc_table, on=["country", "year"], how="left") - - # Calculate consumption per GDP. - tb["Primary energy consumption per GDP (kWh per $)"] = ( - tb["Primary energy consumption (TWh)"] / tb["GDP"] * TWH_TO_KWH - ) - - return tb - - -def remove_outliers(tb: Table) -> Table: - """Remove infinity values and data that has been identified as spurious outliers. - - Parameters - ---------- - tb : Table - Data. - - Returns - ------- - tb : Table - Data after removing spurious data. - - """ - tb = tb.copy() - - # Remove spurious values. - tb = tb.replace(np.inf, np.nan) - - # Remove indexes of outliers from data. - tb = tb[~tb["country"].isin(OUTLIERS)].reset_index(drop=True) - - return tb - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load BP statistical review dataset and read its main table. - ds_bp: Dataset = paths.load_dependency("statistical_review") - tb_bp = ds_bp["statistical_review"] - - # Load EIA dataset on energy consumption and read its main table. - ds_eia: Dataset = paths.load_dependency("energy_consumption") - tb_eia = ds_eia["energy_consumption"] - - # Load GGDC Maddison data on GDP and read its main table. - ds_ggdc: Dataset = paths.load_dependency("ggdc_maddison") - tb_ggdc = ds_ggdc["maddison_gdp"] - - # Load population dataset. - ds_population: Dataset = paths.load_dependency("population") - - # - # Process data. - # - # Prepare BP data. - tb_bp = prepare_bp_data(tb_bp=tb_bp) - - # Prepare EIA data. - tb_eia = prepare_eia_data(tb_eia=tb_eia) - - # Prepare GGDC data. - tb_ggdc = prepare_ggdc_data(tb_ggdc=tb_ggdc) - - # Combine BP and EIA data. - tb = combine_bp_and_eia_data(tb_bp=tb_bp, tb_eia=tb_eia) - - # Add annual change. - tb = add_annual_change(tb=tb) - - # Add per-capita variables. - tb = add_per_capita_variables(tb=tb, ds_population=ds_population) - - # Add per-GDP variables. - tb = add_per_gdp_variables(tb=tb, ggdc_table=tb_ggdc) - - # Remove outliers. - tb = remove_outliers(tb=tb) - - # Create an appropriate index and sort conveniently. - tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index() - - # Update table short name. - tb.metadata.short_name = paths.short_name - - # - # Save outputs. - # - # Create a new garden dataset. 
-    ds_garden = create_dataset_with_combined_metadata(dest_dir, datasets=[ds_bp, ds_eia], tables=[tb])
-    ds_garden.save()
diff --git a/etl/steps/archive/garden/energy/2023-06-01/uk_historical_electricity.meta.yml b/etl/steps/archive/garden/energy/2023-06-01/uk_historical_electricity.meta.yml
deleted file mode 100644
index 69b8f77b574..00000000000
--- a/etl/steps/archive/garden/energy/2023-06-01/uk_historical_electricity.meta.yml
+++ /dev/null
@@ -1,69 +0,0 @@
-dataset:
-  title: UK historical electricity (DUKES, 2023b)
-  description: |
-    All data prior to 1985 (and prior to 1965 in the case of renewables) is sourced from [the Digest of UK Energy Statistics (DUKES), published by the UK's Department for Business, Energy & Industrial Strategy](https://www.gov.uk/government/statistics/electricity-chapter-5-digest-of-united-kingdom-energy-statistics-dukes).
-
-    All other data is sourced from [BP's Statistical Review of World Energy](https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html) and [Ember's Yearly Electricity Data](https://ember-climate.org/data-catalogue/yearly-electricity-data/). Where data from BP is available for a given year, we rely on it as the primary source, and supplement it with data from Ember where BP data is not available.
-tables:
-  uk_historical_electricity:
-    variables:
-      coal_generation:
-        title: Electricity generation from coal
-        short_unit: TWh
-        unit: terawatt-hours
-        display:
-          name: Coal
-      oil_generation:
-        title: Electricity generation from oil
-        short_unit: TWh
-        unit: terawatt-hours
-        display:
-          name: Oil
-      gas_generation:
-        title: Electricity generation from gas
-        short_unit: TWh
-        unit: terawatt-hours
-        display:
-          name: Natural gas
-      nuclear_generation:
-        title: Electricity generation from nuclear
-        short_unit: TWh
-        unit: terawatt-hours
-        display:
-          name: Nuclear
-      hydro_generation:
-        title: Electricity generation from hydropower
-        short_unit: TWh
-        unit: terawatt-hours
-        display:
-          name: Hydropower
-      solar_generation:
-        title: Electricity generation from solar
-        short_unit: TWh
-        unit: terawatt-hours
-        display:
-          name: Solar
-      wind_generation:
-        title: Electricity generation from wind
-        short_unit: TWh
-        unit: terawatt-hours
-        display:
-          name: Wind
-      other_renewables_generation:
-        title: Electricity generation from other renewables
-        short_unit: TWh
-        unit: terawatt-hours
-        display:
-          name: Other renewables
-      total_generation:
-        title: Total electricity generation
-        short_unit: TWh
-        unit: terawatt-hours
-        display:
-          name: Total electricity generation
-      net_imports:
-        title: Net electricity imports
-        short_unit: TWh
-        unit: terawatt-hours
-        display:
-          name: Net electricity imports
diff --git a/etl/steps/archive/garden/energy/2023-06-01/uk_historical_electricity.py b/etl/steps/archive/garden/energy/2023-06-01/uk_historical_electricity.py
deleted file mode 100644
index 44173c47cb4..00000000000
--- a/etl/steps/archive/garden/energy/2023-06-01/uk_historical_electricity.py
+++ /dev/null
@@ -1,190 +0,0 @@
-"""Combine UK BEIS' historical electricity data with our electricity mix dataset (by BP & Ember) to obtain a long-run
-electricity mix in the UK.
-
-"""
-
-import numpy as np
-from owid.catalog import Dataset, Table
-from owid.datautils import dataframes
-
-from etl.helpers import PathFinder, create_dataset_with_combined_metadata
-
-# Get paths and naming conventions for current step.
-paths = PathFinder(__file__)
-
-
-def prepare_electricity_mix_data(tb_elec: Table) -> Table:
-    """Select necessary columns from the electricity mix, and select rows corresponding to the UK.
-
-    Parameters
-    ----------
-    tb_elec : Table
-        Data from the main table of the electricity mix dataset.
-
-    Returns
-    -------
-    tb_elec : Table
-        Selected columns and rows from the electricity mix data.
-
-    """
-    tb_elec = tb_elec.copy()
-
-    # Select columns and rename them conveniently.
-    elec_columns = {
-        "country": "country",
-        "year": "year",
-        "coal_generation__twh": "coal_generation",
-        "gas_generation__twh": "gas_generation",
-        "oil_generation__twh": "oil_generation",
-        "hydro_generation__twh": "hydro_generation",
-        "nuclear_generation__twh": "nuclear_generation",
-        "other_renewables_including_bioenergy_generation__twh": "other_renewables_generation",
-        "solar_generation__twh": "solar_generation",
-        "total_generation__twh": "total_generation",
-        "wind_generation__twh": "wind_generation",
-        "total_net_imports__twh": "net_imports",
-    }
-
-    # Select necessary columns from the electricity mix dataset.
-    tb_elec = tb_elec[list(elec_columns)].rename(columns=elec_columns)
-
-    # Select UK data from the electricity mix dataset.
-    tb_elec = tb_elec[tb_elec["country"] == "United Kingdom"].reset_index(drop=True)
-
-    return tb_elec
-
-
-def prepare_beis_data(tb_beis: Table) -> Table:
-    """Select (and rename) columns from the UK historical electricity data from BEIS.
-
-    Parameters
-    ----------
-    tb_beis : Table
-        Combined data for UK historical electricity from BEIS.
-
-    Returns
-    -------
-    tb_beis : Table
-        Selected columns from the UK historical electricity data.
-
-    """
-    tb_beis = tb_beis.copy()
-
-    # Select columns and rename them conveniently.
-    beis_columns = {
-        "country": "country",
-        "year": "year",
-        "coal": "coal_generation",
-        "oil": "oil_generation",
-        "electricity_generation": "total_generation",
-        "gas": "gas_generation",
-        "hydro": "hydro_generation",
-        "nuclear": "nuclear_generation",
-        "net_imports": "net_imports",
-        "implied_efficiency": "implied_efficiency",
-        "wind_and_solar": "wind_and_solar_generation",
-    }
-    tb_beis = tb_beis[list(beis_columns)].rename(columns=beis_columns)
-
-    return tb_beis
-
-
-def combine_beis_and_electricity_mix_data(tb_beis: Table, tb_elec: Table) -> Table:
-    """Combine BEIS data on UK historical electricity with the electricity mix data (after selecting only the rows
-    corresponding to the UK).
-
-    Several processing steps are applied to the data; see the comments in the code below.
-
-    Parameters
-    ----------
-    tb_beis : Table
-        Selected data from BEIS on UK historical electricity.
-    tb_elec : Table
-        Selected data from the electricity mix (after selecting rows for the UK).
-
-    Returns
-    -------
-    tb_combined : Table
-        Combined and processed data with a verified index.
-
-    """
-    # In the BEIS dataset, wind and solar are given as one joint variable.
-    # Check if we can ignore it (since it is better to have the two sources separately).
-    # Find the earliest year with data in the electricity mix for solar or wind generation.
-    solar_or_wind_first_year = tb_elec[tb_elec["wind_generation"].notnull() | tb_elec["solar_generation"].notnull()][
-        "year"
-    ].min()
-    # Now check that, prior to that year, all generation from solar and wind was zero.
-    assert tb_beis[tb_beis["year"] < solar_or_wind_first_year]["wind_and_solar_generation"].fillna(0).max() == 0
-    # Therefore, since wind and solar generation is always zero prior to the beginning of the electricity mix data,
-    # we can safely drop this column from the BEIS dataset.
-    tb_beis = tb_beis.drop(columns=["wind_and_solar_generation"])
-    # And create two columns of zeros for wind and solar.
-    tb_beis["solar_generation"] = 0
-    tb_beis["wind_generation"] = 0
-    # Similarly, given that the BEIS dataset has no data about other renewable sources (apart from hydro, solar
-    # and wind), we can assume that the contribution from other renewables is zero.
-    tb_beis["other_renewables_generation"] = 0
-    # And ensure these new columns do not have any values after the electricity mix data begins.
-    tb_beis.loc[
-        tb_beis["year"] >= solar_or_wind_first_year,
-        ["solar_generation", "wind_generation", "other_renewables_generation"],
-    ] = np.nan
-
-    # BEIS data on fuel input gives raw energy, but we want electricity generation (which is less, given the
-    # inefficiencies of the process of burning fossil fuels).
-    # They also include a variable on "implied efficiency", obtained by dividing the total electricity generation
-    # by the input energy.
-    # We multiply the raw energy by this efficiency to estimate the electricity generated by each fossil fuel.
-    # This only affects data prior to the beginning of the electricity mix's data (which is 1965 for renewables and
-    # nuclear, and 1985 for the rest).
-    for source in ["coal", "oil", "gas"]:
-        tb_beis[f"{source}_generation"] *= tb_beis["implied_efficiency"]
-
-    # Drop other unnecessary columns.
-    tb_beis = tb_beis.drop(columns=["implied_efficiency"])
-
-    # Combine BEIS and electricity mix data.
-    tb_combined = dataframes.combine_two_overlapping_dataframes(
-        df1=tb_elec, df2=tb_beis, index_columns=["country", "year"]
-    )
-
-    # Add an index and sort conveniently.
-    tb_combined = tb_combined.set_index(["country", "year"]).sort_index().sort_index(axis=1)
-
-    return tb_combined
-
-
-def run(dest_dir: str) -> None:
-    #
-    # Load data.
-    #
-    # Load BEIS dataset and read its main table.
-    ds_beis: Dataset = paths.load_dependency("uk_historical_electricity")
-    tb_beis = ds_beis["uk_historical_electricity"].reset_index()
-
-    # Load electricity mix dataset and read its main table.
-    ds_elec: Dataset = paths.load_dependency("electricity_mix")
-    tb_elec = ds_elec["electricity_mix"].reset_index()
-
-    #
-    # Process data.
-    #
-    # Prepare electricity mix data.
-    tb_elec = prepare_electricity_mix_data(tb_elec=tb_elec)
-
-    # Prepare BEIS data.
-    tb_beis = prepare_beis_data(tb_beis=tb_beis)
-
-    # Combine BEIS and electricity mix data.
-    tb_combined = combine_beis_and_electricity_mix_data(tb_beis=tb_beis, tb_elec=tb_elec)
-
-    # Update combined table name.
-    tb_combined.metadata.short_name = paths.short_name
-
-    #
-    # Save outputs.
-    #
-    # Create a new garden dataset.
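One note before the dataset is created below: the combination step above relies on `combine_two_overlapping_dataframes` from `owid.datautils`, where, as used here, values from `df1` (the electricity mix) are expected to take precedence wherever both tables cover the same (country, year), and `df2` (BEIS) fills the remaining gaps. A hedged sketch of that behaviour with toy values, assuming this precedence:

```python
import pandas as pd
from owid.datautils import dataframes

tb_elec = pd.DataFrame(
    {"country": ["United Kingdom"] * 2, "year": [1985, 1986], "coal_generation": [100.0, 95.0]}
)
tb_beis = pd.DataFrame(
    {"country": ["United Kingdom"] * 2, "year": [1984, 1985], "coal_generation": [110.0, 999.0]}
)

# On the overlapping year 1985, df1 (electricity mix) should win, so the toy
# 999.0 from df2 (BEIS) is expected to be discarded; 1984 comes from BEIS only.
combined = dataframes.combine_two_overlapping_dataframes(
    df1=tb_elec, df2=tb_beis, index_columns=["country", "year"]
)
print(combined.sort_values("year").to_dict("records"))
```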
- ds_garden = create_dataset_with_combined_metadata(dest_dir, datasets=[ds_beis, ds_elec], tables=[tb_combined]) - ds_garden.save() diff --git a/etl/steps/archive/garden/faostat/2021-03-18/faostat_qcl.country_std.json b/etl/steps/archive/garden/faostat/2021-03-18/faostat_qcl.country_std.json deleted file mode 100644 index 5f98d3125db..00000000000 --- a/etl/steps/archive/garden/faostat/2021-03-18/faostat_qcl.country_std.json +++ /dev/null @@ -1,220 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Africa": "Africa", - "Albania": "Albania", - "Algeria": "Algeria", - "Angola": "Angola", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Asia": "Asia", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bhutan": "Bhutan", - "Bolivia (Plurinational State of)": "Bolivia", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Channel Islands": "Channel Islands", - "Chile": "Chile", - "China": "China", - "China, Hong Kong SAR": "Hong Kong", - "China, Macao SAR": "Macao", - "China, Taiwan Province of": "Taiwan", - "China, mainland": "China", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo": "Congo", - "Cook Islands": "Cook Islands", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cyprus": "Cyprus", - "Czechia": "Czechia", - "Czechoslovakia": "Czechoslovakia", - "C\u00f4te d'Ivoire": "Cote d'Ivoire", - "Democratic People's Republic of Korea": "North Korea", - "Democratic Republic of the Congo": "Democratic Republic of Congo", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Ethiopia PDR": "Ethiopia (former)", - "Europe": "Europe", - "Faroe Islands": "Faroe Islands", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Guyana": "French Guiana", - "French Polynesia": "French Polynesia", - "Gabon": "Gabon", - "Gambia": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Greece": "Greece", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran (Islamic Republic of)": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Lao People's Democratic Republic": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - 
"Liberia": "Liberia", - "Libya": "Libya", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mexico": "Mexico", - "Micronesia (Federated States of)": "Micronesia (country)", - "Monaco": "Monaco", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "Oceania": "Oceania", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palestine": "Palestine", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Republic of Korea": "South Korea", - "Republic of Moldova": "Moldova", - "Romania": "Romania", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "R\u00e9union": "Reunion", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Serbia and Montenegro": "Serbia and Montenegro", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan": "Sudan", - "Sudan (former)": "Sudan (former)", - "Suriname": "Suriname", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syrian Arab Republic": "Syria", - "Tajikistan": "Tajikistan", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tokelau": "Tokelau", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Tuvalu": "Tuvalu", - "USSR": "USSR", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", - "United Republic of Tanzania": "Tanzania", - "United States of America": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela (Bolivarian Republic of)": "Venezuela", - "Viet Nam": "Vietnam", - "World": "World", - "Yemen": "Yemen", - "Yugoslav SFR": "Yugoslavia", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "Svalbard and Jan Mayen Islands": "Svalbard and Jan Mayen" -} diff --git a/etl/steps/archive/garden/faostat/2021-03-18/faostat_qcl.ipynb b/etl/steps/archive/garden/faostat/2021-03-18/faostat_qcl.ipynb deleted file mode 100644 index 3b2d67c65a0..00000000000 --- a/etl/steps/archive/garden/faostat/2021-03-18/faostat_qcl.ipynb +++ /dev/null @@ -1,499 +0,0 @@ -{ - "cells": [ - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "# FAOstat: Crops and livestock products\n", - "[_Source data_](https://www.fao.org/faostat/en/#data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "dest_dir = \"/tmp/faostat_qcl\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read data and reference tables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import pandas as pd\n", - "from pathlib import Path\n", - "\n", - "from owid import catalog\n", - "from etl.paths import BASE_DIR, DATA_DIR" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "COUNTRY_MAPPING = BASE_DIR / \"etl/steps/data/garden/faostat/2021-03-18/faostat_qcl.country_std.json\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "## Load meadow dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "qcl_meadow = catalog.Dataset(DATA_DIR / \"meadow/faostat/2021-03-18/faostat_qcl\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "metadata = catalog.Dataset(DATA_DIR / \"meadow/faostat/2022-02-10/faostat_metadata\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Bulk data and items metadata\n", - "qcl_bulk = qcl_meadow[\"bulk\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "qcl_bulk.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Clean dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### `Area`\n", - "Filtering and mapping" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare for Country Tool\n", - "# ds = qcl_area.Country.drop_duplicates()\n", - "# ds.to_csv(\"ign.countries.csv\", index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "qcl_area = metadata[\"meta_qcl_area\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load\n", - "with open(COUNTRY_MAPPING) as f:\n", - " country_mapping = json.load(f)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Check which countries will be discarded based on our country standardisation file (those without a mapped standardised name)\n", - "msk = qcl_area.country.isin(country_mapping)\n", - "print(qcl_area.loc[-msk, \"country\"].tolist())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we build the `Area Code ---> Country` mapping dictionary." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "area_codes_discard = [140, 259, 260]\n", - "if set(qcl_bulk.index.levels[0]).intersection(area_codes_discard):\n", - " raise ValueError(\"There are some changes in the bulk data! 
Codes that are being discarded might actually be needed")\n",
- "# Discard\n",
- "qcl_area = qcl_area.loc[~qcl_area.index.isin(area_codes_discard)]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "map_area = qcl_area.loc[msk, \"country\"].replace(country_mapping).sort_index().to_dict()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### `Item`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "qcl_item = metadata[\"meta_qcl_item\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Find Item Groups with more than one Code (legacy?)\n",
- "x = qcl_item.reset_index()\n",
- "_ = x.groupby([\"item_group\"]).agg({\"item_group_code\": [lambda x: x.nunique(), lambda x: x.unique().tolist()]})\n",
- "__ = _[\"item_group_code\"][\"\"]\n",
- "_[__ > 1]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Check if there are codes in bulk that *only* have a group code associated that is to be deleted (sanity check before deletion)\n",
- "codes_present = qcl_bulk.index.get_level_values(\"item_code\").unique().astype(str).tolist()\n",
- "msk = x[\"item_code\"].astype(str).isin(codes_present)\n",
- "y = x[msk]\n",
- "yy = y.groupby(\"item_code\")[\"item_group_code\"].agg(set)\n",
- "l = yy[yy == {\"QC\"}].index.tolist() # Change to see other groups with unique children\n",
- "x[x[\"item_code\"].isin(l)].head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "qcl_item = qcl_item[[\"item_group\", \"item\"]]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### `Element`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "qcl_element = metadata[\"meta_qcl_element\"]\n",
- "qcl_unit = metadata[\"meta_qcl_unit\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "qcl_element_unit = qcl_element.merge(\n",
- " qcl_unit.rename(columns={\"description\": \"unit_description\"}),\n",
- " left_on=\"unit\",\n",
- " right_index=True,\n",
- ")\n",
- "assert qcl_element_unit.shape[0] == qcl_element.shape[0]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Bulk"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "# Filter countries + Area Code -> Country\n",
- "qcl_bulk = qcl_bulk.loc[map_area].rename(index=map_area, level=0)\n",
- "name_map = {\"area_code\": \"country\"}\n",
- "qcl_bulk.index.names = [name_map.get(n, n) for n in qcl_bulk.index.names]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Drop Unit\n",
- "qcl_bulk = qcl_bulk.drop(columns=[\"unit\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Variable name"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "qcl_bulk.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "qcl_item.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get Item 
names\n", - "x = qcl_item.reset_index()\n", - "a = (\n", - " x[[\"item_group_code\", \"item_group\"]]\n", - " .drop_duplicates()\n", - " .rename(columns={\"item_group_code\": \"code\", \"item_group\": \"name\"})\n", - ")\n", - "b = x[[\"item_code\", \"item\"]].drop_duplicates().rename(columns={\"item_code\": \"code\", \"item\": \"name\"})\n", - "c = pd.concat([a, b])\n", - "map_items = dict(zip(c.code, c.name))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# manually add some missing names to the map that were removed from the API\n", - "\n", - "missing = {\n", - " 1067: \"Eggs, hen, in shell (number)\",\n", - " 1092: \"Eggs, other bird, in shell (number)\",\n", - " 1731: \"Oilcrops\",\n", - "}\n", - "\n", - "for k in missing:\n", - " assert k not in map_items\n", - " map_items[k] = missing[k]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "item_names = [map_items[it] for it in qcl_bulk.index.get_level_values(1)]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get Element + Unit names\n", - "x = qcl_element_unit.reset_index()\n", - "y = list(x[\"element\"].astype(str) + \" (\" + x[\"unit\"].astype(str) + \")\")\n", - "map_elems = dict(zip(x[\"element_code\"], y))\n", - "elem_names = [map_elems[el] for el in qcl_bulk.index.get_level_values(2)]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Construct variable name\n", - "variable_names = [f\"{i} - {e}\" for i, e in zip(item_names, elem_names)]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Add variable name to index\n", - "qcl_bulk[\"variable_name\"] = variable_names\n", - "qcl_bulk = qcl_bulk.reset_index()\n", - "qcl_bulk = qcl_bulk.set_index([\"country\", \"item_code\", \"element_code\", \"variable_name\", \"year\", \"flag\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Garden dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "qcl_garden = catalog.Dataset.create_empty(dest_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Propagate metadata\n", - "qcl_garden.metadata = qcl_meadow.metadata\n", - "qcl_garden.save()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Add bulk table\n", - "qcl_garden.add(qcl_bulk)\n", - "# Add table items\n", - "qcl_garden.add(qcl_item)\n", - "# Add table elements\n", - "qcl_element_unit.metadata = qcl_element.metadata\n", - "qcl_garden.add(qcl_element_unit)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "qcl_garden.save()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "interpreter": { - "hash": "bfee9b694fe04c946c13f91f59877f323f209df7eaba52b3079ace55470be701" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - 
"pygments_lexer": "ipython3", - "version": "3.9.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/etl/steps/archive/garden/faostat/2021-04-09/faostat_fbsc.country_std.json b/etl/steps/archive/garden/faostat/2021-04-09/faostat_fbsc.country_std.json deleted file mode 100644 index 34715c924d3..00000000000 --- a/etl/steps/archive/garden/faostat/2021-04-09/faostat_fbsc.country_std.json +++ /dev/null @@ -1,188 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "Angola": "Angola", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bermuda": "Bermuda", - "Bolivia (Plurinational State of)": "Bolivia", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "China, Hong Kong SAR": "Hong Kong", - "China, Macao SAR": "Macao", - "China, Taiwan Province of": "Taiwan", - "China, mainland": "China", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo": "Congo", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cyprus": "Cyprus", - "Czechia": "Czechia", - "Czechoslovakia": "Czechoslovakia", - "C\u00f4te d'Ivoire": "Cote d'Ivoire", - "Democratic People's Republic of Korea": "North Korea", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Ethiopia PDR": "Ethiopia (former)", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Polynesia": "French Polynesia", - "Gabon": "Gabon", - "Gambia": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Greece": "Greece", - "Grenada": "Grenada", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran (Islamic Republic of)": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Lao People's Democratic Republic": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mexico": "Mexico", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": 
"Namibia", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "Netherlands Antilles (former)": "Netherlands Antilles", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "Oceania": "Oceania", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Republic of Korea": "South Korea", - "Republic of Moldova": "Moldova", - "Romania": "Romania", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Serbia and Montenegro": "Serbia and Montenegro", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "South Africa": "South Africa", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan": "Sudan", - "Sudan (former)": "Sudan (former)", - "Suriname": "Suriname", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Tajikistan": "Tajikistan", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "USSR": "USSR", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", - "United Republic of Tanzania": "Tanzania", - "United States of America": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela (Bolivarian Republic of)": "Venezuela", - "Viet Nam": "Vietnam", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "Yugoslav SFR": "Yugoslavia" -} diff --git a/etl/steps/archive/garden/faostat/2021-04-09/faostat_fbsc.ipynb b/etl/steps/archive/garden/faostat/2021-04-09/faostat_fbsc.ipynb deleted file mode 100644 index c0e8bd48b6d..00000000000 --- a/etl/steps/archive/garden/faostat/2021-04-09/faostat_fbsc.ipynb +++ /dev/null @@ -1,954 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "6d5acae7", - "metadata": {}, - "source": [ - "# FAOstat: Food Balances Combined\n", - "- [_Source data (FBS)_](https://www.fao.org/faostat/en/#data/FBS)\n", - "- [_Source data (FBSH)_](https://www.fao.org/faostat/en/#data/FBSH)\n", - "\n", - "This notebook integrates two FAOstat datasets (previously imported to _meadow_) into a single _garden_ dataset. This is because a new version of the _Food Balances_ dataset was launched in 2014 with a slightly new methodology ([more info](https://fenixservices.fao.org/faostat/static/documents/FBS/New%20FBS%20methodology.pdf)). The new dataset is named FBSC (Food Balances Combined)." 
- ]
- },
- {
- "cell_type": "markdown",
- "id": "70df5526",
- "metadata": {},
- "source": [
- "## Parameters"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ec98d59c",
- "metadata": {
- "tags": [
- "parameters"
- ]
- },
- "outputs": [],
- "source": [
- "dest_dir = \"/tmp/faostat_fbs\""
- ]
- },
- {
- "cell_type": "markdown",
- "id": "05e5b89b-bf6f-4b52-99ef-67b36c85f31a",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Imports & Paths\n",
- "Import the required libraries and define paths to load files (including data files and standardisation mappings for item and element names)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7a13d99e-cb34-4dee-a702-be026f2fa3a7",
- "metadata": {},
- "outputs": [],
- "source": [
- "import json\n",
- "from pathlib import Path\n",
- "import pandas as pd\n",
- "from owid import catalog\n",
- "from etl.paths import DATA_DIR, BASE_DIR"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0c30b923-929c-4b0f-bd00-c0fdd6f59ad7",
- "metadata": {},
- "outputs": [],
- "source": [
- "# side-car file containing manual country mapping\n",
- "COUNTRY_MAPPING = BASE_DIR / \"etl/steps/data/garden/faostat/2021-04-09/faostat_fbsc.country_std.json\""
- ]
- },
- {
- "cell_type": "markdown",
- "id": "49420475-cb40-4148-b03b-f3322242197a",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Load meadow datasets\n",
- "In this step we load the required datasets from meadow: FBS and FBSH"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "134ea32a-77b4-4e4c-af5c-400f6edd5866",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Read datasets\n",
- "fbs_meadow = catalog.Dataset(DATA_DIR / \"meadow/faostat/2021-04-09/faostat_fbs\")\n",
- "fbsh_meadow = catalog.Dataset(DATA_DIR / \"meadow/faostat/2017-12-11/faostat_fbsh\")\n",
- "metadata = catalog.Dataset(DATA_DIR / \"meadow/faostat/2022-02-10/faostat_metadata\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5553eb58-fd10-4a93-9356-859121b7bed0",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "# Bulk data and items metadata\n",
- "fbs_bulk = fbs_meadow[\"bulk\"]\n",
- "fbsh_bulk = fbsh_meadow[\"bulk\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e9a67fe4-ca1e-4e73-b667-6cef8cc573b2",
- "metadata": {},
- "outputs": [],
- "source": [
- "print(fbs_bulk.shape)\n",
- "fbs_bulk.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7ccf0bed-0e52-4bb3-be88-3fa80e0f48b8",
- "metadata": {},
- "outputs": [],
- "source": [
- "print(fbsh_bulk.shape)\n",
- "fbsh_bulk.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3f4edb48-ed7f-4aab-99be-fd2607bc4d60",
- "metadata": {},
- "source": [
- "## Sanity checks\n",
- "As we are fusing two different datasets, we will be doing some checks to ensure the consistency of the dataset, especially in the identifying fields (i.e. `Year`, `Area Code`, `Item Code`, `Element Code`, `Flag`)."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "92ffe4cd-e984-49bb-a96e-caaab03128bd",
- "metadata": {
- "tags": []
- },
- "source": [
- "### Check data files"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7ccfdde4-9993-4aa4-b1ad-ad69a8d21ba9",
- "metadata": {},
- "source": [
- "#### `Year`\n",
- "Check if the time window of both datasets is disjoint, otherwise we could end up with duplicates."
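A toy instance of the disjointness condition asserted in the next cells; the year bounds here are made up, the real ones come from the two table indexes:

fbsh_year_min, fbsh_year_max = 1961, 2013
fbs_year_min, fbs_year_max = 2014, 2019
# The old series must end exactly one year before the new one starts: no overlap and no gap.
assert (fbsh_year_min < fbsh_year_max) & (fbsh_year_max + 1 == fbs_year_min < fbs_year_max)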
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e2e5f421-db02-4e5a-9d92-0d146b19d491",
- "metadata": {},
- "outputs": [],
- "source": [
- "fbs_year_min, fbs_year_max = (\n",
- " fbs_bulk.index.get_level_values(\"year\").min(),\n",
- " fbs_bulk.index.get_level_values(\"year\").max(),\n",
- ")\n",
- "fbsh_year_min, fbsh_year_max = (\n",
- " fbsh_bulk.index.get_level_values(\"year\").min(),\n",
- " fbsh_bulk.index.get_level_values(\"year\").max(),\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "449e3f54-c3d5-4bb1-a0d0-73ae4a887f64",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Year ranges must be disjoint\n",
- "assert (fbsh_year_min < fbsh_year_max) & (fbsh_year_max + 1 == fbs_year_min < fbs_year_max)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d986cac5-b0a6-4274-9778-1839f63f85bd",
- "metadata": {},
- "source": [
- "#### `Area`\n",
- "Here we check which Areas (i.e. countries/regions) appear in one dataset but not in the other.\n",
- "\n",
- "We observe that former countries only appear in FBSH (USSR, Serbia and Montenegro, Sudan (former), Belgium-Luxembourg, Czechoslovakia, Netherlands Antilles, Yugoslavia, Ethiopia PDR), which makes sense. There are some special cases where countries stopped or started appearing (Bermuda, Brunei and Papua New Guinea, Seychelles and Comoros)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "66501a75-5725-42fc-b68b-8e22a3c7a49e",
- "metadata": {},
- "outputs": [],
- "source": [
- "fbsh_area = metadata[\"meta_fbsh_area\"]\n",
- "fbs_area = metadata[\"meta_fbs_area\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "dd6fb699-b4d2-4f4d-a330-7e350e5fa3b2",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get unique codes\n",
- "codes_fbs = set(fbs_bulk.index.get_level_values(\"area_code\"))\n",
- "codes_fbsh = set(fbsh_bulk.index.get_level_values(\"area_code\"))\n",
- "# Find missing codes\n",
- "miss_in_fbs = codes_fbsh.difference(codes_fbs)\n",
- "miss_in_fbsh = codes_fbs.difference(codes_fbsh)\n",
- "# Print\n",
- "print(\"- FBSH but not FBS:\", fbsh_area.loc[sorted(miss_in_fbs), \"country\"].to_dict())\n",
- "print(\"- FBS but not FBSH:\", fbs_area.loc[sorted(miss_in_fbsh), \"country\"].to_dict())"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f65d54c3-6655-49c5-aedb-4e098ee30ca5",
- "metadata": {},
- "source": [
- "Next, we check that all codes correspond to the same country name in both datasets."
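The cross-dataset consistency check that follows, shrunk to a self-contained example with two hypothetical area tables indexed by area code:

import pandas as pd

fbs_area_toy = pd.DataFrame({"country": ["Afghanistan", "Albania"]}, index=[2, 3])
fbsh_area_toy = pd.DataFrame({"country": ["Afghanistan", "Albania"]}, index=[2, 3])
# An index-on-index merge keeps only codes present in both tables, suffixing columns with _x/_y.
x = fbs_area_toy.merge(fbsh_area_toy, left_index=True, right_index=True)
assert (x.country_x.astype(str) == x.country_y.astype(str)).all()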
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b005a93e-3821-4a79-8447-ccc4fd08cc92",
- "metadata": {},
- "outputs": [],
- "source": [
- "x = fbs_area.merge(fbsh_area, left_index=True, right_index=True)\n",
- "assert (x.country_x.astype(str) == x.country_y.astype(str)).all()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b32813d0-558c-4bf4-8356-51875ce002ab",
- "metadata": {},
- "source": [
- "#### `Item`\n",
- "Here we check which items appear and disappear from dataset to dataset.\n",
- "\n",
- "It seems that some items were deprecated in favour of others: `Groundnuts (Shelled Eq) --> Groundnuts` and `Rice (Milled Equivalent) --> Rice and products`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7b5ac5cb-a2bf-4149-9a0c-f70086f96f7c",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load item info\n",
- "fbsh_item = metadata[\"meta_fbsh_item\"]\n",
- "fbs_item = metadata[\"meta_fbs_item\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8e5c87a3-3897-499c-9aea-2cf6454165ff",
- "metadata": {},
- "outputs": [],
- "source": [
- "def build_item_all_df(df):\n",
- " \"\"\"Flatten item dataframe.\"\"\"\n",
- "\n",
- " def _process_df(df, cols):\n",
- " return df.drop_duplicates(cols)[cols].rename(columns=dict(zip(cols, [\"code\", \"name\"]))).set_index(\"code\")\n",
- "\n",
- " df = df.reset_index()\n",
- " a = _process_df(df, [\"item_group_code\", \"item_group\"])\n",
- " b = _process_df(df, [\"item_code\", \"item\"])\n",
- " df = pd.concat([a, b])\n",
- " assert df.index.value_counts().max() == 1\n",
- " return df"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a24645e7-bcd3-4ea2-91e2-9780651e4874",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Build flattened version (item group, item in same column)\n",
- "fbsh_item_ = build_item_all_df(fbsh_item)\n",
- "fbs_item_ = build_item_all_df(fbs_item)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e5220bbf-a755-4d5b-846b-537b068b1f05",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get unique codes\n",
- "codes_fbs = set(fbs_bulk.index.get_level_values(\"item_code\"))\n",
- "codes_fbsh = set(fbsh_bulk.index.get_level_values(\"item_code\"))\n",
- "# Find missing codes\n",
- "miss_in_fbs = codes_fbsh.difference(codes_fbs)\n",
- "miss_in_fbsh = codes_fbs.difference(codes_fbsh)\n",
- "# Print\n",
- "print(\"- FBSH but not FBS:\", fbsh_item_.loc[sorted(miss_in_fbs), \"name\"].to_dict())\n",
- "print(\"- FBS but not FBSH:\", fbs_item_.loc[sorted(miss_in_fbsh), \"name\"].to_dict())\n",
- "# fbsh_item.reset_index().set_index([\"item_code\", \"item_group_code\"]).loc[2805]\n",
- "# fbs_item.reset_index().set_index([\"item_code\", \"item_group_code\"]).loc[2807]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9d54ee0c-c379-4408-9d40-03ce963c9244",
- "metadata": {},
- "source": [
- "We check that all codes are mapped to the same names."
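What the flattening above produces, condensed into a runnable toy (codes and names are invented):

import pandas as pd

df = pd.DataFrame({
    "item_group_code": [2905, 2905],
    "item_group": ["Cereals", "Cereals"],
    "item_code": [2511, 2805],
    "item": ["Wheat", "Rice"],
})
a = df[["item_group_code", "item_group"]].drop_duplicates().rename(columns={"item_group_code": "code", "item_group": "name"}).set_index("code")
b = df[["item_code", "item"]].drop_duplicates().rename(columns={"item_code": "code", "item": "name"}).set_index("code")
flat = pd.concat([a, b])
# Group codes and item codes now share a single code -> name lookup, each code appearing once.
assert flat.index.value_counts().max() == 1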
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "275b7491-2e31-45a5-a48b-dbd3f93cb314",
- "metadata": {},
- "outputs": [],
- "source": [
- "x = fbs_item_.merge(fbsh_item_, left_index=True, right_index=True)\n",
- "assert (x.name_x.astype(str) == x.name_y.astype(str)).all()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3e71b22e-0f1a-4c95-8d85-04adc12265dc",
- "metadata": {},
- "outputs": [],
- "source": [
- "x[x.name_x != x.name_y]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "dbe512c5-9038-4afa-a0f9-2395e6d45669",
- "metadata": {
- "tags": []
- },
- "source": [
- "#### `Element`\n",
- "We see that two elements were introduced in FBS (not present in FBSH): `Residuals` and `Tourist consumption`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "605ca9e7-5f13-40c8-9d8c-b6b388b6fbb4",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load element info\n",
- "fbsh_element = metadata[\"meta_fbsh_element\"]\n",
- "fbs_element = metadata[\"meta_fbs_element\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "49d1ba9e-c419-4568-828f-abbc73b5edef",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get unique codes\n",
- "codes_fbs = set(fbs_bulk.index.get_level_values(\"element_code\"))\n",
- "codes_fbsh = set(fbsh_bulk.index.get_level_values(\"element_code\"))\n",
- "# Find missing codes\n",
- "miss_in_fbs = codes_fbsh.difference(codes_fbs)\n",
- "miss_in_fbsh = codes_fbs.difference(codes_fbsh)\n",
- "# Print\n",
- "print(\"- FBSH but not FBS:\", fbsh_element.loc[miss_in_fbs, \"element\"].to_dict())\n",
- "print(\"- FBS but not FBSH:\", fbs_element.loc[miss_in_fbsh, \"element\"].to_dict())"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3a47588f-b35b-4029-92d1-22b0bacc9862",
- "metadata": {},
- "source": [
- "First, we check that each element code has just one unit associated. Next, we verify that in both datasets we have the same mappings `code -> name`, `code -> unit` and `code -> description`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ac64c512-3e2b-4b08-97e3-437e24b56519",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Only one unit per element code\n",
- "assert fbs_bulk.reset_index().groupby(\"element_code\").unit.nunique().max() == 1\n",
- "assert fbsh_bulk.reset_index().groupby(\"element_code\").unit.nunique().max() == 1"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c093dc21-525e-4839-be5b-f202c807fa5a",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Given an element code, we have the same element name, unit and description in fbs and fbsh\n",
- "x = fbs_element.merge(fbsh_element, left_index=True, right_index=True)\n",
- "assert (x.element_x.astype(str) == x.element_y.astype(str)).all()\n",
- "assert (x.unit_x.astype(str) == x.unit_y.astype(str)).all()\n",
- "assert (x.description_x.astype(str) == x.description_y.astype(str)).all()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9b4346e1-c9a4-4c83-8b53-e146943a6f91",
- "metadata": {},
- "source": [
- "#### `Flag`\n",
- "Next, we compare which flags appear in each dataset. We observe that some flags only appear in one of the datasets. This is fine."
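The set arithmetic behind these comparisons, with hypothetical flag sets:

codes_fbs = {"Im", "*", "A"}
codes_fbsh = {"S", "SD", "F", "A"}
# difference() gives what the first set has that the second lacks, in each direction.
print("- FBSH but not FBS:", codes_fbsh.difference(codes_fbs))  # {'S', 'SD', 'F'}
print("- FBS but not FBSH:", codes_fbs.difference(codes_fbsh))  # {'Im', '*'}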
- ]
- },
- {
- "cell_type": "markdown",
- "id": "69e54bec-be1c-44d8-93ed-4ee6cb0dec98",
- "metadata": {},
- "source": [
- "In particular:\n",
- "- `Im` (Imputed) is most common in the new dataset, whereas `S` (Standardized data) was in the old one.\n",
- "- `Im` (Imputed) and `*` (Unofficial) appear first in the new FBS.\n",
- "- `nan` (Official data), `SD` (Statistical Discrepancy) and `F` (FAO estimate) appear only in the old FBSH."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "92809a3c-490e-4a58-9628-21bff768f6fd",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get unique codes\n",
- "codes_fbs = set(fbs_bulk.index.get_level_values(\"flag\"))\n",
- "codes_fbsh = set(fbsh_bulk.index.get_level_values(\"flag\"))\n",
- "# Find missing codes\n",
- "miss_in_fbs = codes_fbsh.difference(codes_fbs)\n",
- "miss_in_fbsh = codes_fbs.difference(codes_fbsh)\n",
- "# Print\n",
- "print(\"- FBSH but not FBS:\", miss_in_fbs)\n",
- "print(\"- FBS but not FBSH:\", miss_in_fbsh)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b5bfc929-2f78-4a27-9a0c-0395cddf30ef",
- "metadata": {},
- "outputs": [],
- "source": [
- "pd.value_counts(fbsh_bulk.index.get_level_values(\"flag\").fillna(\"nan\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d865d23d-6ca5-4b4d-b131-e1402ecb92da",
- "metadata": {},
- "outputs": [],
- "source": [
- "pd.value_counts(fbs_bulk.index.get_level_values(\"flag\").fillna(\"nan\"))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "bdcb53c2-f0b1-4522-9c3f-e428fb3d6504",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Merge dataset\n",
- "The moment has arrived. Now we attempt to merge both FBS and FBSH datasets into one: the FBSC dataset. For this, we will be merging several files:\n",
- "- **bulk file**: The data itself.\n",
- "- **item file**: The file containing the mapping from item code to item name.\n",
- "- **element file**: The file containing the mapping from element to element name and unit.\n",
- "\n",
- "In addition, we will transition from `Area Code ---> Country`."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "525e7395-81a1-4ba5-b31d-63c37880bdf7",
- "metadata": {},
- "source": [
- "### `Area`\n",
- "In this step, we standardise the country names. We first go from `Area Code` to `Area` (country name as per the FAO), and then `Area` to `Country`, using our country standardisation file."
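The standardisation pipeline of this section on a three-row toy; the area codes are invented, the name pairs are taken from the mapping file above:

import pandas as pd

country_mapping = {"Viet Nam": "Vietnam", "Republic of Moldova": "Moldova"}
fbsc_area = pd.DataFrame({"country": ["Viet Nam", "Republic of Moldova", "Unmapped Region"]}, index=[237, 146, 999])
msk = fbsc_area.country.isin(country_mapping)
# Unmapped areas are dropped; the rest become an area code -> standardised name lookup.
map_area = fbsc_area.loc[msk, "country"].replace(country_mapping).sort_index().to_dict()
assert map_area == {146: "Moldova", 237: "Vietnam"}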
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "09c4e2f7-6aa5-4146-9e0a-aeacc879734e",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load our country standardisation file\n",
- "with open(COUNTRY_MAPPING) as f:\n",
- " country_mapping = json.load(f)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "66f3e32f-82d5-45d6-9dda-ba7b3aef4915",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Merge both datasets Area Code -> Area mapping dataframe\n",
- "fbsc_area = pd.concat([fbs_area, fbsh_area]).drop_duplicates(subset=[\"country\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b7c83d72-94d5-4f29-9343-577f214a114a",
- "metadata": {},
- "outputs": [],
- "source": [
- "fbsc_area[fbsc_area.country.apply(lambda x: \"sudan\" in x.lower())]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4a9550f7-c4e4-4e87-a5c7-8045e51cce27",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Check which countries will be discarded based on our country standardisation file (those without a mapped standardised name)\n",
- "msk = fbsc_area.country.isin(country_mapping)\n",
- "print(fbsc_area.loc[~msk, \"country\"].tolist())"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "acc35845-98bb-4227-9f4a-d734c6b4ff1f",
- "metadata": {},
- "source": [
- "Finally, we build the `Area Code ---> Country` mapping dictionary."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6187dae0-6333-4910-b0d2-4dd54114a6c8",
- "metadata": {},
- "outputs": [],
- "source": [
- "map_area = fbsc_area.loc[msk, \"country\"].replace(country_mapping).sort_index().to_dict()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5ae9be53-e478-4572-846f-5831f7ff1b09",
- "metadata": {},
- "source": [
- "### `Item`\n",
- "Merging the item dataframe is straightforward. There are some exceptions, which we accept, due to the renaming of items such as Groundnuts and Rice."
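The merge in the next cell, reduced to its core: because fbs is listed first in the concat, its rows win whenever a (group, item) pair appears in both tables.

import pandas as pd

fbs_item_toy = pd.DataFrame({"item_group": ["Cereals"], "item": ["Rice and products"]})
fbsh_item_toy = pd.DataFrame({"item_group": ["Cereals"], "item": ["Rice and products"]})
combined = pd.concat([fbs_item_toy, fbsh_item_toy]).drop_duplicates(subset=["item_group", "item"])
assert len(combined) == 1  # the duplicated pair collapses to the first (fbs) row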
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c1c2e31a-dc5c-40aa-8309-c7f5d37a79e0",
- "metadata": {},
- "outputs": [],
- "source": [
- "fbsc_item = pd.concat([fbs_item, fbsh_item]).drop_duplicates(subset=[\"item_group\", \"item\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "be235d1a-4e8a-4acc-9d56-9f124353b4f0",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Check differences are as expected\n",
- "a = fbs_item.index\n",
- "b = fbsh_item.index\n",
- "c = fbsc_item.index\n",
- "\n",
- "assert not {cc for cc in c if cc not in a}.difference(\n",
- " {\n",
- " (2905, 2805),\n",
- " (2901, 2805),\n",
- " (2903, 2805),\n",
- " (2901, 2556),\n",
- " (2913, 2556),\n",
- " (2903, 2556),\n",
- " (2960, 2769),\n",
- " }\n",
- ")\n",
- "\n",
- "assert not {cc for cc in c if cc not in b}.difference(\n",
- " {\n",
- " (2905, 2807),\n",
- " (2901, 2807),\n",
- " (2903, 2807),\n",
- " (2901, 2552),\n",
- " (2913, 2552),\n",
- " (2903, 2552),\n",
- " (2961, 2769),\n",
- " }\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6401adf5-d9c7-4d45-94c2-97e18cc52533",
- "metadata": {},
- "outputs": [],
- "source": [
- "# fbsh_item.loc[2960, 2769]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "bbf2f8dd-ff44-4c94-8a9d-5789c72d2330",
- "metadata": {},
- "outputs": [],
- "source": [
- "# fbs_item.loc[2961, 2769]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7d3beea4-c7a0-4771-ba7d-da4e73689c93",
- "metadata": {},
- "outputs": [],
- "source": [
- "fbsc_item = fbsc_item[[\"item_group\", \"item\"]]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "940c66d3-1fe1-45cf-ad28-594658e431f6",
- "metadata": {},
- "source": [
- "### `Element`\n",
- "We merge the element and unit dataframes, in order to obtain all the info in one table. Next, we combine both FBS and FBSH datasets."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8c900886-ca25-407b-a102-662a7bf106fa",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load unit table\n",
- "fbs_unit = metadata[\"meta_fbs_unit\"]\n",
- "fbsh_unit = metadata[\"meta_fbsh_unit\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0a11e3cd-07a2-4c35-96cb-cb4ab424a733",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Merge element and unit\n",
- "fbs_element_unit = fbs_element.merge(\n",
- " fbs_unit.rename(columns={\"description\": \"unit_description\"}),\n",
- " left_on=\"unit\",\n",
- " right_index=True,\n",
- ")\n",
- "assert fbs_element_unit.shape[0] == fbs_element.shape[0]\n",
- "\n",
- "fbsh_element_unit = fbsh_element.merge(\n",
- " fbsh_unit.rename(columns={\"description\": \"unit_description\"}),\n",
- " left_on=\"unit\",\n",
- " right_index=True,\n",
- ")\n",
- "assert fbsh_element_unit.shape[0] == fbsh_element.shape[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ac368715-7c07-4788-a1aa-1f7c284c8893",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Merge\n",
- "fbsc_element_unit = pd.concat([fbs_element_unit, fbsh_element_unit]).drop_duplicates(\n",
- " subset=[\"element\", \"unit\", \"unit_description\"]\n",
- ")\n",
- "assert fbsc_element_unit.shape == fbsh_element_unit.shape == fbs_element_unit.shape"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "324c8b72-ae6d-43dc-ac0b-1fd5febdaa4b",
- "metadata": {},
- "source": [
- "### `Bulk`\n",
- "Time to merge the core of the dataset, the bulk file! We do this by:\n",
- "- Concatenating both datasets\n",
- "- Renaming `Area Code --> Country`\n",
- "- Dropping unused columns (`Unit`, `Area Code`)\n",
- "- Dropping data for the population (`2501`) item.\n",
- "- Adding a `variable_name` column with a more descriptive label for each row."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "edbccbda-e226-40df-8b22-a6574331d0eb",
- "metadata": {},
- "outputs": [],
- "source": [
- "fbsc_bulk = pd.concat([fbs_bulk, fbsh_bulk])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a7133574-bcff-48af-9c5a-dd3950759c35",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Filter countries + Area Code -> Country\n",
- "index_new = [(col_map := {\"area_code\": \"country\"}).get(x, x) for x in fbsc_bulk.index.names]\n",
- "fbsc_bulk = fbsc_bulk.loc[map_area].reset_index()\n",
- "fbsc_bulk[col_map[\"area_code\"]] = fbsc_bulk[\"area_code\"].replace(map_area).tolist()\n",
- "fbsc_bulk = fbsc_bulk.set_index(index_new)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "320bed95-f461-4a8a-8d15-956bc7f98ec6",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Drop Unit, Area Code\n",
- "fbsc_bulk = fbsc_bulk.drop(columns=[\"unit\", \"area_code\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a8430241-3bd2-4253-b641-be3706cee654",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Drop population (2501) item\n",
- "msk = fbsc_bulk.index.get_level_values(\"item_code\").isin([2501])\n",
- "fbsc_bulk = fbsc_bulk[~msk]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "558856dd-322b-4175-85f8-5057046f4468",
- "metadata": {},
- "source": [
- "#### Variable name\n",
- "The variable name is built using the name of the item, element and unit: `item - element - [unit]`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "28e82671-58d7-4751-b15d-387d44dfdd09",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get item names\n",
- "fbsc_item_ = build_item_all_df(fbsc_item)\n",
- "map_items = fbsc_item_.astype(str)[\"name\"].to_dict()\n",
- "item_names = [map_items[i] for i in fbsc_bulk.index.get_level_values(\"item_code\")]\n",
- "# Get Element + Unit names\n",
- "x = fbsc_element_unit.reset_index()\n",
- "y = list(x[\"element\"].astype(str) + \" [\" + x[\"unit\"].astype(str) + \"]\")\n",
- "map_elems = dict(zip(x[\"element_code\"], y))\n",
- "elem_names = [map_elems[el] for el in fbsc_bulk.index.get_level_values(2)]\n",
- "# Construct variable name\n",
- "variable_names = [f\"{i} - {e}\" for i, e in zip(item_names, elem_names)]\n",
- "# Add variable name to index\n",
- "fbsc_bulk[\"variable_name\"] = variable_names\n",
- "fbsc_bulk = fbsc_bulk.reset_index()\n",
- "fbsc_bulk = fbsc_bulk.set_index([\"country\", \"item_code\", \"element_code\", \"variable_name\", \"year\", \"flag\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e06c09d4-ccd0-4aec-a51d-65e18bab2814",
- "metadata": {},
- "outputs": [],
- "source": [
- "fbsc_bulk.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "113594aa-b358-4baa-8ab5-38b7a6dad4d7",
- "metadata": {},
- "source": [
- "## Create Garden dataset"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "68ee93f1-77ce-47c6-be02-b523732d2bcf",
- "metadata": {},
- "source": [
- "### Metadata\n",
- "First, we create the metadata for this new dataset, FBSC. Most of its content comes from concatenating FBS and FBSH fields. The checksum field is left as `None`, as it is unclear what we should use here."
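A skeletal version of the metadata merge performed in the next cells; the Source entries are placeholders, and this assumes owid.catalog.meta exposes a Source class alongside DatasetMeta:

from owid.catalog.meta import DatasetMeta, Source

meta = DatasetMeta(
    namespace="faostat",
    short_name="faostat_fbsc",
    title="Food Balance: Food Balances (-2013 old methodology and 2014-) - FAO (2017, 2021)",
    description="(taken from fbsh, asserted identical to fbs)",
    # The combined dataset simply concatenates the source lists of its two parents.
    sources=[Source(name="FAO (FBSH)"), Source(name="FAO (FBS)")],
)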
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bec146ab-3f8d-432e-b51b-0e03b247febb", - "metadata": {}, - "outputs": [], - "source": [ - "from owid.catalog.meta import DatasetMeta" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92f8a6a6-5610-4769-a64e-0452e4fcbe23", - "metadata": {}, - "outputs": [], - "source": [ - "# Check description field in FBS and FBSH\n", - "assert fbsh_meadow.metadata.description == fbs_meadow.metadata.description\n", - "\n", - "# Define metadata\n", - "metadata = DatasetMeta(\n", - " namespace=\"faostat\",\n", - " short_name=\"faostat_fbsc\",\n", - " title=\"Food Balance: Food Balances (-2013 old methodology and 2014-) - FAO (2017, 2021)\",\n", - " description=fbsh_meadow.metadata.description,\n", - " sources=fbsh_meadow.metadata.sources + fbs_meadow.metadata.sources,\n", - " licenses=fbsh_meadow.metadata.licenses + fbs_meadow.metadata.licenses,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "28b4e7bd-88d1-4747-bced-4650794a75be", - "metadata": {}, - "source": [ - "### Create dataset and add tables\n", - "Finally, we add the tables to the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ac66fe58-5dbd-4445-b255-c7d4f2ce91bf", - "metadata": {}, - "outputs": [], - "source": [ - "fbsc_garden = catalog.Dataset.create_empty(dest_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2d01fe85-7479-445a-b69c-4266d911f992", - "metadata": {}, - "outputs": [], - "source": [ - "# Propagate metadata\n", - "fbsc_garden.metadata = metadata\n", - "fbsc_garden.save()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f3df32ca-fcd9-40de-9ee7-6ce65a04737f", - "metadata": {}, - "outputs": [], - "source": [ - "# Add bulk table\n", - "fbsc_bulk.metadata.short_name = \"bulk\"\n", - "fbsc_garden.add(fbsc_bulk)\n", - "# Add table items\n", - "fbsc_item.metadata.short_name = \"meta_item\"\n", - "fbsc_garden.add(fbsc_item)\n", - "# Add table elements\n", - "fbsc_element_unit.metadata = fbs_element.metadata\n", - "fbsc_element_unit.metadata.description = (\n", - " \"List of elements, with their units and the respective descriptions of both. 
It also includes the element codes.\"\n", - ")\n", - "fbsc_garden.add(fbsc_element_unit)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "08c11810-9fc1-4512-bcb4-edecbfaa8aac", - "metadata": {}, - "outputs": [], - "source": [ - "fbsc_garden.save()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_ef.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_ef.py deleted file mode 100644 index 9969bfd421f..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_ef.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_ef dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_ei.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_ei.py deleted file mode 100644 index 5bd7d23db88..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_ei.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_ei dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_ek.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_ek.py deleted file mode 100644 index c6ec4c862e8..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_ek.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_ek dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_el.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_el.py deleted file mode 100644 index 43b06ade38c..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_el.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_el dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_emn.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_emn.py deleted file mode 100644 index 5f12637ec70..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_emn.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_emn dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_ep.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_ep.py deleted file mode 100644 index 0a44564deea..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_ep.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_ep dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_esb.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_esb.py deleted file mode 100644 index 2a4896b9edf..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_esb.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_esb dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_fa.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_fa.py deleted file mode 100644 index e594773a567..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_fa.py +++ 
/dev/null
@@ -1,2 +0,0 @@
-"""FAOSTAT garden step for faostat_fa dataset."""
-from .shared import run # noqa:F401
diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_fbsc.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_fbsc.py
deleted file mode 100644
index 5e4e4e576f8..00000000000
--- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_fbsc.py
+++ /dev/null
@@ -1,256 +0,0 @@
-"""FAOSTAT garden step for faostat_fbsc dataset (Food Balances Combined).
-
-Combine the old and new food balances datasets:
-* `faostat_fbsh`: Old (historical) dataset.
-* `faostat_fbs`: Current dataset.
-
-A new (combined) dataset will be generated: "faostat_fbsc".
-
-This is because a new version of the Food Balances dataset was launched in 2014 with a slightly new methodology:
-https://fenixservices.fao.org/faostat/static/documents/FBS/New%20FBS%20methodology.pdf
-
-NOTE: It seems that FAOSTAT is extending the coverage of the new methodology, so the year of intersection of both
-datasets may move earlier over time. The global variable `FBS_FIRST_YEAR` may have to be redefined in a future
-update.
-
-"""
-
-import json
-from copy import deepcopy
-from typing import cast
-
-import pandas as pd
-from owid import catalog
-from owid.catalog.meta import DatasetMeta, TableMeta
-from owid.datautils import dataframes
-from shared import (
- ADDED_TITLE_TO_WIDE_TABLE,
- LATEST_VERSIONS_FILE,
- NAMESPACE,
- VERSION,
- add_per_capita_variables,
- add_regions,
- clean_data,
- harmonize_elements,
- harmonize_items,
- log,
- prepare_long_table,
- prepare_wide_table,
- remove_outliers,
-)
-
-from etl.paths import DATA_DIR, STEP_DIR
-
-# Dataset name.
-DATASET_SHORT_NAME = f"{NAMESPACE}_fbsc"
-
-# First year for which we have data in fbs dataset (it defines the first year when the new methodology is used).
-FBS_FIRST_YEAR = 2010
-DATASET_TITLE = f"Food Balances (old methodology before {FBS_FIRST_YEAR}, and new from {FBS_FIRST_YEAR} onwards)"
-
-
-def combine_fbsh_and_fbs_datasets(
- fbsh_dataset: catalog.Dataset,
- fbs_dataset: catalog.Dataset,
-) -> pd.DataFrame:
- """Combine `faostat_fbsh` and `faostat_fbs` meadow datasets.
-
- Parameters
- ----------
- fbsh_dataset : catalog.Dataset
- Meadow `faostat_fbsh` dataset.
- fbs_dataset : catalog.Dataset
- Meadow `faostat_fbs` dataset.
-
- Returns
- -------
- fbsc : pd.DataFrame
- Combination of the tables of the two input datasets (as a dataframe, not a dataset).
-
- """
- # Sanity checks.
- error = "Description of fbs and fbsh datasets is different."
- assert fbsh_dataset.metadata.description == fbs_dataset.metadata.description, error
- error = "Licenses of fbsh and fbs are different."
- assert fbsh_dataset.metadata.licenses == fbs_dataset.metadata.licenses, error
-
- # Load dataframes for fbs and fbsh datasets.
- fbsh = pd.DataFrame(fbsh_dataset["faostat_fbsh"]).reset_index()
- fbs = pd.DataFrame(fbs_dataset["faostat_fbs"]).reset_index()
-
- # Harmonize items and elements in both datasets.
- fbsh = harmonize_items(df=fbsh, dataset_short_name="faostat_fbsh")
- fbsh = harmonize_elements(df=fbsh)
- fbs = harmonize_items(df=fbs, dataset_short_name="faostat_fbs")
- fbs = harmonize_elements(df=fbs)
-
- # Ensure there is no overlap in data between the two datasets, and that there is no gap between them.
- assert fbs["year"].min() == FBS_FIRST_YEAR, f"First year of fbs dataset is not {FBS_FIRST_YEAR}"
- if fbsh["year"].max() >= fbs["year"].min():
- # There is overlapping data between fbsh and fbs datasets. Prioritising fbs over fbsh.
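# A toy illustration, not part of the original step: the trim below drops any
# fbsh rows at or after the first fbs year, so the new methodology takes
# precedence on overlapping years. Years here are made up.
import pandas as pd

_fbsh_toy = pd.DataFrame({"year": [2008, 2009, 2010, 2011]})
_fbsh_toy = _fbsh_toy.loc[_fbsh_toy["year"] < 2010].reset_index(drop=True)
assert _fbsh_toy["year"].tolist() == [2008, 2009]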
- fbsh = fbsh.loc[fbsh["year"] < fbs["year"].min()].reset_index(drop=True) - if (fbsh["year"].max() + 1) < fbs["year"].min(): - log.warning("Data is missing for one or more years between fbsh and fbs datasets.") - - # Sanity checks. - # Ensure that the items in fbsh and fbs are identical (known discrepancies are amended via shared.ITEM_AMENDMENTS). - error = "Mismatch between items in fbsh and fbs. Redefine shared.ITEM_AMENDMENTS." - assert set(fbsh["item"]) == set(fbs["item"]), error - # Some elements are found in fbs but not in fbsh. This is understandable, since fbs is - # more recent and may have additional elements. However, ensure that there are no - # elements in fbsh that are not in fbs. - error = "There are elements in fbsh that are not in fbs." - assert set(fbsh["element"]) < set(fbs["element"]), error - - # Concatenate old and new dataframes using a function that keeps categoricals (see the sketch below). - fbsc = dataframes.concatenate([fbsh, fbs]).sort_values(["area", "year"]).reset_index(drop=True) - - # Ensure that each element has only one unit and one description. - error = "Some elements in the combined dataset have more than one unit." - assert fbsc.groupby("element")["unit"].nunique().max() == 1, error - - return cast(pd.DataFrame, fbsc) - - -def _assert_df_size(df: pd.DataFrame, size_mb: float) -> None: - """Check that dataframe is smaller than given size to prevent OOM errors.""" - real_size_mb = df.memory_usage(deep=True).sum() / 1e6 - assert real_size_mb <= size_mb, f"DataFrame size is too big: {real_size_mb} MB > {size_mb} MB" - - -def run(dest_dir: str) -> None: - #################################################################################################################### - # Common definitions. - #################################################################################################################### - - # Load file of versions. - latest_versions = pd.read_csv(LATEST_VERSIONS_FILE).set_index(["channel", "dataset"]) - - # Find path to latest version of fbsh dataset. - fbsh_version = latest_versions.loc["meadow", "faostat_fbsh"].item() - fbsh_file = DATA_DIR / "meadow" / NAMESPACE / fbsh_version / "faostat_fbsh" - # Find path to latest version of fbs dataset. - fbs_version = latest_versions.loc["meadow", "faostat_fbs"].item() - fbs_file = DATA_DIR / "meadow" / NAMESPACE / fbs_version / "faostat_fbs" - # Path to dataset of FAOSTAT metadata. - garden_metadata_dir = DATA_DIR / "garden" / NAMESPACE / VERSION / f"{NAMESPACE}_metadata" - - # Path to outliers file. - outliers_file = STEP_DIR / "data" / "garden" / NAMESPACE / VERSION / "detected_outliers.json" - - #################################################################################################################### - # Load data. - #################################################################################################################### - - # Load fbsh and fbs. - log.info("faostat_fbsc.loading_datasets") - fbsh_dataset = catalog.Dataset(fbsh_file) - fbs_dataset = catalog.Dataset(fbs_file) - - # Load dataset of FAOSTAT metadata. - metadata = catalog.Dataset(garden_metadata_dir) - - # Load and prepare dataset, items and element-units metadata.
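# ----------------------------------------------------------------------------
# Aside: a minimal, self-contained sketch of why `combine_fbsh_and_fbs_datasets`
# above concatenates with the category-preserving `dataframes.concatenate`
# helper rather than plain `pd.concat`. The frames and values below are
# invented for illustration; `union_categoricals` is just one way to obtain
# the same effect with plain pandas.
# ----------------------------------------------------------------------------
import pandas as pd
from pandas.api.types import union_categoricals

# Two frames whose categorical "item" columns have different categories,
# mimicking fbsh (old) and fbs (new).
fbsh = pd.DataFrame({"item": pd.Categorical(["Wheat", "Rice"]), "year": [2008, 2009]})
fbs = pd.DataFrame({"item": pd.Categorical(["Rice", "Maize"]), "year": [2010, 2011]})

# Plain concat silently falls back to object dtype when categories differ,
# which would explode memory on the large FAOSTAT tables.
naive = pd.concat([fbsh, fbs], ignore_index=True)
assert naive["item"].dtype == object

# Unioning the categories first keeps the column categorical after concat.
categories = union_categoricals([fbsh["item"], fbs["item"]]).categories
fbsh["item"] = fbsh["item"].cat.set_categories(categories)
fbs["item"] = fbs["item"].cat.set_categories(categories)
kept = pd.concat([fbsh, fbs], ignore_index=True)
assert isinstance(kept["item"].dtype, pd.CategoricalDtype)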
- datasets_metadata = pd.DataFrame(metadata["datasets"]).reset_index() - datasets_metadata = datasets_metadata[datasets_metadata["dataset"] == DATASET_SHORT_NAME].reset_index(drop=True) - items_metadata = pd.DataFrame(metadata["items"]).reset_index() - items_metadata = items_metadata[items_metadata["dataset"] == DATASET_SHORT_NAME].reset_index(drop=True) - elements_metadata = pd.DataFrame(metadata["elements"]).reset_index() - elements_metadata = elements_metadata[elements_metadata["dataset"] == DATASET_SHORT_NAME].reset_index(drop=True) - countries_metadata = pd.DataFrame(metadata["countries"]).reset_index() - - # Load file of detected outliers. - with open(outliers_file, "r") as _json_file: - outliers = json.loads(_json_file.read()) - - #################################################################################################################### - # Process data. - #################################################################################################################### - - # Combine fbsh and fbs datasets. - log.info( - "faostat_fbsc.combine_fbsh_and_fbs_datasets", - fbsh_shape=fbsh_dataset["faostat_fbsh"].shape, - fbs_shape=fbs_dataset["faostat_fbs"].shape, - ) - data = combine_fbsh_and_fbs_datasets(fbsh_dataset, fbs_dataset) - - _assert_df_size(data, 2000) - - # Prepare data. - data = clean_data( - data=data, - items_metadata=items_metadata, - elements_metadata=elements_metadata, - countries_metadata=countries_metadata, - ) - - # Add data for aggregate regions. - data = add_regions(data=data, elements_metadata=elements_metadata) - - # Add per-capita variables. - data = add_per_capita_variables(data=data, elements_metadata=elements_metadata) - - # Remove outliers from data. - data = remove_outliers(data, outliers=outliers) - - # Avoid objects as they would explode memory, use categoricals instead. - for col in data.columns: - assert data[col].dtype != object, f"Column {col} should not have object type" - - _assert_df_size(data, 2000) - - # Create a long table (with item code and element code as part of the index). - log.info("faostat_fbsc.prepare_long_table", shape=data.shape) - data_table_long = prepare_long_table(data=data) - - _assert_df_size(data_table_long, 2000) - - # Create a wide table (with only country and year as index). - log.info("faostat_fbsc.prepare_wide_table", shape=data.shape) - data_table_wide = prepare_wide_table(data=data) - - #################################################################################################################### - # Prepare outputs. - #################################################################################################################### - - log.info("faostat_fbsc.prepare_outputs") - - # Initialize new garden dataset. - dataset_garden = catalog.Dataset.create_empty(dest_dir) - # Define metadata for new fbsc garden dataset (by default, take metadata from fbs dataset). - fbsc_sources = deepcopy(fbs_dataset.metadata.sources[0]) - fbsc_sources.source_data_url = None - fbsc_sources.owid_data_url = None - # Check that the title assigned here coincides with the one in custom_datasets.csv (for consistency). - error = "Dataset title given to fbsc is different to the one in custom_datasets.csv. Update the latter file." - assert DATASET_TITLE == datasets_metadata["owid_dataset_title"].item(), error - dataset_garden_metadata = DatasetMeta( - namespace=NAMESPACE, - short_name=DATASET_SHORT_NAME, - title=DATASET_TITLE, - # Take description from any of the datasets (since they should be identical). 
- description=datasets_metadata["owid_dataset_description"].item(), - # For sources and licenses, assume those of fbs. - sources=[fbsc_sources], - licenses=fbs_dataset.metadata.licenses, - version=VERSION, - ) - dataset_garden.metadata = dataset_garden_metadata - # Create new dataset in garden. - dataset_garden.save() - - # Prepare metadata for new garden long table. - data_table_long.metadata = TableMeta(short_name=DATASET_SHORT_NAME) - data_table_long.metadata.title = dataset_garden_metadata.title - data_table_long.metadata.description = dataset_garden_metadata.description - # Add long table to the dataset (no need to repack, since columns already have optimal dtypes). - dataset_garden.add(data_table_long, repack=False) - - # Prepare metadata for new garden wide table (starting with the metadata from the long table). - data_table_wide.metadata = deepcopy(data_table_long.metadata) - data_table_wide.metadata.title += ADDED_TITLE_TO_WIDE_TABLE - data_table_wide.metadata.short_name += "_flat" - data_table_wide.metadata.primary_key = list(data_table_wide.index.names) - # Add wide table to the dataset (no need to repack, since columns already have optimal dtypes). - dataset_garden.add(data_table_wide, repack=False) diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_fo.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_fo.py deleted file mode 100644 index bba98a5e224..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_fo.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_fo dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_food_explorer.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_food_explorer.py deleted file mode 100644 index 34bb40f922d..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_food_explorer.py +++ /dev/null @@ -1,510 +0,0 @@ -"""Dataset feeding the global food explorer. - -Load the qcl and fbsc (combination of fbsh and fbs) datasets, and create a combined dataset of food items (now called -products). - -The resulting dataset will later be loaded by the `explorer/food_explorer` which feeds our -[Global food explorer](https://ourworldindata.org/explorers/global-food). - -""" - -from copy import deepcopy -from typing import cast - -import pandas as pd -from owid import catalog -from owid.catalog.meta import DatasetMeta -from owid.datautils import dataframes -from shared import LATEST_VERSIONS_FILE, NAMESPACE, VERSION - -from etl.data_helpers import geo -from etl.paths import DATA_DIR - -# Dataset name and title. -DATASET_TITLE = "Food Explorer" -DATASET_SHORT_NAME = f"{NAMESPACE}_food_explorer" -DATASET_DESCRIPTION = ( - "This dataset has been created by Our World in Data, merging existing FAOSTAT datasets. In " - "particular, we have used 'Crops and livestock products' (QCL) and 'Food Balances' (FBSH and " - "FBS) datasets. Each row contains all the metrics for a specific combination of (country, " - "product, year). The metrics may come from different datasets." -) - -# List of items (OWID names) to include in the global food explorer. 
-# Note: The names of the products will be further edited in owid-content, according to the following file: -# https://github.com/owid/owid-content/blob/master/scripts/global-food-explorer/foods.csv -PRODUCTS = [ - "Almonds", - "Animal fats", - "Apples", - "Apricots", - "Areca nuts", - "Artichokes", - "Asparagus", - "Avocados", - "Bananas", - "Barley", - "Beans, dry", - "Beeswax", - "Blueberries", - "Brazil nuts, with shell", - "Broad beans", - "Buckwheat", - "Buffalo hides", - "Butter and ghee", - "Cabbages", - "Canary seed", - "Carrots and turnips", - "Cashew nuts", - "Cassava", - "Castor oil seed", - "Cattle hides", - "Cauliflowers and broccoli", - "Cereals", - "Cheese", - "Cherries", - "Chestnut", - "Chickpeas", - "Chillies and peppers", - "Citrus Fruit", - "Cocoa beans", - "Coconut oil", - "Coconuts", - "Coffee, green", - "Cotton", - "Cottonseed", - "Cottonseed oil", - "Cow peas", - "Cranberries", - "Cucumbers and gherkins", - "Currants", - "Dates", - "Eggplants", - "Eggs", - "Eggs from hens", - "Eggs from other birds (excl. hens)", - "Fat, buffaloes", - "Fat, camels", - "Fat, cattle", - "Fat, goats", - "Fat, pigs", - "Fat, sheep", - "Fibre crops", - "Fish and seafood", - "Flax fibre", - "Fruit", - "Garlic", - "Grapefruit", - "Grapes", - "Beans, green", - "Green maize", - "Groundnut oil", - "Groundnuts", - "Hazelnuts", - "Hempseed", - "Herbs (e.g. fennel)", - "Honey", - "Jute", - "Karite nuts", - "Kiwi", - "Kola nuts", - "Leeks", - "Lemons and limes", - "Lentils", - "Lettuce", - "Linseed", - "Linseed oil", - "Maize", - "Maize oil", - "Mangoes", - "Margarine", - "Meat, total", - "Meat, ass", - "Meat, beef", - "Meat, beef and buffalo", - "Meat, buffalo", - "Meat, camel", - "Meat, chicken", - "Meat, duck", - "Meat, game", - "Meat, goat", - "Meat, goose and guinea fowl", - "Meat, horse", - "Meat, lamb and mutton", - "Meat, mule", - "Meat, pig", - "Meat, poultry", - "Meat, rabbit", - "Meat, sheep and goat", - "Meat, turkey", - "Melon", - "Melonseed", - "Milk", - "Millet", - "Mixed grains", - "Molasses", - "Mushrooms", - "Mustard seed", - "Nuts", - "Oats", - "Offals", - "Offals, buffaloes", - "Offals, camels", - "Offals, cattle", - "Offals, goats", - "Offals, horses", - "Offals, pigs", - "Offals, sheep", - "Oilcrops", - "Oilcrops, Cake Equivalent", - "Oilcrops, Oil Equivalent", - "Okra", - "Olive oil", - "Olives", - "Onions", - "Oranges", - "Palm fruit oil", - "Palm kernel oil", - "Palm kernels", - "Palm oil", - "Papayas", - "Peaches and nectarines", - "Pears", - "Peas, dry", - "Peas, green", - "Pepper", - "Pigeon peas", - "Pineapples", - "Pistachios", - "Plantains", - "Plums", - "Poppy seeds", - "Pork", - "Potatoes", - "Pulses", - "Quinoa", - "Rapeseed", - "Rapeseed oil", - "Raspberries", - "Rice", - "Roots and tubers", - "Rye", - "Safflower oil", - "Safflower seed", - "Seed cotton", - "Sesame oil", - "Sesame seed", - "Silk", - "Skins, goat", - "Skins, sheep", - "Sorghum", - "Soybean oil", - "Soybeans", - "Spinach", - "Strawberries", - "String beans", - "Sugar (raw)", - "Sugar beet", - "Sugar cane", - "Sugar crops", - "Sunflower oil", - "Sunflower seed", - "Sweet potatoes", - "Tangerines", - "Tea", - "Tobacco", - "Tomatoes", - "Total", - "Treenuts", - "Vegetables", - "Walnuts", - "Watermelons", - "Wheat", - "Whey", - "Wine", - "Wool", - "Yams", -] -# OWID item name, element name, and unit name for population (as given in faostat_qcl and faostat_fbsc datasets).
-FAO_POPULATION_ITEM_NAME = "Population" -FAO_POPULATION_ELEMENT_NAME = "Total Population - Both sexes" -FAO_POPULATION_UNIT = "1000 persons" - -# List of element codes to consider from faostat_qcl. -ELEMENT_CODES_QCL = [ - "005312", - "005313", - "005314", - "005318", - "005320", - "005321", - "005410", - "005413", - "005417", - "005419", - "005420", - "005422", - "005424", - "005510", - "005513", - "5312pc", - "5320pc", - "5321pc", - "5510pc", -] -# List of element codes to consider from faostat_fbsc. -ELEMENT_CODES_FBSC = [ - "000645", - "000664", - "000674", - "000684", - "005072", - "005123", - "005131", - "005142", - "005154", - "005170", - "005171", - "005301", - # Element 'Production' (in tonnes, originally given in 1000 tonnes) is taken from qcl. - # Although fbsc has items for this element that are not in qcl, they overlap in a number of items with slightly - # different values. To avoid this issue, we ignore the element from fbsc and use only the one in qcl. - # '005511', - "005521", - "005527", - "005611", - "005911", - "0645pc", - "0664pc", - "0674pc", - "0684pc", - "5123pc", - "5142pc", - "5154pc", - "5301pc", - "5521pc", - "5611pc", - "5911pc", - # The following element code is for population. - "000511", -] - - -def combine_qcl_and_fbsc(qcl_table: catalog.Table, fbsc_table: catalog.Table) -> pd.DataFrame: - """Combine garden `faostat_qcl` and `faostat_fbsc` datasets. - - Parameters - ---------- - qcl_table : catalog.Table - Main table (in long format) of the `faostat_qcl` dataset. - fbsc_table : catalog.Table - Main table (in long format) of the `faostat_fbsc` dataset. - - Returns - ------- - combined : pd.DataFrame - Combined data (as a dataframe, not a table). - - """ - columns = [ - "country", - "year", - "item_code", - "element_code", - "item", - "element", - "unit", - "unit_short_name", - "value", - "population_with_data", - ] - qcl = pd.DataFrame(qcl_table).reset_index()[columns] - # Select relevant element codes. - qcl = qcl[qcl["element_code"].isin(ELEMENT_CODES_QCL)].reset_index(drop=True) - qcl["value"] = qcl["value"].astype(float) - qcl["element"] = [element for element in qcl["element"]] - qcl["unit"] = [unit for unit in qcl["unit"]] - qcl["item"] = [item for item in qcl["item"]] - fbsc = pd.DataFrame(fbsc_table).reset_index()[columns] - # Select relevant element codes. - fbsc = fbsc[fbsc["element_code"].isin(ELEMENT_CODES_FBSC)].reset_index(drop=True) - fbsc["value"] = fbsc["value"].astype(float) - fbsc["element"] = [element for element in fbsc["element"]] - fbsc["unit"] = [unit for unit in fbsc["unit"]] - fbsc["item"] = [item for item in fbsc["item"]] - - rename_columns = {"item": "product"} - combined = ( - dataframes.concatenate([qcl, fbsc], ignore_index=True).rename(columns=rename_columns).reset_index(drop=True) - ) - - # Sanity checks. - assert len(combined) == (len(qcl) + len(fbsc)), "Unexpected number of rows after combining qcl and fbsc datasets." - - assert len(combined[combined["value"].isnull()]) == 0, "Unexpected nan values." - - n_items_per_item_code = combined.groupby("item_code")["product"].transform("nunique") - assert combined[n_items_per_item_code > 1].empty, "There are item codes with multiple items." - - n_elements_per_element_code = combined.groupby("element_code")["element"].transform("nunique") - assert combined[n_elements_per_element_code > 1].empty, "There are element codes with multiple elements." 
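# ----------------------------------------------------------------------------
# Aside: the sanity checks above (and several more in the metadata step below)
# all rely on the same idiom: groupby(...).transform("nunique") returns a
# row-aligned series, so the offending rows can be selected and inspected
# directly, instead of only learning that *some* code is ambiguous. A small
# sketch with invented values:
# ----------------------------------------------------------------------------
import pandas as pd

df = pd.DataFrame(
    {
        "element_code": ["005312", "005312", "005313"],
        "element": ["Area harvested", "Area Harvested", "Yield"],
    }
)
n_elements_per_code = df.groupby("element_code")["element"].transform("nunique")
conflicting = df[n_elements_per_code > 1]
assert not conflicting.empty  # "005312" maps to two different spellings.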
- - n_units_per_element_code = combined.groupby("element_code")["unit"].transform("nunique") - assert combined[n_units_per_element_code > 1].empty, "There are element codes with multiple units." - - error = "There are unexpected duplicate rows. Rename items in custom_items.csv to avoid clashes." - assert combined[combined.duplicated(subset=["product", "country", "year", "element", "unit"])].empty, error - - return cast(pd.DataFrame, combined) - - -def get_fao_population(combined: pd.DataFrame) -> pd.DataFrame: - """Extract the FAO population data from data (in long format). - - Parameters - ---------- - combined : pd.DataFrame - Combination of `faostat_qcl` and `faostat_fbsc` data (although this function could also be applied to just - `faostat_fbsc` data, since `faostat_qcl` does not contain FAO population data). - - Returns - ------- - fao_population : pd.DataFrame - Population (by country and year) according to FAO, extracted from the `faostat_fbsc` dataset. - - """ - # Select the item and element that correspond to population values. - fao_population = combined[ - (combined["product"] == FAO_POPULATION_ITEM_NAME) & (combined["element"] == FAO_POPULATION_ELEMENT_NAME) - ].reset_index(drop=True) - - # Check that population is given in "1000 persons" and convert to persons. - error = "FAOSTAT population changed item, element, or unit." - assert list(fao_population["unit"].unique()) == [FAO_POPULATION_UNIT], error - fao_population["value"] *= 1000 - - # Drop missing values and prepare output dataframe. - fao_population = ( - fao_population[["country", "year", "value"]].dropna(how="any").rename(columns={"value": "fao_population"}) - ) - - return fao_population - - -def process_combined_data(combined: pd.DataFrame) -> pd.DataFrame: - """Process combined data (combination of `faostat_qcl` and `faostat_fbsc` data) to have the content and format - required by the food explorer. - - Parameters - ---------- - combined : pd.DataFrame - Combination of `faostat_qcl` and `faostat_fbsc` data. - - Returns - ------- - data_wide : pd.DataFrame - Processed data (in wide format). - - """ - combined = combined.copy() - - # Get FAO population from data (it is given as another item). - fao_population = get_fao_population(combined=combined) - - # Check that all expected products are included in the data. - missing_products = sorted(set(PRODUCTS) - set(combined["product"])) - assert len(missing_products) == 0, f"{len(missing_products)} missing products for food explorer." - - # Select relevant products for the food explorer. - combined = combined[combined["product"].isin(PRODUCTS)].reset_index(drop=True) - - # Join element and unit into one title column. - combined["title"] = combined["element"] + " (" + combined["unit"] + ")" - - # Pivot to a wide table with one column per title and (product, country, year) as index. - index_columns = ["product", "country", "year"] - data_wide = combined.pivot(index=index_columns, columns=["title"], values="value").reset_index() - - # Add column for FAO population. - data_wide = pd.merge(data_wide, fao_population, on=["country", "year"], how="left") - - # Add column for OWID population. - data_wide = geo.add_population_to_dataframe(df=data_wide, warn_on_missing_countries=False) - - # Fill gaps in OWID population with FAO population (for "* (FAO)" countries, i.e. countries that were not - # harmonized and for which there is no OWID population). - # Then drop "fao_population", since it is no longer needed.
- data_wide["population"] = data_wide["population"].fillna(data_wide["fao_population"]) - data_wide = data_wide.drop(columns="fao_population") - - assert len(data_wide.columns[data_wide.isnull().all(axis=0)]) == 0, "Unexpected columns with only nan values." - - # Set a reasonable index. - data_wide = data_wide.set_index(index_columns, verify_integrity=True) - - return data_wide - - -def run(dest_dir: str) -> None: - #################################################################################################################### - # Load data. - #################################################################################################################### - - # Load file of versions. - latest_versions = pd.read_csv(LATEST_VERSIONS_FILE).set_index(["channel", "dataset"]) - - # Path to latest qcl and fbsc datasets in garden. - qcl_latest_version = latest_versions.loc["garden", f"{NAMESPACE}_qcl"].item() - qcl_latest_dir = DATA_DIR / "garden" / NAMESPACE / qcl_latest_version / f"{NAMESPACE}_qcl" - fbsc_latest_version = latest_versions.loc["garden", f"{NAMESPACE}_fbsc"].item() - fbsc_latest_dir = DATA_DIR / "garden" / NAMESPACE / fbsc_latest_version / f"{NAMESPACE}_fbsc" - - # Load qcl dataset and keep its metadata. - qcl_dataset = catalog.Dataset(qcl_latest_dir) - fbsc_dataset = catalog.Dataset(fbsc_latest_dir) - - # Get qcl long table inside qcl dataset. - qcl_table = qcl_dataset[f"{NAMESPACE}_qcl"] - # Idem for fbsc. - fbsc_table = fbsc_dataset[f"{NAMESPACE}_fbsc"] - - #################################################################################################################### - # Process data. - #################################################################################################################### - - # Combine `faostat_qcl` and `faostat_fbsc` data. - data = combine_qcl_and_fbsc(qcl_table=qcl_table, fbsc_table=fbsc_table) - - # Prepare data in the format required by the food explorer. - data = process_combined_data(combined=data) - - #################################################################################################################### - # Save outputs. - #################################################################################################################### - - # Initialize new garden dataset. - explorer_dataset = catalog.Dataset.create_empty(dest_dir) - # Define metadata for new garden dataset (by default, take metadata from fbsc dataset). - explorer_sources = deepcopy(fbsc_dataset.metadata.sources[0]) - explorer_sources.source_data_url = None - explorer_sources.owid_data_url = None - explorer_dataset.metadata = DatasetMeta( - namespace=NAMESPACE, - short_name=DATASET_SHORT_NAME, - title=DATASET_TITLE, - description=DATASET_DESCRIPTION, - sources=fbsc_dataset.metadata.sources + qcl_dataset.metadata.sources, - licenses=fbsc_dataset.metadata.licenses + qcl_dataset.metadata.licenses, - version=VERSION, - ) - # Create new dataset in garden. - explorer_dataset.save() - # Create table of products. - table = catalog.Table(data) - # Make all column names snake_case. - table = catalog.utils.underscore_table(table) - # Add metadata for the table. - table.metadata.short_name = "all_products" - table.metadata.primary_key = list(table.index.names) - # Add table to dataset. 
- explorer_dataset.add(table) diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_fs.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_fs.py deleted file mode 100644 index a836381fb94..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_fs.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_fs dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_lc.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_lc.py deleted file mode 100644 index ab508fd95ad..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_lc.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_lc dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_metadata.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_metadata.py deleted file mode 100644 index 1000fa9084c..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_metadata.py +++ /dev/null @@ -1,977 +0,0 @@ -"""FAOSTAT garden step for faostat_metadata dataset. - -This step reads from: -* The (additional) metadata dataset. The only crucial ingredients from here (that will be used later on in other garden - steps) are element, item and units descriptions, and country groups (used to check that we do not double count - countries when aggregating data for regions). -* Custom datasets file ('./custom_datasets.csv'). -* Custom elements and units file ('./custom_elements_and_units.csv'). -* Custom items file ('./custom_items.csv'). -* Each of the individual meadow datasets. They are loaded to extract their countries, items, elements and units, and - some sanity checks are performed. - -This step will: -* Output a dataset (to be loaded by all garden datasets) with tables 'countries', 'datasets', 'elements' and 'items'. -* Apply sanity checks to countries, elements, items, and units. -* Apply custom names and descriptions to datasets, elements, items and units. -* Harmonize country names. -* Find countries that correspond to aggregates of other countries (e.g. 'Melanesia'). -* Ensure there are no degeneracies within a dataset (i.e. ensure each index is unique). -* Ensure there are no degeneracies between datasets (using dataset, item_code, element_code as keys). - -There are some non-trivial issues with the definitions of items at FAOSTAT: -* Some item codes in the data are missing in the metadata, and vice versa. -* The mapping item_code -> item in the data files is sometimes different from the mapping item_code -> item - in the (additional) metadata dataset. Some examples: - * In dataset qv, item code 221 in the data corresponds to item "Almonds, in shell", whereas in the metadata, - item code 221 corresponds to item "Almonds, with shell", which is the same item, but with a slightly different - name. This happens with many items. On the website (https://www.fao.org/faostat/en/?#data/QV) they seem to be - using the naming from the metadata. We can safely ignore this issue, and stick to the names in the data. - * In dataset sdgb, item codes have very unusual names, and they are not found in the metadata. We haven't figured - out the root of the issue yet. - -There are several cases in which one or a few item codes in the data are missing in the metadata. Also, there are -several cases in which an item code in the data has an item name slightly different in the metadata.
But these are not -important issues (since we use item_code to merge different datasets, and we use metadata only to fetch descriptions). -However, for some domains there are too many differences between items in the data and in the metadata (as explained -above). For this reason, we raise a warning only when the number of issues found exceeds a reasonable threshold. - -""" - -import json -import sys -from copy import deepcopy -from typing import Dict, List, Tuple, cast - -import pandas as pd -from owid import catalog -from owid.datautils import dataframes, io -from shared import ( - FLAGS_RANKING, - LATEST_VERSIONS_FILE, - NAMESPACE, - VERSION, - harmonize_elements, - harmonize_items, - log, - optimize_table_dtypes, -) -from tqdm.auto import tqdm - -from etl.paths import DATA_DIR, STEP_DIR - -# Define short name for output dataset. -DATASET_SHORT_NAME = f"{NAMESPACE}_metadata" - -# Minimum number of issues in the comparison of items and item codes from data and metadata to raise a warning. -N_ISSUES_ON_ITEMS_FOR_WARNING = 10 - - -def load_latest_data_table_for_dataset(dataset_short_name: str) -> catalog.Table: - """Load data table (in long format) from the latest version of a dataset for a given domain. - - Parameters - ---------- - dataset_short_name : str - Dataset short name (e.g. 'faostat_qcl'). - - Returns - ------- - table : catalog.Table - Latest version of table in long format for given domain. - - """ - # Path to folder with all versions of meadow datasets for FAOSTAT. - meadow_dir = DATA_DIR / "meadow" / NAMESPACE - # Load file of versions. - latest_versions = pd.read_csv(LATEST_VERSIONS_FILE).set_index(["channel", "dataset"]) - # Find latest meadow version for given dataset. - dataset_version = latest_versions.loc["meadow", dataset_short_name].item() - # Path to latest dataset folder. - dataset_path = meadow_dir / dataset_version / dataset_short_name - assert dataset_path.is_dir(), f"Dataset {dataset_short_name} not found in meadow." - # Load dataset. - dataset = catalog.Dataset(dataset_path) - assert len(dataset.table_names) == 1 - # Load table in long format from dataset. - table = dataset[dataset_short_name] - - return table - - -def create_dataset_descriptions_dataframe_for_domain(table: catalog.Table, dataset_short_name: str) -> pd.DataFrame: - """Create a single row dataframe with the dataset name, title and description, for a given domain. - - Parameters - ---------- - table : catalog.Table - Latest table for considered domain. - dataset_short_name : str - Dataset short name (e.g. 'faostat_qcl'). - - Returns - ------- - dataset_descriptions_df : pd.DataFrame - Dataframe of name, title and description of a domain. - - """ - dataset_descriptions_df = pd.DataFrame( - { - "dataset": [dataset_short_name], - "fao_dataset_title": [table.metadata.dataset.title], - "fao_dataset_description": [table.metadata.dataset.description], - } - ) - - return dataset_descriptions_df - - -def clean_global_dataset_descriptions_dataframe( - datasets_df: pd.DataFrame, custom_datasets: pd.DataFrame -) -> pd.DataFrame: - """Apply sanity checks to the dataframe gathered from the data of each individual dataset, and add custom dataset - titles and descriptions. - - Parameters - ---------- - datasets_df : pd.DataFrame - Dataframe of descriptions gathered from the data of each individual dataset. - custom_datasets : pd.DataFrame - Data from the custom_datasets.csv file. - - Returns - ------- - datasets_df : pd.Dataframe - Clean dataframe of dataset titles and descriptions (customized and original FAO ones).
- - """ - datasets_df = datasets_df.copy() - - # Check that the dataset descriptions of fbsh and fbs are identical. - error = ( - "Datasets fbsh and fbs have different descriptions. " - "This may happen in the future: Simply check that nothing significant has changed and remove this assertion." - ) - assert ( - datasets_df[datasets_df["dataset"] == "faostat_fbsh"]["fao_dataset_description"].item() - == datasets_df[datasets_df["dataset"] == "faostat_fbs"]["fao_dataset_description"].item() - ), error - # Drop row for fbsh, and rename "fbs" to "fbsc" (since this will be the name for the combined dataset). - datasets_df = datasets_df[datasets_df["dataset"] != "faostat_fbsh"].reset_index(drop=True) - datasets_df.loc[datasets_df["dataset"] == "faostat_fbs", "dataset"] = "faostat_fbsc" - - # Add custom dataset titles. - datasets_df = pd.merge( - datasets_df, - custom_datasets, - on="dataset", - how="left", - suffixes=("_new", "_old"), - ) - - changed_titles = datasets_df[datasets_df["fao_dataset_title_old"] != datasets_df["fao_dataset_title_new"]] - changed_descriptions = datasets_df[ - datasets_df["fao_dataset_description_old"] != datasets_df["fao_dataset_description_new"] - ] - if len(changed_titles) > 0: - log.warning(f"{len(changed_titles)} domains have changed titles, consider updating custom_datasets.csv.") - if len(changed_descriptions) > 0: - log.warning( - f"{len(changed_descriptions)} domains have changed descriptions. " f"Consider updating custom_datasets.csv." - ) - - datasets_df = datasets_df.drop(columns=["fao_dataset_title_old", "fao_dataset_description_old"]).rename( - columns={ - "fao_dataset_title_new": "fao_dataset_title", - "fao_dataset_description_new": "fao_dataset_description", - } - ) - - datasets_df["owid_dataset_title"] = datasets_df["owid_dataset_title"].fillna(datasets_df["fao_dataset_title"]) - error = "Custom titles for different datasets are equal. Edit custom_datasets.csv file." - assert len(set(datasets_df["dataset"])) == len(set(datasets_df["owid_dataset_title"])), error - - # Add custom descriptions. - datasets_df["owid_dataset_description"] = datasets_df["owid_dataset_description"].fillna( - datasets_df["fao_dataset_description"] - ) - - # Reorder columns. - datasets_df = datasets_df[ - [ - "dataset", - "fao_dataset_title", - "owid_dataset_title", - "fao_dataset_description", - "owid_dataset_description", - ] - ] - - return datasets_df - - -def create_items_dataframe_for_domain( - table: catalog.Table, metadata: catalog.Dataset, dataset_short_name: str -) -> pd.DataFrame: - """Apply sanity checks to the items of a table in a dataset, and to the items from the metadata, harmonize all item - codes and items, and add item descriptions. - - Parameters - ---------- - table : catalog.Table - Data for a given domain. - metadata: catalog.Dataset - Metadata dataset from meadow. - dataset_short_name : str - Dataset short name (e.g. 'faostat_qcl'). - - Returns - ------- - items_from_data : pd.Dataframe - Item names and descriptions (customized ones and FAO original ones) for a particular domain. - - """ - df = pd.DataFrame(table).reset_index() - - # Load items from data. - items_from_data = ( - df.rename(columns={"item": "fao_item"})[["item_code", "fao_item"]].drop_duplicates().reset_index(drop=True) - ) - # Ensure items are well constructed and amend already known issues (defined in shared.ITEM_AMENDMENTS). - items_from_data = harmonize_items(df=items_from_data, dataset_short_name=dataset_short_name, item_col="fao_item") - - # Load items from metadata. 
- items_columns = { - "item_code": "item_code", - "item": "fao_item", - "description": "fao_item_description", - } - _items_df = ( - metadata[f"{dataset_short_name}_item"] - .reset_index()[list(items_columns)] - .rename(columns=items_columns) - .drop_duplicates() - .sort_values(list(items_columns.values())) - .reset_index(drop=True) - ) - _items_df = harmonize_items(df=_items_df, dataset_short_name=dataset_short_name, item_col="fao_item") - _items_df["fao_item_description"] = _items_df["fao_item_description"].astype("string") - - # Add descriptions (from metadata) to items (from data). - items_from_data = ( - pd.merge(items_from_data, _items_df, on=["item_code", "fao_item"], how="left") - .sort_values(["item_code", "fao_item"]) - .reset_index(drop=True) - ) - items_from_data["dataset"] = dataset_short_name - items_from_data["fao_item_description"] = items_from_data["fao_item_description"].fillna("") - - # Sanity checks for items in current dataset: - - # Check that in data, there is only one item per item code. - n_items_per_item_code = items_from_data.groupby("item_code")["fao_item"].transform("nunique") - error = f"Multiple items for a given item code in dataset {dataset_short_name}." - assert items_from_data[n_items_per_item_code > 1].empty, error - - # Check that all item codes in data are defined in metadata, and check that the mapping item code -> item in - # the data is the same as in the metadata (which often is not the case). - compared = pd.merge( - items_from_data[["item_code", "fao_item"]], - _items_df[["item_code", "fao_item"]], - on="item_code", - how="left", - suffixes=("_in_data", "_in_metadata"), - ) - different_items = compared[compared["fao_item_in_data"] != compared["fao_item_in_metadata"]] - missing_item_codes = set(items_from_data["item_code"]) - set(_items_df["item_code"]) - if (len(different_items) + len(missing_item_codes)) > N_ISSUES_ON_ITEMS_FOR_WARNING: - log.warning( - f"{len(missing_item_codes)} item codes in {dataset_short_name} missing in metadata. " - f"{len(different_items)} item codes in data mapping to different items in metadata." - ) - - return items_from_data - - -def clean_global_items_dataframe(items_df: pd.DataFrame, custom_items: pd.DataFrame) -> pd.DataFrame: - """Apply global sanity checks to items gathered from all datasets, and create a clean global items dataframe. - - Parameters - ---------- - items_df : pd.DataFrame - Items dataframe gathered from all domains. - custom_items : pd.DataFrame - Data from custom_items.csv file. - - Returns - ------- - items_df : pd.DataFrame - Clean global items dataframe. - - """ - items_df = items_df.copy() - - # Check that fbs and fbsh have the same item codes and items, remove one of them, and rename the other to fbsc. - check = pd.merge( - items_df[items_df["dataset"] == "faostat_fbsh"].reset_index(drop=True)[["item_code", "fao_item"]], - items_df[items_df["dataset"] == "faostat_fbs"].reset_index(drop=True)[["item_code", "fao_item"]], - how="outer", - on=["item_code"], - suffixes=("_fbsh", "_fbs"), - ) - assert (check["fao_item_fbsh"] == check["fao_item_fbs"]).all() - # Drop all rows for fbsh, and rename "fbs" to "fbsc" (since this will be the name for the combined dataset). - items_df = items_df[items_df["dataset"] != "faostat_fbsh"].reset_index(drop=True) - items_df.loc[items_df["dataset"] == "faostat_fbs", "dataset"] = "faostat_fbsc" - - # Add custom item names.
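# ----------------------------------------------------------------------------
# Aside: the merge below (and the analogous ones for datasets and elements)
# follows a single recurring pattern: left-join the freshly gathered FAO
# metadata to the custom CSV with suffixes=("_new", "_old"), then compare the
# paired columns to detect upstream changes that warrant updating the custom
# file. A sketch with invented values:
# ----------------------------------------------------------------------------
import pandas as pd

current = pd.DataFrame({"dataset": ["faostat_qcl"], "fao_dataset_title": ["Crops and livestock products"]})
custom = pd.DataFrame({"dataset": ["faostat_qcl"], "fao_dataset_title": ["Production: Crops"]})
merged = pd.merge(current, custom, on="dataset", how="left", suffixes=("_new", "_old"))
changed = merged[merged["fao_dataset_title_new"] != merged["fao_dataset_title_old"]]
if not changed.empty:
    print(f"{len(changed)} titles changed upstream; consider updating the custom CSV.")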
- items_df = pd.merge( - items_df, - custom_items.rename(columns={"fao_item": "fao_item_check"}), - on=["dataset", "item_code"], - how="left", - suffixes=("_new", "_old"), - ) - - changed_descriptions = items_df[ - (items_df["fao_item_description_old"] != items_df["fao_item_description_new"]) - & (items_df["fao_item_description_old"].notnull()) - ] - if len(changed_descriptions) > 0: - log.warning( - f"{len(changed_descriptions)} domains have changed descriptions. " - f"Consider updating custom_items.csv." - ) - - items_df = items_df.drop(columns="fao_item_description_old").rename( - columns={"fao_item_description_new": "fao_item_description"} - ) - - error = "Item names may have changed with respect to custom items file. Update custom items file." - assert ( - items_df[items_df["fao_item_check"].notnull()]["fao_item_check"] - == items_df[items_df["fao_item_check"].notnull()]["fao_item"] - ).all(), error - items_df = items_df.drop(columns=["fao_item_check"]) - - # Assign original FAO name to all owid items that do not have a custom name. - items_df["owid_item"] = items_df["owid_item"].fillna(items_df["fao_item"]) - - # Add custom item descriptions, and assign original FAO descriptions to items that do not have a custom description. - items_df["owid_item_description"] = items_df["owid_item_description"].fillna(items_df["fao_item_description"]) - - # Check that we have not introduced ambiguities when assigning custom item names. - n_owid_items_per_item_code = items_df.groupby(["dataset", "item_code"])["owid_item"].transform("nunique") - error = "Multiple owid items for a given item code in a dataset." - assert items_df[n_owid_items_per_item_code > 1].empty, error - - items_df = ( - items_df[ - [ - "dataset", - "item_code", - "fao_item", - "owid_item", - "fao_item_description", - "owid_item_description", - ] - ] - .sort_values(["dataset", "item_code"]) - .reset_index(drop=True) - ) - - return items_df - - -def create_elements_dataframe_for_domain( - table: catalog.Table, metadata: catalog.Dataset, dataset_short_name: str -) -> pd.DataFrame: - """Apply sanity checks to the elements and units of a table in a dataset, and to the elements and units from the - metadata, harmonize all element codes, and add descriptions. - - Parameters - ---------- - table : catalog.Table - Data for a given domain. - metadata: catalog.Dataset - Additional metadata dataset from meadow. - dataset_short_name : str - Dataset short name (e.g. 'faostat_qcl'). - - Returns - ------- - elements_from_data : pd.Dataframe - Element names and descriptions and unit names and descriptions (customized ones and FAO original ones) for a - particular domain. - - """ - - df = pd.DataFrame(table).reset_index() - # Load elements from data. - elements_from_data = ( - df.rename(columns={"element": "fao_element", "unit": "fao_unit_short_name"})[ - ["element_code", "fao_element", "fao_unit_short_name"] - ] - .drop_duplicates() - .reset_index(drop=True) - ) - # Ensure element_code is always a string of a fixed number of characters. - elements_from_data = harmonize_elements(df=elements_from_data, element_col="fao_element") - - # Load elements from metadata.
- elements_columns = { - "element_code": "element_code", - "element": "fao_element", - "description": "fao_element_description", - } - _elements_df = ( - metadata[f"{dataset_short_name}_element"] - .reset_index()[list(elements_columns)] - .rename(columns=elements_columns) - .drop_duplicates() - .sort_values(list(elements_columns.values())) - .reset_index(drop=True) - ) - _elements_df = harmonize_elements(df=_elements_df, element_col="fao_element") - _elements_df["fao_element_description"] = _elements_df["fao_element_description"].astype("string") - - # Load units metadata. - units_columns = { - "unit_name": "fao_unit_short_name", - "description": "fao_unit", - } - _units_df = ( - metadata[f"{dataset_short_name}_unit"] - .reset_index()[list(units_columns)] - .rename(columns=units_columns) - .drop_duplicates() - .sort_values(list(units_columns.values())) - .reset_index(drop=True) - ) - _units_df["fao_unit"] = _units_df["fao_unit"].astype("string") - - # Add element descriptions (from metadata). - elements_from_data = ( - pd.merge( - elements_from_data, - _elements_df, - on=["element_code", "fao_element"], - how="left", - ) - .sort_values(["element_code", "fao_element"]) - .reset_index(drop=True) - ) - elements_from_data["dataset"] = dataset_short_name - elements_from_data["fao_element_description"] = elements_from_data["fao_element_description"].fillna("") - - # Add unit descriptions (from metadata). - elements_from_data = ( - pd.merge(elements_from_data, _units_df, on=["fao_unit_short_name"], how="left") - .sort_values(["fao_unit_short_name"]) - .reset_index(drop=True) - ) - elements_from_data["fao_unit"] = elements_from_data["fao_unit"].fillna(elements_from_data["fao_unit_short_name"]) - - # Sanity checks: - - # Check that in data, there is only one unit per element code. - n_units_per_element_code = df.groupby("element_code")["unit"].transform("nunique") - error = f"Multiple units for a given element code in dataset {dataset_short_name}." - assert df[n_units_per_element_code > 1].empty, error - - # Check that in data, there is only one element per element code. - n_elements_per_element_code = elements_from_data.groupby("element_code")["fao_element"].transform("nunique") - error = f"Multiple elements for a given element code in dataset {dataset_short_name}." - assert elements_from_data[n_elements_per_element_code > 1].empty, error - - return elements_from_data - - -def clean_global_elements_dataframe(elements_df: pd.DataFrame, custom_elements: pd.DataFrame) -> pd.DataFrame: - """Apply global sanity checks to elements and units gathered from all datasets, and create a clean global elements - and units dataframe. - - Parameters - ---------- - elements_df : pd.DataFrame - Elements and units dataframe gathered from all domains. - custom_elements : pd.DataFrame - Data from custom_elements_and_units.csv file. - - Returns - ------- - elements_df : pd.DataFrame - Clean global elements and units dataframe. - - """ - elements_df = elements_df.copy() - - # Check that all elements of fbsh are in fbs (although fbs may contain additional elements). - assert set(elements_df[elements_df["dataset"] == "faostat_fbsh"]["element_code"]) <= set( - elements_df[elements_df["dataset"] == "faostat_fbs"]["element_code"] - ) - # Drop all rows for fbsh, and rename "fbs" to "fbsc" (since this will be the name for the combined dataset).
- elements_df = elements_df[elements_df["dataset"] != "faostat_fbsh"].reset_index(drop=True) - elements_df.loc[elements_df["dataset"] == "faostat_fbs", "dataset"] = "faostat_fbsc" - - elements_df = pd.merge( - elements_df, - custom_elements.rename( - columns={ - "fao_element": "fao_element_check", - "fao_unit_short_name": "fao_unit_short_name_check", - } - ), - on=["dataset", "element_code"], - how="left", - suffixes=("_new", "_old"), - ) - - changed_units = elements_df[ - (elements_df["fao_unit_new"] != elements_df["fao_unit_old"]) & (elements_df["fao_unit_old"].notnull()) - ] - if len(changed_units) > 0: - log.warning(f"{len(changed_units)} domains have changed units, consider updating custom_elements.csv.") - - changed_descriptions = elements_df[ - (elements_df["fao_element_description_new"] != elements_df["fao_element_description_old"]) - & (elements_df["fao_element_description_old"].notnull()) - ] - if len(changed_descriptions) > 0: - log.warning( - f"{len(changed_descriptions)} domains have changed descriptions. " f"Consider updating custom_elements.csv." - ) - - elements_df = elements_df.drop(columns=["fao_unit_old", "fao_element_description_old"]).rename( - columns={ - "fao_element_description_new": "fao_element_description", - "fao_unit_new": "fao_unit", - } - ) - - error = "Element names have changed with respect to custom elements file. Update custom elements file." - assert ( - elements_df[elements_df["fao_element_check"].notnull()]["fao_element_check"] - == elements_df[elements_df["fao_element_check"].notnull()]["fao_element"] - ).all(), error - elements_df = elements_df.drop(columns=["fao_element_check"]) - - error = "Unit names have changed with respect to custom elements file. Update custom elements file." - assert ( - elements_df[elements_df["fao_unit_short_name_check"].notnull()]["fao_unit_short_name_check"] - == elements_df[elements_df["fao_unit_short_name_check"].notnull()]["fao_unit_short_name"] - ).all(), error - elements_df = elements_df.drop(columns=["fao_unit_short_name_check"]) - - # Assign original FAO names where there is no custom one. - elements_df["owid_element"] = elements_df["owid_element"].fillna(elements_df["fao_element"]) - elements_df["owid_unit"] = elements_df["owid_unit"].fillna(elements_df["fao_unit"]) - elements_df["owid_element_description"] = elements_df["owid_element_description"].fillna( - elements_df["fao_element_description"] - ) - elements_df["owid_unit_short_name"] = elements_df["owid_unit_short_name"].fillna(elements_df["fao_unit_short_name"]) - - # Assume variables were not per capita if was_per_capita is not given, and convert the column to boolean. - elements_df["was_per_capita"] = elements_df["was_per_capita"].fillna("0").replace({"0": False, "1": True}) - - # Idem for variables to make per capita. - elements_df["make_per_capita"] = elements_df["make_per_capita"].fillna("0").replace({"0": False, "1": True}) - - # Check that we have not introduced ambiguities when assigning custom element or unit names. - n_owid_elements_per_element_code = elements_df.groupby(["dataset", "element_code"])["owid_element"].transform( - "nunique" - ) - error = "Multiple owid elements for a given element code in a dataset." - assert elements_df[n_owid_elements_per_element_code > 1].empty, error - - # Check that we have not introduced ambiguities when assigning custom element or unit names.
- n_owid_units_per_element_code = elements_df.groupby(["dataset", "element_code"])["owid_unit"].transform("nunique") - error = "Multiple owid units for a given element code in a dataset." - assert elements_df[n_owid_units_per_element_code > 1].empty, error - - # NOTE: We assert that there is one element for each element code. But the opposite may not be true: there can be - # multiple element codes with the same element. And idem for items. - - return elements_df - - -def clean_global_countries_dataframe( - countries_in_data: pd.DataFrame, - country_groups: Dict[str, List[str]], - countries_harmonization: Dict[str, str], -) -> pd.DataFrame: - """Clean dataframe of countries gathered from the data of the individual domains, harmonize country names (and - country names of members of regions), and create a clean global countries dataframe. - - Parameters - ---------- - countries_in_data : pd.DataFrame - Countries gathered from the data of all domains. - country_groups : dict - Countries and their members, gathered from the data. - countries_harmonization : dict - Mapping of country names (from FAO names to OWID names). - - Returns - ------- - countries_df : pd.DataFrame - Clean global countries dataframe. - - """ - countries_df = countries_in_data.copy() - - # Remove duplicates of area_code and fao_country, ensuring to keep m49_code when it is given. - if "m49_code" in countries_df.columns: - # Sort so that nans in m49_code are at the bottom, and then keep only the first duplicated row. - countries_df = countries_df.sort_values("m49_code") - countries_df = ( - countries_df.drop_duplicates(subset=["area_code", "fao_country"], keep="first") - .sort_values(["area_code"]) - .reset_index(drop=True) - ) - - countries_not_harmonized = sorted(set(countries_df["fao_country"]) - set(countries_harmonization)) - if len(countries_not_harmonized) > 0: - log.info( - f"{len(countries_not_harmonized)} countries not included in countries file. " - f"They will not have data after countries are harmonized in a further step." - ) - - # Harmonize country groups and members. - country_groups_harmonized = { - countries_harmonization[group]: sorted([countries_harmonization[member] for member in country_groups[group]]) - for group in country_groups - if group in countries_harmonization - } - - # Harmonize country names. - countries_df["country"] = dataframes.map_series( - series=countries_df["fao_country"], - mapping=countries_harmonization, - warn_on_unused_mappings=True, - make_unmapped_values_nan=True, - show_full_warning=False, - ) - - # Add country members to countries dataframe. - countries_df["members"] = dataframes.map_series( - series=countries_df["country"], - mapping=country_groups_harmonized, - make_unmapped_values_nan=True, - ) - - # Feather does not support object types, so convert column of lists to column of strings. - countries_df["members"] = [ - json.dumps(members) if isinstance(members, list) else members for members in countries_df["members"] - ] - - return countries_df - - -def create_table(df: pd.DataFrame, short_name: str, index_cols: List[str]) -> catalog.Table: - """Create a table with optimal format and basic metadata, out of a dataframe. - - Parameters - ---------- - df : pd.DataFrame - Input dataframe. - short_name : str - Short name to add in the metadata of the new table. - index_cols : list - Columns to use as indexes of the new table. - - Returns - ------- - table : catalog.Table - New table.
- - """ - table = catalog.Table(df).copy() - - # Optimize column dtypes before storing feather file, and ensure codes are categories (instead of ints). - table = optimize_table_dtypes(table) - - # Set indexes and other necessary metadata. - table = table.set_index(index_cols, verify_integrity=True) - table.metadata.short_name = short_name - table.metadata.primary_key = index_cols - - return cast(catalog.Table, table) - - -def check_that_flag_definitions_in_dataset_agree_with_those_in_flags_ranking( - metadata: catalog.Dataset, -) -> None: - """Check that the definition of flags in the additional metadata for current dataset agree with the ones we have - manually written down in our flags ranking (raise error otherwise). - - Parameters - ---------- - metadata : catalog.Dataset - Additional metadata dataset (that must contain one table for current dataset). - - """ - for table_name in metadata.table_names: - if "flag" in table_name: - flag_df = metadata[table_name].reset_index() - comparison = pd.merge(FLAGS_RANKING, flag_df, on="flag", how="inner") - error_message = ( - f"Flag definitions in file {table_name} are different to those in our flags ranking. " - f"Redefine shared.FLAGS_RANKING." - ) - assert (comparison["description"] == comparison["flags"]).all(), error_message - - -def check_that_all_flags_in_dataset_are_in_ranking(table: catalog.Table, metadata_for_flags: catalog.Table) -> None: - """Check that all flags found in current dataset are defined in our flags ranking (raise error otherwise). - - Parameters - ---------- - table : pd.DataFrame - Data table for current dataset. - metadata_for_flags : catalog.Table - Flags for current dataset, as defined in dataset of additional metadata. - - """ - if not set(table["flag"]) < set(FLAGS_RANKING["flag"]): - missing_flags = set(table["flag"]) - set(FLAGS_RANKING["flag"]) - flags_data = pd.DataFrame(metadata_for_flags).reset_index() - if set(missing_flags) < set(flags_data["flag"]): - message = "Missing flags. Copy the following lines to FLAGS_RANKING (and put them in the right order):" - for i, j in pd.DataFrame(metadata_for_flags).loc[list(missing_flags)].iterrows(): - message += f"\n{(i, j['flags'])}," - log.warning(message) - else: - log.warning( - f"Missing flags. {missing_flags} are not defined in additional metadata. Get definition from " - f"https://www.fao.org/faostat/en/#definitions" - ) - raise AssertionError("Flags in dataset not found in FLAGS_RANKING. Manually add those flags.") - - -def process_metadata( - metadata: catalog.Dataset, - custom_datasets: pd.DataFrame, - custom_elements: pd.DataFrame, - custom_items: pd.DataFrame, - countries_harmonization: Dict[str, str], -) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """Apply various sanity checks, gather data (about dataset, item, element and unit names and descriptions) from all - domains, compare with data from its corresponding metadata file, and create clean dataframes of metadata about - dataset, elements, units, items, and countries. - - Parameters - ---------- - metadata : catalog.Dataset - Additional metadata dataset from meadow. - custom_datasets : pd.DataFrame - Data from custom_datasets.csv file. - custom_elements : pd.DataFrame - Data from custom_elements_and_units.csv file. - custom_items : pd.DataFrame - Data from custom_items.csv file. - countries_harmonization : dict - Data from faostat.countries.json file. - - Returns - ------- - countries_df : pd.DataFrame - Clean dataframe of global countries. 
- datasets_df : pd.DataFrame - Clean dataframe of global dataset names and descriptions. - elements_df : pd.DataFrame - Clean dataframe of global element and unit names and descriptions. - items_df : pd.DataFrame - Clean dataframe of global item names and descriptions. - - """ - # Check if flags definitions need to be updated. - check_that_flag_definitions_in_dataset_agree_with_those_in_flags_ranking(metadata) - - # List all FAOSTAT dataset short names. - dataset_short_names = sorted( - set([NAMESPACE + "_" + table_name.split("_")[1] for table_name in metadata.table_names]) - ) - - # Initialise dataframe of dataset descriptions, items, and element-units. - # We cannot remove "dataset" from the items and elements dataframes, because it can happen that, for a given - # item code, the item name is slightly different in two different datasets. - datasets_df = pd.DataFrame({"dataset": [], "fao_dataset_title": [], "fao_dataset_description": []}) - items_df = pd.DataFrame({"dataset": [], "item_code": [], "fao_item": [], "fao_item_description": []}) - elements_df = pd.DataFrame( - { - "dataset": [], - "element_code": [], - "fao_element": [], - "fao_element_description": [], - "fao_unit": [], - "fao_unit_short_name": [], - } - ) - - # Initialise list of all countries in all datasets, and all country groups. - countries_in_data = pd.DataFrame({"area_code": [], "fao_country": []}).astype({"area_code": "Int64"}) - country_groups_in_data: Dict[str, List[str]] = {} - # Gather all variables from the latest version of each meadow dataset. - for dataset_short_name in tqdm(dataset_short_names, file=sys.stdout): - # Load latest meadow table for current dataset. - table = load_latest_data_table_for_dataset(dataset_short_name=dataset_short_name) - df = pd.DataFrame(table.reset_index()).rename( - columns={ - "area": "fao_country", - "recipient_country": "fao_country", - "recipient_country_code": "area_code", - } - )[["area_code", "fao_country"]] - - # Column 'area_code' in faostat_sdgb is float instead of integer, and it does not agree with the usual area - # codes. For example, Afghanistan has area code 4.0 in faostat_sdgb, whereas in other datasets it is 2. - # It seems to be the UN M49 code. - # So we add this code as a new column to the countries dataframe, to be able to map sdgb area codes later on. - if df["area_code"].dtype == "float64": - sdgb_codes_df = ( - metadata["faostat_sdgb_area"] - .reset_index()[["country_code", "m49_code"]] - .rename(columns={"country_code": "area_code"}) - ) - df = pd.merge( - df.rename(columns={"area_code": "m49_code"}), - sdgb_codes_df, - on="m49_code", - how="left", - ) - - df["area_code"] = df["area_code"].astype("Int64") - - check_that_all_flags_in_dataset_are_in_ranking( - table=table, metadata_for_flags=metadata[f"{dataset_short_name}_flag"] - ) - - # Gather dataset descriptions, items, and element-units for current domain. - datasets_from_data = create_dataset_descriptions_dataframe_for_domain( - table, dataset_short_name=dataset_short_name - ) - - items_from_data = create_items_dataframe_for_domain( - table=table, metadata=metadata, dataset_short_name=dataset_short_name - ) - - elements_from_data = create_elements_dataframe_for_domain( - table=table, metadata=metadata, dataset_short_name=dataset_short_name - ) - - # Add countries in this dataset to the list of all countries. - countries_in_data = pd.concat([countries_in_data, df]).drop_duplicates() - - # Get country groups in this dataset.
- area_group_table_name = f"{dataset_short_name}_area_group" - if area_group_table_name in metadata: - country_groups = ( - metadata[area_group_table_name] - .reset_index() - .drop_duplicates(subset=["country_group", "country"]) - .groupby("country_group") - .agg({"country": list}) - .to_dict()["country"] - ) - # Add new groups to country_groups_in_data; if they are already there, ensure they contain all members. - for group in list(country_groups): - if group not in countries_in_data["fao_country"].values: - # This should not happen, but skip just in case. - continue - if group in list(country_groups_in_data): - all_members = set(country_groups_in_data[group]) | set(country_groups[group]) - country_groups_in_data[group] = list(all_members) - else: - country_groups_in_data[group] = country_groups[group] - - # Add dataset descriptions, items, and element-units from current dataset to global dataframes. - datasets_df = dataframes.concatenate([datasets_df, datasets_from_data], ignore_index=True) - items_df = dataframes.concatenate([items_df, items_from_data], ignore_index=True) - elements_df = dataframes.concatenate([elements_df, elements_from_data], ignore_index=True) - - datasets_df = clean_global_dataset_descriptions_dataframe(datasets_df=datasets_df, custom_datasets=custom_datasets) - items_df = clean_global_items_dataframe(items_df=items_df, custom_items=custom_items) - elements_df = clean_global_elements_dataframe(elements_df=elements_df, custom_elements=custom_elements) - - countries_df = clean_global_countries_dataframe( - countries_in_data=countries_in_data, - country_groups=country_groups_in_data, - countries_harmonization=countries_harmonization, - ) - - return countries_df, datasets_df, elements_df, items_df - - -def run(dest_dir: str) -> None: - #################################################################################################################### - # Common definitions. - #################################################################################################################### - - # Path to latest garden version for FAOSTAT. - garden_code_dir = STEP_DIR / "data" / "garden" / NAMESPACE / VERSION - # Path to file with custom dataset titles and descriptions. - custom_datasets_file = garden_code_dir / "custom_datasets.csv" - # Path to file with custom item names and descriptions. - custom_items_file = garden_code_dir / "custom_items.csv" - # Path to file with custom element and unit names and descriptions. - custom_elements_and_units_file = garden_code_dir / "custom_elements_and_units.csv" - - # Load file of versions. - latest_versions = pd.read_csv(LATEST_VERSIONS_FILE).set_index(["channel", "dataset"]) - - # Find latest meadow version of dataset of FAOSTAT metadata. - metadata_version = latest_versions.loc["meadow", DATASET_SHORT_NAME].item() - metadata_path = DATA_DIR / "meadow" / NAMESPACE / metadata_version / DATASET_SHORT_NAME - - # Countries file, with mapping from FAO names to OWID harmonized country names. - countries_file = garden_code_dir / f"{NAMESPACE}.countries.json" - - #################################################################################################################### - # Load and process data. - #################################################################################################################### - - # Load metadata from meadow. - assert metadata_path.is_dir() - metadata = catalog.Dataset(metadata_path) - - # Load custom dataset names, items, and element-unit names.
- custom_datasets = pd.read_csv(custom_datasets_file, dtype=str) - custom_elements = pd.read_csv(custom_elements_and_units_file, dtype=str) - custom_items = pd.read_csv(custom_items_file, dtype=str) - - # Load countries file. - countries_harmonization = io.load_json(countries_file) - - countries_df, datasets_df, elements_df, items_df = process_metadata( - metadata=metadata, - custom_datasets=custom_datasets, - custom_elements=custom_elements, - custom_items=custom_items, - countries_harmonization=countries_harmonization, - ) - - #################################################################################################################### - # Save outputs. - #################################################################################################################### - - # Initialize new garden dataset. - dataset_garden = catalog.Dataset.create_empty(dest_dir) - dataset_garden.short_name = DATASET_SHORT_NAME - # Keep original dataset's metadata from meadow. - dataset_garden.metadata = deepcopy(metadata.metadata) - # Create new dataset in garden. - dataset_garden.save() - - # Create new garden dataset with all dataset descriptions, items, element-units, and countries. - datasets_table = create_table(df=datasets_df, short_name="datasets", index_cols=["dataset"]) - items_table = create_table(df=items_df, short_name="items", index_cols=["dataset", "item_code"]) - elements_table = create_table(df=elements_df, short_name="elements", index_cols=["dataset", "element_code"]) - - countries_table = create_table(df=countries_df, short_name="countries", index_cols=["area_code"]) - - # Add tables to dataset (no need to repack, since columns already have optimal dtypes). - dataset_garden.add(datasets_table, repack=False) - dataset_garden.add(items_table, repack=False) - dataset_garden.add(elements_table, repack=False) - dataset_garden.add(countries_table, repack=False) diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_qcl.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_qcl.py deleted file mode 100644 index c99f29cd170..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_qcl.py +++ /dev/null @@ -1,412 +0,0 @@ -"""FAOSTAT garden step for faostat_qcl dataset.""" - -import json -from copy import deepcopy - -import numpy as np -import pandas as pd -from owid import catalog -from owid.datautils import dataframes -from shared import ( - ADDED_TITLE_TO_WIDE_TABLE, - FLAG_MULTIPLE_FLAGS, - LATEST_VERSIONS_FILE, - NAMESPACE, - REGIONS_TO_ADD, - VERSION, - add_per_capita_variables, - add_regions, - clean_data, - harmonize_elements, - harmonize_items, - prepare_long_table, - prepare_wide_table, - remove_outliers, -) - -from etl.paths import DATA_DIR, STEP_DIR - - -def add_slaughtered_animals_to_meat_total(data: pd.DataFrame) -> pd.DataFrame: - """Add number of slaughtered animals to meat total. - - There is no FAOSTAT data on slaughtered animals for total meat. We construct this data by aggregating that element - for the items specified in items_to_aggregate (which corresponds to all meat items after removing redundancies). - - Parameters - ---------- - data : pd.DataFrame - Processed data where meat total does not have number of slaughtered animals. - - Returns - ------- - combined_data : pd.DataFrame - Data after adding the new variable. - - """ - # List of items to sum as part of "Meat, total" (avoiding double-counting items). 
- items_to_aggregate = [ - "Meat, ass", - "Meat, beef and buffalo", - "Meat, camel", - "Meat, horse", - "Meat, lamb and mutton", - "Meat, mule", - "Meat, pig", - "Meat, poultry", - "Meat, rabbit", - "Meat, sheep and goat", - ] - # OWID item name for total meat. - total_meat_item = "Meat, total" - # OWID element name, unit name, and unit short name for number of slaughtered animals. - slaughtered_animals_element = "Producing or slaughtered animals" - slaughtered_animals_unit = "animals" - slaughtered_animals_unit_short_name = "animals" - error = f"Some items required to get the aggregate '{total_meat_item}' are missing in data." - assert set(items_to_aggregate) < set(data["item"]), error - assert slaughtered_animals_element in data["element"].unique() - assert slaughtered_animals_unit in data["unit"].unique() - - # For some reason, there are two element codes for the same element (they have different items assigned). - error = "Element codes for 'Producing or slaughtered animals' may have changed." - assert data[(data["element"] == slaughtered_animals_element) & ~(data["element_code"].str.contains("pc"))][ - "element_code" - ].unique().tolist() == ["005320", "005321"], error - - # Meanwhile, there is a single item code for meat total. - error = f"Item codes for '{total_meat_item}' may have changed." - assert list(data[data["item"] == total_meat_item]["item_code"].unique()) == ["00001765"], error - - # We arbitrarily choose the first element code, and use the only item code. - slaughtered_animals_element_code = "005320" - total_meat_item_code = "00001765" - - # Check that, indeed, this variable is not given in the original data. - assert data[ - (data["item"] == total_meat_item) - & (data["element"] == slaughtered_animals_element) - & (data["unit"] == slaughtered_animals_unit) - ].empty - - # Select the subset of data to aggregate. - data_to_aggregate = ( - data[ - (data["element"] == slaughtered_animals_element) - & (data["unit"] == slaughtered_animals_unit) - & (data["item"].isin(items_to_aggregate)) - ] - .dropna(subset="value") - .reset_index(drop=True) - ) - - # Create a dataframe with the total number of animals used for meat. - animals = dataframes.groupby_agg( - data_to_aggregate, - groupby_columns=[ - "area_code", - "fao_country", - "fao_element", - "country", - "year", - "population_with_data", - ], - aggregations={ - "value": "sum", - "flag": lambda x: x if len(x) == 1 else FLAG_MULTIPLE_FLAGS, - }, - ).reset_index() - - # Get element description for selected element code. - _slaughtered_animals_element_description = data[data["element_code"] == slaughtered_animals_element_code][ - "element_description" - ].unique() - assert len(_slaughtered_animals_element_description) == 1 - slaughtered_animals_element_description = _slaughtered_animals_element_description[0] - - # Get item description for selected item code. - _total_meat_item_description = data[data["item_code"] == total_meat_item_code]["item_description"].unique() - assert len(_total_meat_item_description) == 1 - total_meat_item_description = _total_meat_item_description[0] - - # Get FAO item name for selected item code. - _total_meat_fao_item = data[data["item_code"] == total_meat_item_code]["fao_item"].unique() - assert len(_total_meat_fao_item) == 1 - total_meat_fao_item = _total_meat_fao_item[0] - - # Get FAO unit for selected item code.
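The blocks above and below repeat the same pattern: select a column for a fixed code, assert that it contains exactly one unique value, and unpack it. A sketch of how this could be factored into a helper; `get_unique_value` is a hypothetical name, not a function from the original code:

import pandas as pd

def get_unique_value(df: pd.DataFrame, column: str, mask: pd.Series) -> str:
    # Unique values of `column` among the rows selected by `mask`.
    values = df[mask][column].unique()
    # Fail loudly if the value is not unique, mirroring the asserts above.
    assert len(values) == 1, f"Expected a single value of {column}, found {len(values)}."
    return values[0]

# Example usage on toy data:
df = pd.DataFrame({"item_code": ["00001765", "00001765"], "fao_item": ["Meat, total", "Meat, total"]})
fao_item = get_unique_value(df, "fao_item", df["item_code"] == "00001765")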
- _total_meat_fao_unit = data[data["item_code"] == total_meat_item_code]["fao_unit_short_name"].unique() - assert len(_total_meat_fao_unit) == 1 - total_meat_fao_unit = _total_meat_fao_unit[0] - - # Manually include the rest of columns. - animals["element"] = slaughtered_animals_element - animals["element_description"] = slaughtered_animals_element_description - animals["unit"] = slaughtered_animals_unit - animals["unit_short_name"] = slaughtered_animals_unit_short_name - animals["element_code"] = slaughtered_animals_element_code - animals["item_code"] = total_meat_item_code - animals["item"] = total_meat_item - animals["item_description"] = total_meat_item_description - animals["fao_item"] = total_meat_fao_item - animals["fao_unit_short_name"] = total_meat_fao_unit - - # Check that we are not missing any column. - assert set(data.columns) == set(animals.columns) - - # Add animals data to the original dataframe. - combined_data = ( - pd.concat([data, animals], ignore_index=True) - .reset_index(drop=True) - .astype( - { - "element_code": "category", - "item_code": "category", - "fao_item": "category", - "fao_unit_short_name": "category", - "flag": "category", - "item": "category", - "item_description": "category", - "element": "category", - "unit": "category", - "element_description": "category", - "unit_short_name": "category", - } - ) - ) - - return combined_data - - -def add_yield_to_aggregate_regions(data: pd.DataFrame) -> pd.DataFrame: - """Add yield (production / area harvested) to data for aggregate regions (i.e. continents and income groups). - - This data is not included in aggregate regions because it cannot be aggregated by simply summing the contribution of - the individual countries. Instead, we need to aggregate production, then aggregate area harvested, and then divide - one by the other. - - Note: Here, we divide production (the sum of the production from a list of countries in a region) by area (the sum - of the area from a list of countries in a region) to obtain yield. But the list of countries that contributed to - production may not be the same as the list of countries that contributed to area. We could impose that they must be - the same, but this causes the resulting series to have gaps. Additionally, it seems that FAO also constructs yield - in the same way. This was checked by comparing the resulting yield curves for 'Almonds' for all aggregate regions - with their corresponding *(FAO) regions; they were identical. - - Parameters - ---------- - data : pd.DataFrame - Data that does not contain yield for aggregate regions. - - Returns - ------- - combined_data : pd.DataFrame - Data after adding yield. - - """ - # Element code of production, area harvested, and yield. - production_element_code = "005510" - area_element_code = "005312" - yield_element_code = "005419" - - # Check that indeed regions do not contain any data for yield. - assert data[(data["country"].isin(REGIONS_TO_ADD)) & (data["element_code"] == yield_element_code)].empty - - # Gather all fields that should stay the same. - additional_fields = data[data["element_code"] == yield_element_code][ - [ - "element", - "element_description", - "fao_element", - "fao_unit_short_name", - "unit", - "unit_short_name", - ] - ].drop_duplicates() - assert len(additional_fields) == 1 - - # Create a dataframe of production of regions. - data_production = data[(data["country"].isin(REGIONS_TO_ADD)) & (data["element_code"] == production_element_code)] - - # Create a dataframe of area of regions. 
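The block below merges the region-level production and area subsets on their identifying columns and divides one by the other to obtain yield. The same merge-and-divide pattern in isolation, on toy data (columns reduced for the sketch):

import numpy as np
import pandas as pd

production = pd.DataFrame({"country": ["Africa", "Asia"], "year": [2020, 2020], "value": [100.0, 80.0]})
area = pd.DataFrame({"country": ["Africa", "Asia"], "year": [2020, 2020], "value": [50.0, 0.0]})

# Keep production and area values side by side for each (country, year).
combined = pd.merge(production, area, on=["country", "year"], how="inner", suffixes=("_production", "_area"))

# Yield is production divided by area; division by zero gives inf, which is converted to NaN and dropped.
combined["value"] = (combined["value_production"] / combined["value_area"]).replace(np.inf, np.nan)
combined = combined.dropna(subset=["value"]).reset_index(drop=True)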
- data_area = data[(data["country"].isin(REGIONS_TO_ADD)) & (data["element_code"] == area_element_code)] - - # Merge the two dataframes and create the new yield variable. - merge_cols = [ - "area_code", - "year", - "item_code", - "fao_country", - "fao_item", - "item", - "item_description", - "country", - ] - combined = pd.merge( - data_production, - data_area[merge_cols + ["flag", "value"]], - on=merge_cols, - how="inner", - suffixes=("_production", "_area"), - ) - - combined["value"] = combined["value_production"] / combined["value_area"] - - # Replace infinities (caused by dividing by zero) by nan. - combined["value"] = combined["value"].replace(np.inf, np.nan) - - # If both fields have the same flag, use that, otherwise use the flag of multiple flags. - combined["flag"] = [ - flag_production if flag_production == flag_area else FLAG_MULTIPLE_FLAGS - for flag_production, flag_area in zip(combined["flag_production"], combined["flag_area"]) - ] - - # Drop rows of nan and unnecessary columns. - combined = combined.drop(columns=["flag_production", "flag_area", "value_production", "value_area"]) - combined = combined.dropna(subset="value").reset_index(drop=True) - - # Replace fields appropriately. - combined["element_code"] = yield_element_code - # Replace all other fields from the corresponding fields in yield (tonnes per hectare) variable. - for field in additional_fields.columns: - combined[field] = additional_fields[field].item() - - assert set(data.columns) == set(combined.columns) - - combined = combined - - combined_data = ( - pd.concat([data, combined], ignore_index=True) - .reset_index(drop=True) - .astype( - { - "element_code": "category", - "fao_element": "category", - "fao_unit_short_name": "category", - "flag": "category", - "element": "category", - "unit": "category", - "element_description": "category", - "unit_short_name": "category", - } - ) - ) - - return combined_data - - -def run(dest_dir: str) -> None: - #################################################################################################################### - # Common definitions. - #################################################################################################################### - - # Load file of versions. - latest_versions = pd.read_csv(LATEST_VERSIONS_FILE).set_index(["channel", "dataset"]) - - # Dataset short name. - dataset_short_name = f"{NAMESPACE}_qcl" - # Path to latest dataset in meadow for current FAOSTAT domain. - meadow_version = latest_versions.loc["meadow", dataset_short_name].item() - meadow_data_dir = DATA_DIR / "meadow" / NAMESPACE / meadow_version / dataset_short_name - # Path to dataset of FAOSTAT metadata. - garden_metadata_dir = DATA_DIR / "garden" / NAMESPACE / VERSION / f"{NAMESPACE}_metadata" - - # Path to outliers file. - outliers_file = STEP_DIR / "data" / "garden" / NAMESPACE / VERSION / "detected_outliers.json" - - #################################################################################################################### - # Load data. - #################################################################################################################### - - # Load meadow dataset and keep its metadata. - dataset_meadow = catalog.Dataset(meadow_data_dir) - # Load main table from dataset. - data_table_meadow = dataset_meadow[dataset_short_name] - data = pd.DataFrame(data_table_meadow).reset_index() - - # Load dataset of FAOSTAT metadata. 
- metadata = catalog.Dataset(garden_metadata_dir) - - # Load and prepare dataset, items, element-units, and countries metadata. - datasets_metadata = pd.DataFrame(metadata["datasets"]).reset_index() - datasets_metadata = datasets_metadata[datasets_metadata["dataset"] == dataset_short_name].reset_index(drop=True) - items_metadata = pd.DataFrame(metadata["items"]).reset_index() - items_metadata = items_metadata[items_metadata["dataset"] == dataset_short_name].reset_index(drop=True) - elements_metadata = pd.DataFrame(metadata["elements"]).reset_index() - elements_metadata = elements_metadata[elements_metadata["dataset"] == dataset_short_name].reset_index(drop=True) - countries_metadata = pd.DataFrame(metadata["countries"]).reset_index() - - # Load file of detected outliers. - with open(outliers_file, "r") as _json_file: - outliers = json.loads(_json_file.read()) - - #################################################################################################################### - # Process data. - #################################################################################################################### - - # Harmonize items and elements, and clean data. - data = harmonize_items(df=data, dataset_short_name=dataset_short_name) - data = harmonize_elements(df=data) - - # Prepare data. - data = clean_data( - data=data, - items_metadata=items_metadata, - elements_metadata=elements_metadata, - countries_metadata=countries_metadata, - ) - - # Include number of slaughtered animals in total meat (which is missing). - data = add_slaughtered_animals_to_meat_total(data=data) - - # Add data for aggregate regions. - data = add_regions(data=data, elements_metadata=elements_metadata) - - # Add per-capita variables. - data = add_per_capita_variables(data=data, elements_metadata=elements_metadata) - - # Add yield (production per area) to aggregate regions. - data = add_yield_to_aggregate_regions(data) - - # Remove outliers from data. - data = remove_outliers(data, outliers=outliers) - - # Create a long table (with item code and element code as part of the index). - data_table_long = prepare_long_table(data=data) - - # Create a wide table (with only country and year as index). - data_table_wide = prepare_wide_table(data=data) - - #################################################################################################################### - # Save outputs. - #################################################################################################################### - - # Initialize new garden dataset. - dataset_garden = catalog.Dataset.create_empty(dest_dir) - # Prepare metadata for new garden dataset (starting with the metadata from the meadow version). - dataset_garden_metadata = deepcopy(dataset_meadow.metadata) - dataset_garden_metadata.version = VERSION - dataset_garden_metadata.description = datasets_metadata["owid_dataset_description"].item() - dataset_garden_metadata.title = datasets_metadata["owid_dataset_title"].item() - # Add metadata to dataset. - dataset_garden.metadata = dataset_garden_metadata - # Create new dataset in garden. - dataset_garden.save() - - # Prepare metadata for new garden long table (starting with the metadata from the meadow version). 
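For reference, the long table created above keeps item and element codes in the index with a single value column, while the wide table pivots each item-element combination into its own column. A toy sketch of that reshaping (the real `prepare_wide_table` also builds variable names, titles, and metadata):

import pandas as pd

long = pd.DataFrame(
    {
        "country": ["Spain", "Spain"],
        "year": [2020, 2020],
        "item_code": ["00000056", "00000044"],
        "element_code": ["005510", "005510"],
        "value": [4.1, 9.3],
    }
)

# Pivot item/element combinations into columns; only country and year remain in the index.
wide = long.pivot(index=["country", "year"], columns=["item_code", "element_code"], values="value")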
- data_table_long.metadata = deepcopy(data_table_meadow.metadata) - data_table_long.metadata.title = dataset_garden_metadata.title - data_table_long.metadata.description = dataset_garden_metadata.description - data_table_long.metadata.primary_key = list(data_table_long.index.names) - # Add long table to the dataset (no need to repack, since columns already have optimal dtypes). - dataset_garden.add(data_table_long, repack=False) - - # Prepare metadata for new garden wide table (starting with the metadata from the long table). - # Add wide table to the dataset. - data_table_wide.metadata = deepcopy(data_table_long.metadata) - - data_table_wide.metadata.title += ADDED_TITLE_TO_WIDE_TABLE - data_table_wide.metadata.short_name += "_flat" - data_table_wide.metadata.primary_key = list(data_table_wide.index.names) - - # Add wide table to the dataset (no need to repack, since columns already have optimal dtypes). - dataset_garden.add(data_table_wide, repack=False) diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_qi.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_qi.py deleted file mode 100644 index 8c271f07bc2..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_qi.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_qi dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_qv.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_qv.py deleted file mode 100644 index f564688376e..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_qv.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_qv dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_rfb.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_rfb.py deleted file mode 100644 index 68669b4cbd4..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_rfb.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_rfb dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_rfn.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_rfn.py deleted file mode 100644 index 4ebfe341728..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_rfn.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_rfn dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_rl.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_rl.py deleted file mode 100644 index f43cbe31912..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_rl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_rl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_rp.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_rp.py deleted file mode 100644 index f15e468d920..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_rp.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_rp dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_rt.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_rt.py deleted file mode 100644 index 8b7a9257526..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_rt.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_rt dataset.""" -from .shared import run # 
noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_scl.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_scl.py deleted file mode 100644 index 00d0d6eb376..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_scl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_scl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_sdgb.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_sdgb.py deleted file mode 100644 index 67932fa7aaf..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_sdgb.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_sdgb dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_tcl.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_tcl.py deleted file mode 100644 index 2df286d3992..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_tcl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_tcl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2022-05-17/faostat_ti.py b/etl/steps/archive/garden/faostat/2022-05-17/faostat_ti.py deleted file mode 100644 index 682199d79d9..00000000000 --- a/etl/steps/archive/garden/faostat/2022-05-17/faostat_ti.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_ti dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/additional_variables.meta.yml b/etl/steps/archive/garden/faostat/2023-02-22/additional_variables.meta.yml deleted file mode 100644 index fadb0ee9051..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/additional_variables.meta.yml +++ /dev/null @@ -1,711 +0,0 @@ -all_sources: - - faostat: &faostat_source - name: Food and Agriculture Organization of the United Nations - published_by: Food and Agriculture Organization of the United Nations - url: http://www.fao.org/faostat/en/#data/ - date_accessed: "2023-02-22" - publication_date: "2023-02-22" - publication_year: 2023 - -dataset: - title: Additional variables (FAOSTAT, 2023) - description: | - Additional variables created using data from different FAOSTAT datasets. - sources: - - *faostat_source - -tables: - arable_land_per_crop_output: - variables: - area: - title: 'Arable land' - unit: 'hectares' - short_unit: 'ha' - description: | - Arable land is the total of areas (extent of surface of land or water) under temporary crops, temporary meadows and pastures, and land with temporary fallow. Arable land does not include land that is potentially cultivable but is not normally cultivated. - index: - title: 'Gross Production Index Number' - unit: '' - short_unit: '' - description: | - Gross Production Index Number (2014-2016 = 100). - arable_land_per_crop_output: - title: Arable land needed to produce a fixed quantity of crops - unit: '' - short_unit: '' - description: | - Index of arable land needed to produce a fixed quantity of crops (where values in 1961 are equal to 1.0). This is calculated as arable land divided by the crop production index (PIN). The crop production index here is the sum of crop commodities produced (after deductions of quantities used as seed and feed). It is weighted by the commodity prices. - - This metric measures the index of arable land needed to produce a fixed quantity of crops (where values in 1961 are equal to 1.0). 
- - Arable land is the total of areas (extent of surface of land or water) under temporary crops, temporary meadows and pastures, and land with temporary fallow. Arable land does not include land that is potentially cultivable but is not normally cultivated. - area_used_per_crop_type: - variables: - area_used_for_production: - title: "Area used for production" - unit: "hectares" - short_unit: "ha" - # Description will be fetched from the original FAOSTAT item and element descriptions. - # description: | - share_of_sustainable_and_overexploited_fish: - variables: - sustainable_fish: - title: "Percentage of fish stocks within biologically sustainable levels" - unit: "%" - short_unit: "%" - description: | - Fish stocks are subpopulations of a particular species of fish which have common parameters such as location, growth and mortality which define their population dynamics. Fish stocks are within biologically sustainable levels when fish catch does not exceed the maximum sustainable yield (MSY) - the rate at which fish populations can regenerate. - overexploited_fish: - title: "Percentage of overexploited fish stocks" - unit: "%" - short_unit: "%" - description: | - Fish stocks are subpopulations of a particular species of fish which have common parameters such as location, growth and mortality which define their population dynamics. Fish stocks are overexploited when fish catch exceeds the maximum sustainable yield (MSY) - the rate at which fish populations can regenerate. - land_spared_by_increased_crop_yields: - variables: - area: - title: "Actual cropland area today" - unit: "hectares" - short_unit: "ha" - description: | - Total cropland area on a given year, calculated by dividing the total production by the crop yield. - area_with_yield_of_1961: - title: "Cropland area needed if yields stagnated in 1961" - unit: "hectares" - short_unit: "ha" - description: | - Total cropland area that would be necessary if crop yields stagnated in 1961. - - This area is calculated by dividing the total production on a given year by the crop yield of 1961. - spared_land: - title: "Land spared due to crop yield increases" - unit: "hectares" - short_unit: "ha" - description: | - Land spared since 1961 due to the increase of crop yields. - - This area is calculated as the cropland area that would be necessary if crop yields stagnated in 1961 (the total production on a given year divided by the crop yield of 1961), minus the true cropland area on a given year. - spared_land__pct: - title: "Percentage reduction in area needed due to crop yield increases" - unit: "%" - short_unit: "%" - description: | - Land spared since 1961 due to the increase of crop yields, as a percentage of the total land that would be necessary if crop yields had not increased since then. - # All metadata for food_available_for_consumption is prepared via script. - # food_available_for_consumption: - macronutrient_compositions: - variables: - energy_from_animal_products: - title: "Daily caloric intake per person from animal products" - unit: "kilocalories per day per capita" - short_unit: "kcal" - description: &macronutrient_composition_variable_description | - The FAO provides annual figures from 1961 by country on daily caloric supply, fat supply (in grams), and protein supply (in grams).
To calculate the daily per capita supply of carbohydrates, we assume an energy density by macronutrient of 4 kcal per gram of both protein and carbohydrate and 9 kcal per gram of fat (based on established nutritional guidelines reported by the FAO). The daily supply of carbohydrates is therefore calculated as: - - ((Daily supply of kcal)-(Daily supply of protein * 4 + Daily supply of fat * 9)) / 4 - - The quantity of calories from each macronutrient is then calculated based on the energy density figures given above (e.g. calories from protein are calculated by multiplying the daily supply of protein in grams by 4). - - For an explanation of these conversion factors, see "Chapter 3: Calculation Of The Energy Content Of Foods - Energy Conversion Factors", available at: http://www.fao.org/docrep/006/Y5022E/y5022e04.htm - - The share of calories derived from each macronutrient is then calculated by dividing the number of calories derived from a given macronutrient by the total daily caloric supply. - - Protein of animal origin includes protein supplied in the form of all meat commodities, eggs and dairy products, and fish & seafood. - energy_from_animal_protein: - title: "Daily caloric intake per person that comes from animal protein" - unit: "kilocalories per day per capita" - short_unit: "kcal" - description: *macronutrient_composition_variable_description - energy_from_vegetal_products: - title: "Daily caloric intake per person from vegetal products" - unit: "kilocalories per day per capita" - short_unit: "kcal" - description: *macronutrient_composition_variable_description - energy_from_vegetal_protein: - title: "Daily caloric intake per person that comes from vegetal protein" - unit: "kilocalories per day per capita" - short_unit: "kcal" - description: *macronutrient_composition_variable_description - fat_from_animal_products: - title: "Daily fat intake per person from animal products" - unit: "grams per day per capita" - short_unit: "g" - description: *macronutrient_composition_variable_description - fat_from_vegetal_products: - title: "Daily fat intake per person from vegetal products" - unit: "grams per day per capita" - short_unit: "g" - description: *macronutrient_composition_variable_description - protein_from_animal_products: - title: "Daily protein intake per person from animal products" - unit: "grams per day per capita" - short_unit: "g" - description: *macronutrient_composition_variable_description - protein_from_vegetal_products: - title: "Daily protein intake per person from vegetal products" - unit: "grams per day per capita" - short_unit: "g" - description: *macronutrient_composition_variable_description - share_of_energy_from_animal_protein: - title: "Share of the daily caloric intake that comes from animal protein" - unit: "%" - short_unit: "%" - description: *macronutrient_composition_variable_description - share_of_energy_from_carbohydrates: - title: "Share of the daily caloric intake that comes from carbohydrates" - unit: "%" - short_unit: "%" - description: *macronutrient_composition_variable_description - share_of_energy_from_fat: - title: "Share of the daily caloric intake that comes from fat" - unit: "%" - short_unit: "%" - description: *macronutrient_composition_variable_description - share_of_energy_from_protein: - title: "Share of the daily caloric intake that comes from protein" - unit: "%" - short_unit: "%" - description: *macronutrient_composition_variable_description - share_of_energy_from_vegetal_protein: - title: "Share of the daily caloric intake that comes from
vegetal protein" - unit: "%" - short_unit: "%" - description: *macronutrient_composition_variable_description - total_carbohydrates: - title: "Daily carbohydrates intake per person" - unit: "grams per day per capita" - short_unit: "g" - description: *macronutrient_composition_variable_description - total_energy: - title: "Daily caloric intake per person" - unit: "kilocalories per day per capita" - short_unit: "kcal" - description: *macronutrient_composition_variable_description - total_energy_from_carbohydrates: - title: "Daily caloric intake per person from carbohydrates" - unit: "kilocalories per day per capita" - short_unit: "kcal" - description: *macronutrient_composition_variable_description - total_energy_from_fat: - title: "Daily caloric intake per person from fat" - unit: "kilocalories per day per capita" - short_unit: "kcal" - description: *macronutrient_composition_variable_description - total_energy_from_protein: - title: "Daily caloric intake per person from protein" - unit: "kilocalories per day per capita" - short_unit: "kcal" - description: *macronutrient_composition_variable_description - total_fat: - title: "Daily fat intake per person" - unit: "grams per day per capita" - short_unit: "g" - description: *macronutrient_composition_variable_description - total_protein: - title: "Daily protein intake per person" - unit: "grams per day per capita" - short_unit: "g" - description: *macronutrient_composition_variable_description - fertilizers: - variables: - nitrogen_per_cropland: - title: Nitrogen use per area of cropland - unit: kilograms per hectare - short_unit: kg/ha - description: | - Nutrient nitrogen (N) from all fertilizer products per area of cropland, which corresponds to the sum of arable land and permanent crops. - phosphate_per_cropland: - title: Phosphate use per area of cropland - unit: kilograms per hectare - short_unit: kg/ha - description: | - Nutrient phosphate (P2O5) from all fertilizer products per area of cropland, which corresponds to the sum of arable land and permanent crops. - potash_per_cropland: - title: Potash use per area of cropland - unit: kilograms per hectare - short_unit: kg/ha - description: | - Nutrient potash (K2O) from all fertilizer products per area of cropland, which corresponds to the sum of arable land and permanent crops. - all_fertilizers_per_cropland: - title: All fertilizers use per area of cropland - unit: kilograms per hectare - short_unit: kg/ha - description: | - Agricultural use of all fertilizer products (including nitrogenous, potash, and phosphate fertilizers) per area of cropland, which corresponds to the sum of arable land and permanent crops. - cropland: - title: Area of cropland - unit: hectares - short_unit: ha - description: - Surface area of cropland, which corresponds to the sum of arable land and permanent crops. - nitrogen_use: - title: Nitrogen use - unit: tonnes - short_unit: t - description: | - Agricultural use of nutrient nitrogen (N) from all fertilizer products. - phosphate_use: - title: Phosphate use - unit: tonnes - short_unit: t - description: | - Agricultural use of nutrient phosphate (P2O5) from all fertilizer products. - potash_use: - title: Potash use - unit: tonnes - short_unit: t - description: | - Agricultural use of nutrient potash (K2O) from all fertilizer products. - all_fertilizers_use: - title: All fertilizers use - unit: tonnes - short_unit: t - description: | - Agricultural use from all fertilizer products (including nitrogenous, potash, and phosphate fertilizers). 
- vegetable_oil_yields: - variables: - sunflower_production: - title: Production of sunflower oil - unit: tonnes - short_unit: t - description: | - Amount of sunflower oil produced. - soybean_production: - title: Production of soybean oil - unit: tonnes - short_unit: t - description: | - Amount of soybean oil produced. - groundnut_production: - title: Production of groundnut oil - unit: tonnes - short_unit: t - description: | - Amount of groundnut oil produced. - coconut_production: - title: Production of coconut oil - unit: tonnes - short_unit: t - description: | - Amount of coconut oil produced. - olive_production: - title: Production of olive oil - unit: tonnes - short_unit: t - description: | - Amount of olive oil produced. - cottonseed_production: - title: Production of cottonseed oil - unit: tonnes - short_unit: t - description: | - Amount of cottonseed oil produced. - sesame_production: - title: Production of sesame oil - unit: tonnes - short_unit: t - description: | - Amount of sesame oil produced. - rapeseed_production: - title: Production of rapeseed oil - unit: tonnes - short_unit: t - description: | - Amount of rapeseed oil produced. - palm_production: - title: Production of palm oil - unit: tonnes - short_unit: t - description: | - Amount of palm oil produced. Palm oil includes palm kernel oil. - sunflower_area: - title: Area harvested for sunflower crops - unit: hectares - short_unit: ha - description: | - Land area used to harvest sunflower crops. - cottonseed_area: - title: Area harvested for cottonseed crops - unit: hectares - short_unit: ha - description: | - Land area used to harvest cottonseed crops. - soybean_area: - title: Area harvested for soybean crops - unit: hectares - short_unit: ha - description: | - Land area used to harvest soybean crops. - groundnut_area: - title: Area harvested for groundnut crops - unit: hectares - short_unit: ha - description: | - Land area used to harvest groundnut crops. - olive_area: - title: Area harvested for olive crops - unit: hectares - short_unit: ha - description: | - Land area used to harvest olive crops. - rapeseed_area: - title: Area harvested for rapeseed crops - unit: hectares - short_unit: ha - description: | - Land area used to harvest rapeseed crops. - coconut_area: - title: Area harvested for coconut crops - unit: hectares - short_unit: ha - description: | - Land area used to harvest coconut crops. - sesame_area: - title: Area harvested for sesame crops - unit: hectares - short_unit: ha - description: | - Land area used to harvest sesame crops. - palm_area: - title: Area harvested for palm fruit crops - unit: hectares - short_unit: ha - description: | - Land area used to harvest palm fruit crops. Palm oil includes palm kernel oil. - vegetable_oils_production: - title: Global production of vegetable oils - unit: tonnes - short_unit: t - description: | - Amount of vegetable oils produced worldwide. - palm_tonnes_per_hectare: - title: Palm oil yield per crop - unit: tonnes per hectare - short_unit: tonnes/ha - description: | - Average amount of palm oil produced per hectare of palm fruit crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. Palm oil includes palm kernel oil. 
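The per-crop oil metrics around this point (yield per hectare, its inverse, and the area needed to meet global demand alone) are all simple transformations of production and harvested area. A minimal sketch of the three derived quantities, under made-up numbers:

# Toy figures (made up for the sketch).
palm_oil_production = 70_000_000.0  # tonnes of palm oil produced
palm_area_harvested = 20_000_000.0  # hectares of palm fruit crops harvested
global_oil_demand = 200_000_000.0   # tonnes of all vegetable oils produced worldwide

# Oil yield per hectare of harvested crop.
palm_tonnes_per_hectare = palm_oil_production / palm_area_harvested  # 3.5 t/ha

# Inverse: hectares of crop harvested per tonne of oil.
palm_hectares_per_tonne = palm_area_harvested / palm_oil_production  # ~0.29 ha/t

# Land needed if palm oil alone were to meet global vegetable oil demand.
palm_area_to_meet_global_oil_demand = global_oil_demand * palm_hectares_per_tonne  # ~57 million ha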
- palm_hectares_per_tonne: - title: Area of palm fruit crops harvested to produce a tonne of palm oil - unit: hectares per tonne - short_unit: hectares/tonne - description: | - Area of palm fruit crops harvested to produce a tonne of palm oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. Palm oil includes palm kernel oil. - palm_area_to_meet_global_oil_demand: - title: Area needed to meet the global vegetable oil demand with only palm oil - unit: hectares - short_unit: ha - description: | - Amount of land that would need to be devoted to grow palm fruit crops if it was to meet global vegetable oil demand alone. Palm oil includes palm kernel oil. - sunflower_tonnes_per_hectare: - title: Sunflower oil yield per crop - unit: tonnes per hectare - short_unit: tonnes/ha - description: | - Average amount of sunflower oil produced per hectare of sunflower crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. - sunflower_hectares_per_tonne: - title: Area of sunflower crops harvested to produce a tonne of sunflower oil - unit: hectares per tonne - short_unit: hectares/tonne - description: | - Area of sunflower crops harvested to produce a tonne of sunflower oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. - sunflower_area_to_meet_global_oil_demand: - title: Area needed to meet the global vegetable oil demand with only sunflower oil - unit: hectares - short_unit: ha - description: | - Amount of land that would need to be devoted to grow sunflower crops if it was to meet global vegetable oil demand alone. - rapeseed_tonnes_per_hectare: - title: Rapeseed oil yield per crop - unit: tonnes per hectare - short_unit: tonnes/ha - description: | - Average amount of rapeseed oil produced per hectare of rapeseed crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. - rapeseed_hectares_per_tonne: - title: Area of rapeseed crops harvested to produce a tonne of rapeseed oil - unit: hectares per tonne - short_unit: hectares/tonne - description: | - Area of rapeseed crops harvested to produce a tonne of rapeseed oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. - rapeseed_area_to_meet_global_oil_demand: - title: Area needed to meet the global vegetable oil demand with only rapeseed oil - unit: hectares - short_unit: ha - description: | - Amount of land that would need to be devoted to grow rapeseed crops if it was to meet global vegetable oil demand alone. - soybean_tonnes_per_hectare: - title: Soybean oil yield per crop - unit: tonnes per hectare - short_unit: tonnes/ha - description: | - Average amount of soybean oil produced per hectare of soybean crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. 
- soybean_hectares_per_tonne: - title: Area of soybean crops harvested to produce a tonne of soybean oil - unit: hectares per tonne - short_unit: hectares/tonne - description: | - Area of soybean crops harvested to produce a tonne of soybean oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. - soybean_area_to_meet_global_oil_demand: - title: Area needed to meet the global vegetable oil demand with only soybean oil - unit: hectares - short_unit: ha - description: | - Amount of land that would need to be devoted to grow soybean crops if it was to meet global vegetable oil demand alone. - olive_tonnes_per_hectare: - title: Olive oil yield per crop - unit: tonnes per hectare - short_unit: tonnes/ha - description: | - Average amount of olive oil produced per hectare of olive crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. - olive_hectares_per_tonne: - title: Area of olive crops harvested to produce a tonne of olive oil - unit: hectares per tonne - short_unit: hectares/tonne - description: | - Area of olive crops harvested to produce a tonne of olive oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. - olive_area_to_meet_global_oil_demand: - title: Area needed to meet the global vegetable oil demand with only olive oil - unit: hectares - short_unit: ha - description: | - Amount of land that would need to be devoted to grow olive crops if it was to meet global vegetable oil demand alone. - coconut_tonnes_per_hectare: - title: Coconut oil yield per crop - unit: tonnes per hectare - short_unit: tonnes/ha - description: | - Average amount of coconut oil produced per hectare of coconut crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. - coconut_hectares_per_tonne: - title: Area of coconut crops harvested to produce a tonne of coconut oil - unit: hectares per tonne - short_unit: hectares/tonne - description: | - Area of coconut crops harvested to produce a tonne of coconut oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. - coconut_area_to_meet_global_oil_demand: - title: Area needed to meet the global vegetable oil demand with only coconut oil - unit: hectares - short_unit: ha - description: | - Amount of land that would need to be devoted to grow coconut crops if it was to meet global vegetable oil demand alone. - groundnut_tonnes_per_hectare: - title: Groundnut oil yield per crop - unit: tonnes per hectare - short_unit: tonnes/ha - description: | - Average amount of groundnut oil produced per hectare of groundnut crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. - groundnut_hectares_per_tonne: - title: Area of groundnut crops harvested to produce a tonne of groundnut oil - unit: hectares per tonne - short_unit: hectares/tonne - description: | - Area of groundnut crops harvested to produce a tonne of groundnut oil. 
This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. - groundnut_area_to_meet_global_oil_demand: - title: Area needed to meet the global vegetable oil demand with only groundnut oil - unit: hectares - short_unit: ha - description: | - Amount of land that would need to be devoted to grow groundnut crops if it was to meet global vegetable oil demand alone. - cottonseed_tonnes_per_hectare: - title: Cottonseed oil yield per crop - unit: tonnes per hectare - short_unit: tonnes/ha - description: | - Average amount of cottonseed oil produced per hectare of cottonseed crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. - cottonseed_hectares_per_tonne: - title: Area of cottonseed crops harvested to produce a tonne of cottonseed oil - unit: hectares per tonne - short_unit: hectares/tonne - description: | - Area of cottonseed crops harvested to produce a tonne of cottonseed oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. - cottonseed_area_to_meet_global_oil_demand: - title: Area needed to meet the global vegetable oil demand with only cottonseed oil - unit: hectares - short_unit: ha - description: | - Amount of land that would need to be devoted to grow cottonseed crops if it was to meet global vegetable oil demand alone. - sesame_tonnes_per_hectare: - title: Sesame oil yield per crop - unit: tonnes per hectare - short_unit: tonnes/ha - description: | - Average amount of sesame oil produced per hectare of sesame crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. - sesame_hectares_per_tonne: - title: Area of sesame crops harvested to produce a tonne of sesame oil - unit: hectares per tonne - short_unit: hectares/tonne - description: | - Area of sesame crops harvested to produce a tonne of sesame oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. - sesame_area_to_meet_global_oil_demand: - title: Area needed to meet the global vegetable oil demand with only sesame oil - unit: hectares - short_unit: ha - description: | - Amount of land that would need to be devoted to grow sesame crops if it was to meet global vegetable oil demand alone. - agriculture_land_use_evolution: - variables: - agriculture_area: - title: Area used for agriculture - unit: hectares - short_unit: ha - description: | - Surface area devoted to agriculture on a given year. - agriculture_area_one_decade_back: - title: Area used for agriculture one decade back - unit: hectares - short_unit: ha - description: | - Surface area devoted to agriculture one decade before a given year. For example, for year 2020, this variable gives the extent of agricultural land in 2010. - cropland_area: - title: Area used for croplands - unit: hectares - short_unit: ha - description: | - Surface area devoted to croplands on a given year. - cropland_area_one_decade_back: - title: Area used for croplands one decade back - unit: hectares - short_unit: ha - description: | - Surface area devoted to croplands one decade before a given year. 
For example, for year 2020, this variable gives the extent of croplands in 2010. - pasture_area: - title: Area used for pastures - unit: hectares - short_unit: ha - description: | - Surface area devoted to pastures on a given year. - pasture_area_one_decade_back: - title: Area used for pastures one decade back - unit: hectares - short_unit: ha - description: | - Surface area devoted to pastures one decade before a given year. For example, for year 2020, this variable gives the extent of pastures in 2010. - year_one_decade_back: - title: Year one decade back - unit: "" - short_unit: "" - description: | - Year one decade before a given year. For example, for year 2020, this variable would be 2010. - agriculture_area_change: - title: Change in agriculture area with respect to one decade back - unit: "%" - short_unit: "%" - description: | - Percentage change in surface area devoted to agriculture with respect to 10 years before. Negative values imply that surface area has decreased with respect to the previous decade. - - This data is used to assess which countries may have already peaked in their agricultural land use. - - Assessing this by looking at annual land use data is difficult because there can be significant year-to-year variability. The fact that land use in one or two years was lower than in previous years would be insufficient to conclude that a country had peaked. - - For this reason we look at decadal changes in agricultural land. We look at land use in the latest year relative to 10 years before. - - If land use is lower in the latest year, then we suggest that land use may have peaked. If land use is the same as or higher than a decade back, we suggest that it hasn't, or that this is uncertain. - cropland_area_change: - title: Change in cropland area with respect to one decade back - unit: "%" - short_unit: "%" - description: | - Percentage change in surface area devoted to croplands with respect to 10 years before. Negative values imply that surface area has decreased with respect to the previous decade. - - This data is used to assess which countries may have already peaked in their agricultural land use. - - Assessing this by looking at annual land use data is difficult because there can be significant year-to-year variability. The fact that land use in one or two years was lower than in previous years would be insufficient to conclude that a country had peaked. - - For this reason we look at decadal changes in agricultural land. We look at land use in the latest year relative to 10 years before.
- - If land use is lower in the latest year, then we suggest that land use may have peaked. If land use is the same as or higher than a decade back, we suggest that it hasn't, or that this is uncertain. - pasture_area_change: - title: Change in pasture area with respect to one decade back - unit: "%" - short_unit: "%" - description: | - Percentage change in surface area devoted to pastures with respect to 10 years before. Negative values imply that surface area has decreased with respect to the previous decade. - - This data is used to assess which countries may have already peaked in their agricultural land use. - - Assessing this by looking at annual land use data is difficult because there can be significant year-to-year variability. The fact that land use in one or two years was lower than in previous years would be insufficient to conclude that a country had peaked. - - For this reason we look at decadal changes in agricultural land. We look at land use in the latest year relative to 10 years before. - - If land use is lower in the latest year, then we suggest that land use may have peaked. If land use is the same as or higher than a decade back, we suggest that it hasn't, or that this is uncertain. - hypothetical_meat_consumption: - variables: - animals_global: - title: Number of slaughtered animals to produce meat worldwide - unit: "animals" - short_unit: "" - animals_global_hypothetical: - title: Hypothetical number of slaughtered animals if everyone ate like the average citizen of a given country - unit: "animals" - short_unit: "" - description: | - Hypothetical number of slaughtered animals worldwide if everyone in the world ate the same quantity as the average citizen of a given country. - - This is a hypothetical variable derived by Our World in Data which answers the question: "How many animals would need to be slaughtered if everyone in the world consumed the average per capita amount of a given country?". For example: "How many animals would need to be slaughtered if everyone in the world consumed the same amount of meat as the average UK citizen?". - - This was derived by multiplying global population by the per capita number of slaughtered animals of a given country. - animals_per_capita: - title: Number of slaughtered animals per person in each country - unit: "animals per person" - short_unit: "" - global_population: - title: World population - unit: "people" - short_unit: "" - production_global: - title: Total amount of meat produced worldwide - unit: "tonnes" - short_unit: "t" - production_global_hypothetical: - title: Hypothetical global meat demand if everyone ate like the average citizen of a given country - unit: "tonnes" - short_unit: "t" - description: | - Hypothetical global meat demand if everyone in the world ate the same quantity as the average citizen of a given country. - - This is a hypothetical variable derived by Our World in Data which answers the question: "What would global meat production have to be if everyone in the world consumed the average per capita amount of a given country?". For example: "How much meat would we need to produce if everyone in the world consumed the same amount of meat as the average UK citizen?". - - This was derived by multiplying global population by per capita meat supply of a given country. - production_per_capita: - title: Per-capita production of meat in each country - unit: "tonnes per person" - short_unit: "t/person" - cereal_allocation: - variables: - cereals_allocated_to_animal_feed: - title: Cereals allocated to animal feed - unit: tonnes - short_unit: t - description: | - Quantity of cereal crops allocated to animal feed (and not human food or other uses, such as biofuel production). - cereals_allocated_to_food: - title: Cereals allocated to human food - unit: tonnes - short_unit: t - description: | - Quantity of cereal crops allocated to human food (and not animal feed or other uses, such as biofuel production). - cereals_allocated_to_other_uses: - title: Cereals allocated to other uses - unit: tonnes - short_unit: t - description: | - Quantity of cereal crops allocated to other uses (and not to human food or animal feed), predominantly industrial uses such as biofuel production. - share_of_cereals_allocated_to_animal_feed: - title: Share of cereals that are allocated to animal feed - unit: "%" - short_unit: "%" - description: | - This is calculated by dividing the amount of cereals allocated to animal feed by the sum of all cereal uses considered (namely human food, animal feed, and other uses such as biofuel production).
This corresponds to cereals available domestically (after trade) excluding supply chain losses and seed resown from the crop. - share_of_cereals_allocated_to_food: - title: Share of cereals that are allocated to human food - unit: "%" - short_unit: "%" - description: | - This is calculated by dividing the amount of cereals allocated to human food by the sum of all cereal uses considered (namely human food, animal feed, and other uses such as biofuel production). This corresponds to cereals available domestically (after trade) excluding supply chain losses and seed resown from the crop. - share_of_cereals_allocated_to_other_uses: - title: Share of cereals that are allocated to other uses such as biofuel production - unit: "%" - short_unit: "%" - description: | - This is calculated by dividing the amount of cereals allocated to other uses (predominantly industrial uses such as biofuel production) by the sum of all cereal uses considered (namely human food, animal feed, and other uses). This corresponds to cereals available domestically (after trade) excluding supply chain losses and seed resown from the crop. - # All metadata for maize_and_wheat and fertilizer_exports is prepared via script. - # maize_and_wheat: - # fertilizer_exports: diff --git a/etl/steps/archive/garden/faostat/2023-02-22/additional_variables.py b/etl/steps/archive/garden/faostat/2023-02-22/additional_variables.py deleted file mode 100644 index ab3f7ad4a49..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/additional_variables.py +++ /dev/null @@ -1,1296 +0,0 @@ -"""Dataset that combines different variables of other FAOSTAT datasets. - -""" - -import numpy as np -import pandas as pd -from owid.catalog import Dataset, Table -from owid.catalog.utils import underscore -from owid.datautils.dataframes import multi_merge -from shared import NAMESPACE - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def generate_arable_land_per_crop_output(df_rl: pd.DataFrame, df_qi: pd.DataFrame) -> Table: - # Item code for item "Arable land" of faostat_rl dataset. - ITEM_CODE_FOR_ARABLE_LAND = "00006621" - # Element code for element "Area" of faostat_rl dataset. - ELEMENT_CODE_FOR_AREA = "005110" - # Item code for item "Crops" of faostat_qi dataset. - ITEM_CODE_FOR_CROPS = "00002041" - # Element code for "Gross Production Index Number (2014-2016 = 100)" of faostat_qi dataset. - ELEMENT_CODE_PRODUCTION_INDEX = "000432" - # Reference year for production index (values of area/index will be divided by the value on this year). - PRODUCTION_INDEX_REFERENCE_YEAR = 1961 - - # Select the necessary item and element of the land use dataset. - df_rl = df_rl[ - (df_rl["item_code"] == ITEM_CODE_FOR_ARABLE_LAND) & (df_rl["element_code"] == ELEMENT_CODE_FOR_AREA) - ].reset_index(drop=True) - # Sanity check. - error = "Unit for element 'Area' in faostat_rl has changed." - assert list(df_rl["unit"].unique()) == ["hectares"], error - # Rename columns and select only necessary columns. - df_rl = df_rl[["country", "year", "value"]].rename(columns={"value": "area"}).reset_index(drop=True) - - # Select the necessary item and element of the production index dataset. - df_qi = df_qi[ - (df_qi["element_code"] == ELEMENT_CODE_PRODUCTION_INDEX) & (df_qi["item_code"] == ITEM_CODE_FOR_CROPS) - ].reset_index(drop=True) - # Sanity check. - error = "Unit for element 'Gross Production Index Number (2014-2016 = 100)' in faostat_qi has changed."
- assert list(df_qi["unit"].unique()) == ["index"], error - # Rename columns and select only necessary columns. - df_qi = df_qi[["country", "year", "value"]].rename(columns={"value": "index"}) - - # Combine both dataframes. - combined = pd.merge(df_rl, df_qi, on=["country", "year"], how="inner", validate="one_to_one") - - # Create the new variable of arable land per crop output. - combined["value"] = combined["area"] / combined["index"] - - # Add a column of a reference value for each country, and normalize data by dividing by the reference value. - reference = combined[combined["year"] == PRODUCTION_INDEX_REFERENCE_YEAR][["country", "value"]].reset_index( - drop=True - ) - combined = pd.merge( - combined, reference[["country", "value"]], on=["country"], how="left", suffixes=("", "_reference") - ) - combined["value"] /= combined["value_reference"] - - # Remove all countries for which we did not have data for the reference year. - combined = combined.dropna(subset="value").reset_index(drop=True) - - # Remove unnecessary columns and rename conveniently. - combined = combined.drop(columns=["value_reference"]).rename(columns={"value": "arable_land_per_crop_output"}) - - # Set an appropriate index and sort conveniently. - tb_combined = Table( - combined.set_index(["country", "year"], verify_integrity=True).sort_index(), - short_name="arable_land_per_crop_output", - ) - - return tb_combined - - -def generate_area_used_for_production_per_crop_type(df_qcl: pd.DataFrame) -> Table: - # Element code for "Area harvested" of faostat_qcl dataset. - ELEMENT_CODE_FOR_AREA_HARVESTED = "005312" - - # List of items belonging to item group "Coarse Grain, Total", according to - # https://www.fao.org/faostat/en/#definitions - ITEM_CODES_COARSE_GRAINS = [ - "00000044", # Barley - "00000089", # Buckwheat - "00000101", # Canary seed - "00000108", # Cereals n.e.c. - "00000108", # Cereals nes - "00000094", # Fonio - "00000103", # Grain, mixed - "00000056", # Maize - "00000056", # Maize (corn) - "00000079", # Millet - "00000103", # Mixed grain - "00000075", # Oats - "00000092", # Quinoa - "00000071", # Rye - "00000083", # Sorghum - "00000097", # Triticale - ] - - # Item codes for crop groups from faostat_qcl. - ITEM_CODES_OF_CROP_GROUPS = [ - "00001717", # Cereals - "00001804", # Citrus Fruit - "00001738", # Fruit - "00000780", # Jute - "00001732", # Oilcrops, Oil Equivalent - "00001726", # Pulses - "00001720", # Roots and tubers - "00001729", # Treenuts - "00001735", # Vegetables - "00001814", # Coarse Grain - ] - - error = "Not all expected item codes were found in QCL." - assert set(ITEM_CODES_COARSE_GRAINS) < set(df_qcl["item_code"]), error - - # Select the world and the element code for area harvested. - area_by_crop_type = df_qcl[ - (df_qcl["country"] == "World") & (df_qcl["element_code"] == ELEMENT_CODE_FOR_AREA_HARVESTED) - ].reset_index(drop=True) - error = "Unit for element 'Area harvested' in faostat_qcl has changed." - assert list(area_by_crop_type["unit"].unique()) == ["hectares"], error - - # Add items for item group "Coarse Grain, Total".
- coarse_grains = ( - area_by_crop_type[(area_by_crop_type["item_code"].isin(ITEM_CODES_COARSE_GRAINS))] - .groupby("year", as_index=False) - .agg({"value": "sum"}) - .assign(**{"item": "Coarse Grain", "item_code": "00001814"}) - ) - area_by_crop_type = pd.concat( - [area_by_crop_type[~area_by_crop_type["item_code"].isin(ITEM_CODES_COARSE_GRAINS)], coarse_grains], - ignore_index=True, - ) - - area_by_crop_type = area_by_crop_type[area_by_crop_type["item_code"].isin(ITEM_CODES_OF_CROP_GROUPS)].reset_index( - drop=True - ) - - # Prepare variable description. - descriptions = "Definitions by FAOSTAT:" - for item in sorted(set(area_by_crop_type["item"])): - descriptions += f"\n\nItem: {item}" - item_description = area_by_crop_type[area_by_crop_type["item"] == item]["item_description"].fillna("").iloc[0] - if len(item_description) > 0: - descriptions += f"\nDescription: {item_description}" - - descriptions += f"\n\nMetric: {area_by_crop_type['element'].iloc[0]}" - descriptions += f"\nDescription: {area_by_crop_type['element_description'].iloc[0]}" - - # Create a table with the necessary columns, set an appropriate index, and sort conveniently. - tb_area_by_crop_type = Table( - area_by_crop_type[["item", "year", "value"]] - .rename(columns={"value": "area_used_for_production"}) - .set_index(["item", "year"], verify_integrity=True) - .sort_index(), - short_name="area_used_per_crop_type", - ) - - # Add a table description. - tb_area_by_crop_type["area_used_for_production"].metadata.description = descriptions - - return tb_area_by_crop_type - - -def generate_percentage_of_sustainable_and_overexploited_fish(df_sdgb: pd.DataFrame) -> Table: - # "14.4.1 Proportion of fish stocks within biologically sustainable levels (not overexploited) (%)" - ITEM_CODE_SUSTAINABLE_FISH = "00024029" - - # Select the necessary item. - df_sdgb = df_sdgb[df_sdgb["item_code"] == ITEM_CODE_SUSTAINABLE_FISH].reset_index(drop=True) - error = "Unit for fish data has changed." - assert list(df_sdgb["unit"].unique()) == ["percent"], error - error = "Element for fish data has changed." - assert list(df_sdgb["element"].unique()) == ["Value"], error - - # Select necessary columns (item and element descriptions are empty in the current version). - df_sdgb = df_sdgb[["country", "year", "value"]].rename(columns={"value": "sustainable_fish"}) - - error = "Percentage of sustainable fish larger than 100%." - assert (df_sdgb["sustainable_fish"] <= 100).all(), error - - # Add column of percentage of overexploited fish. - df_sdgb["overexploited_fish"] = 100 - df_sdgb["sustainable_fish"] - - # Create a table with the necessary columns, set an appropriate index, and sort conveniently. - tb_fish = ( - Table(df_sdgb, short_name="share_of_sustainable_and_overexploited_fish") - .set_index(["country", "year"], verify_integrity=True) - .sort_index() - ) - - return tb_fish - - -def generate_spared_land_from_increased_yields(df_qcl: pd.DataFrame) -> Table: - # Reference year (to see how much land we spare from increased yields). - REFERENCE_YEAR = 1961 - # Element code for "Yield" of faostat_qcl dataset. - ELEMENT_CODE_FOR_YIELD = "005419" - # Element code for "Production" of faostat_qcl dataset. - ELEMENT_CODE_FOR_PRODUCTION = "005510" - - # Item codes for crop groups from faostat_qcl. 
- ITEM_CODES_OF_CROP_GROUPS = [ - "00001717", # Cereals - "00001738", # Fruit - "00001726", # Pulses - "00001720", # Roots and tubers - "00001735", # Vegetables - "00001723", # Sugar Crops - "00001729", # Treenuts - # Data for fibre crops has changed significantly since last version, and is also significantly smaller than - # other crop groups, so we omit it. - # "00000821", # Fibre crops. - ] - - # Select necessary items and elements. - spared_land = df_qcl[ - (df_qcl["item_code"].isin(ITEM_CODES_OF_CROP_GROUPS)) - & (df_qcl["element_code"].isin([ELEMENT_CODE_FOR_PRODUCTION, ELEMENT_CODE_FOR_YIELD])) - ].reset_index(drop=True) - - # Sanity check. - error = "Units for production and yield have changed." - assert set(spared_land["unit"]) == set(["tonnes per hectare", "tonnes"]), error - - # Transpose dataframe. - spared_land = spared_land.pivot( - index=["country", "year", "item"], columns=["element"], values="value" - ).reset_index() - - # Fix spurious index name after pivoting. - spared_land.columns = list(spared_land.columns) - - # Add columns for production and yield for a given reference year. - reference_values = spared_land[spared_land["year"] == REFERENCE_YEAR].drop(columns=["year"]) - spared_land = pd.merge( - spared_land, reference_values, on=["country", "item"], how="left", suffixes=("", f" in {REFERENCE_YEAR}") - ) - - # Drop countries for which we did not have data in the reference year. - spared_land = spared_land.dropna().reset_index(drop=True) - - # Calculate area harvested that would be required given current production, with the yield of the reference year. - spared_land[f"Area with yield of {REFERENCE_YEAR}"] = ( - spared_land["Production"] / spared_land[f"Yield in {REFERENCE_YEAR}"] - ) - # Calculate the real area harvested (given the current production and yield). - spared_land["Area"] = spared_land["Production"] / spared_land["Yield"] - - # Keep only required columns. - spared_land = spared_land[["country", "year", "item", "Area", f"Area with yield of {REFERENCE_YEAR}"]].reset_index( - drop=True - ) - - # Add total area for all crops. - all_crops = ( - spared_land.groupby(["country", "year"], as_index=False, observed=True) - .agg({"Area": sum, f"Area with yield of {REFERENCE_YEAR}": sum}) - .assign(**{"item": "All crops"}) - ) - spared_land = pd.concat([spared_land, all_crops], ignore_index=True) - - # Calculate the spared land in total value, and as a percentage of land we would have used with no yield increase. - spared_land["Spared land"] = spared_land[f"Area with yield of {REFERENCE_YEAR}"] - spared_land["Area"] - spared_land["Spared land (%)"] = ( - 100 * spared_land["Spared land"] / spared_land[f"Area with yield of {REFERENCE_YEAR}"] - ) - - # Create a table with the necessary columns, set an appropriate index, and sort conveniently. - tb_spared_land = Table( - spared_land.set_index(["country", "year", "item"], verify_integrity=True).sort_index(), - short_name="land_spared_by_increased_crop_yields", - underscore=True, - ) - - return tb_spared_land - - -def generate_food_available_for_consumption(df_fbsc: pd.DataFrame) -> Table: - # Element code for "Food available for consumption" of faostat_fbsc (in kilocalories per day per capita). - ELEMENT_CODE_FOR_PER_CAPITA_FOOD = "0664pc" - # Expected unit. - CONSUMPTION_UNIT = "kilocalories per day per capita" - - # Select relevant metric. - df_fbsc = df_fbsc[(df_fbsc["element_code"] == ELEMENT_CODE_FOR_PER_CAPITA_FOOD)].reset_index(drop=True) - - # Sanity check.
- error = "Units for food available for consumption have changed." - assert list(df_fbsc["unit"].unique()) == [CONSUMPTION_UNIT], error - - # List of food groups created by OWID. - # Each food group contains one or more "item groups", defined by FAOSTAT. - # Each item group contains one or more "item", defined by FAOSTAT. - # The complete list of items coincides exactly with the complete list of items of FAOSTAT item group "Grand Total" - # (with item group code 2901). - # So all existing food items in FBSC are contained here, and there are no repetitions. - # Notes: - # * There are a few item groups that are not included here, namely "Vegetal Products" (item group code 2903), - # and "Animal Products" (item group code 2941). But their items are contained in other item groups, so including - # them would cause unnecessary repetition of items. - # * To check for the components of an individual item group: - # from etl.paths import DATA_DIR - # metadata = Dataset(DATA_DIR / "meadow/faostat/2023-02-22/faostat_metadata") - # item_groups = metadata["faostat_fbs_item_group"] - # set(item_groups.loc[2941]["item"]) - FOOD_GROUPS = { - "Cereals and grains": [ - "00002905", # Cereals, Excluding Beer - # Item group contains: - # 'Barley and products', - # 'Cereals, Other', - # 'Maize and products', - # 'Millet and products', - # 'Oats', - # 'Rice and products', - # 'Rye and products', - # 'Sorghum and products', - # 'Wheat and products', - ], - "Pulses": [ - "00002911", # Pulses - # Item group contains: - # 'Beans', - # 'Peas', - # 'Pulses, Other and products', - ], - "Starchy roots": [ - "00002907", # Starchy Roots - # Item group contains: - # 'Cassava and products', - # 'Potatoes and products', - # 'Roots, Other', - # 'Sweet potatoes', - # 'Yams', - ], - "Fruits and vegetables": [ - "00002919", # Fruits - Excluding Wine - # Item group contains: - # 'Apples and products', - # 'Bananas', - # 'Citrus, Other', - # 'Dates', - # 'Fruits, other', - # 'Grapefruit and products', - # 'Grapes and products (excl wine)', - # 'Lemons, Limes and products', - # 'Oranges, Mandarines', - # 'Pineapples and products', - # 'Plantains', - "00002918", # Vegetables - # Item group contains: - # 'Onions', - # 'Tomatoes and products', - # 'Vegetables, other', - ], - "Oils and fats": [ - "00002914", # Vegetable Oils - # Item group contains: - # 'Coconut Oil', - # 'Cottonseed Oil', - # 'Groundnut Oil', - # 'Maize Germ Oil', - # 'Oilcrops Oil, Other', - # 'Olive Oil', - # 'Palm Oil', - # 'Palmkernel Oil', - # 'Rape and Mustard Oil', - # 'Ricebran Oil', - # 'Sesameseed Oil', - # 'Soyabean Oil', - # 'Sunflowerseed Oil' - "00002946", # Animal fats group - # Item group contains: - # 'Butter, Ghee', - # 'Cream', - # 'Fats, Animals, Raw', - # 'Fish, Body Oil', - # 'Fish, Liver Oil' - "00002913", # Oilcrops - # Item group contains: - # 'Coconuts - Incl Copra', - # 'Cottonseed', - # 'Groundnuts', - # 'Oilcrops, Other', - # 'Olives (including preserved)', - # 'Palm kernels', - # 'Rape and Mustardseed', - # 'Sesame seed', - # 'Soyabeans', - # 'Sunflower seed' - "00002912", # Treenuts - # Item group contains: - # 'Nuts and products', - ], - "Sugar": [ - "00002909", # Sugar & Sweeteners - # Item group contains: - # 'Honey', - # 'Sugar (Raw Equivalent)', - # 'Sugar non-centrifugal', - # 'Sweeteners, Other', - "00002908", # Sugar crops - # Item group contains: - # 'Sugar beet', - # 'Sugar cane', - ], - "Meat": [ - "00002960", # Fish and seafood - # Item group contains: - # 'Aquatic Animals, Others', - # 'Cephalopods', - # 'Crustaceans', - # 
'Demersal Fish', - # 'Freshwater Fish', - # 'Marine Fish, Other', - # 'Molluscs, Other', - # 'Pelagic Fish', - "00002943", # Meat, total - # Item group contains: - # 'Bovine Meat', - # 'Meat, Other', - # 'Mutton & Goat Meat', - # 'Pigmeat', - # 'Poultry Meat', - ], - "Dairy and eggs": [ - "00002948", # Milk - Excluding Butter - # Item group contains: - # 'Milk - Excluding Butter', - "00002949", # Eggs - # Item group contains: - # 'Eggs', - ], - "Alcoholic beverages": [ - "00002924", # Alcoholic Beverages - # Item group contains: - # 'Alcohol, Non-Food', - # 'Beer', - # 'Beverages, Alcoholic', - # 'Beverages, Fermented', - # 'Wine', - ], - "Other": [ - "00002928", # Miscellaneous - # Item group contains: - # 'Infant food', - # 'Miscellaneous', - "00002923", # Spices - # Item group contains: - # 'Cloves', - # 'Pepper', - # 'Pimento', - # 'Spices, Other', - "00002922", # Stimulants - # Item group contains: - # 'Cocoa Beans and products', - # 'Coffee and products', - # 'Tea (including mate)', - "00002945", # Offals - # Item group contains: - # 'Offals, Edible', - "00002961", # Aquatic Products, Other - # 'Aquatic Plants', - # 'Meat, Aquatic Mammals', - ], - } - - # Sanity check. - error = "Not all expected item codes are found in the data." - assert set([item_code for group in FOOD_GROUPS.values() for item_code in group]) <= set(df_fbsc["item_code"]), error - - # Create a list of dataframes, one for each food group. - dfs = [ - df_fbsc[df_fbsc["item_code"].isin(FOOD_GROUPS[group])] - .groupby(["country", "year"], as_index=False, observed=True) - .agg({"value": "sum"}) - .rename(columns={"value": group}) - for group in FOOD_GROUPS - ] - combined = multi_merge(dfs=dfs, on=["country", "year"], how="outer") - - # Create a table, set an appropriate index, and sort conveniently. - tb_food_available_for_consumption = Table( - combined.set_index(["country", "year"], verify_integrity=True).sort_index(), - short_name="food_available_for_consumption", - underscore=True, - ) - - # Prepare variable metadata. - common_description = ( - "Data represents the average daily per capita supply of calories from the full range of " - "commodities, grouped by food categories. Note that these figures do not correct for waste at the " - "household/consumption level so may not directly reflect the quantity of food finally consumed by a given " - "individual.\n\nSpecific food commodities have been grouped into higher-level categories." - ) - for group in FOOD_GROUPS: - item_names = list(df_fbsc[df_fbsc["item_code"].isin(FOOD_GROUPS[group])]["item"].unique()) - description = ( - common_description - + f" Food group '{group}' includes the FAO item groups: '" - + "', '".join(item_names) - + "'." - ) - tb_food_available_for_consumption[ - underscore(group) - ].metadata.title = f"Daily caloric intake per person from {group.lower().replace('other', 'other commodities')}" - tb_food_available_for_consumption[underscore(group)].metadata.unit = CONSUMPTION_UNIT - tb_food_available_for_consumption[underscore(group)].metadata.short_unit = "kcal" - tb_food_available_for_consumption[underscore(group)].metadata.description = description - - return tb_food_available_for_consumption - - -def generate_macronutrient_compositions(df_fbsc: pd.DataFrame) -> Table: - # Item code for "Total" of faostat_fbsc. - ITEM_CODE_ALL_PRODUCTS = "00002901" - # Item code for "Vegetal Products" of faostat_fbsc. - ITEM_CODE_VEGETAL_PRODUCTS = "00002903" - # Item code for "Animal Products" of faostat_fbsc. 
- ITEM_CODE_ANIMAL_PRODUCTS = "00002941" - - # Element code for "Food available for consumption" of faostat_fbsc (in kilocalories per day per capita). - ELEMENT_CODE_FOR_ENERGY_PER_DAY = "0664pc" - # Element code for "Food available for consumption" of faostat_fbsc (in grams of protein per day per capita). - ELEMENT_CODE_FOR_PROTEIN_PER_DAY = "0674pc" - # Element code for "Food available for consumption" of faostat_fbsc (in grams of fat per day per capita). - ELEMENT_CODE_FOR_FAT_PER_DAY = "0684pc" - - # Assumed energy density by macronutrient, in kilocalories per gram of fat, protein or carbohydrates. - KCAL_PER_GRAM_OF_FAT = 9 - KCAL_PER_GRAM_OF_PROTEIN = 4 - KCAL_PER_GRAM_OF_CARBOHYDRATES = 4 - - # Select relevant items and elements. - df = df_fbsc[ - (df_fbsc["item_code"].isin([ITEM_CODE_ALL_PRODUCTS, ITEM_CODE_ANIMAL_PRODUCTS, ITEM_CODE_VEGETAL_PRODUCTS])) - & ( - df_fbsc["element_code"].isin( - [ELEMENT_CODE_FOR_ENERGY_PER_DAY, ELEMENT_CODE_FOR_PROTEIN_PER_DAY, ELEMENT_CODE_FOR_FAT_PER_DAY] - ) - ) - ].reset_index(drop=True) - - # Sanity check. - error = "One or more of the units of food available for consumption has changed." - assert list(df["unit"].unique()) == [ - "kilocalories per day per capita", - "grams of protein per day per capita", - "grams of fat per day per capita", - ], error - - # Food contents and element code for the metric of their consumption per day per capita. - food_contents = { - "energy": ELEMENT_CODE_FOR_ENERGY_PER_DAY, - "fat": ELEMENT_CODE_FOR_FAT_PER_DAY, - "protein": ELEMENT_CODE_FOR_PROTEIN_PER_DAY, - } - - # Initialize a list of dataframes, one for each food content (energy, fat or protein). - dfs = [] - for content in food_contents: - # Create a dataframe for each food content, and add it to the list. - df_content = df[df["element_code"] == food_contents[content]].pivot( - index=["country", "year"], columns=["item"], values=["value"] - ) # .reset_index() - df_content.columns = df_content.columns.droplevel(0) - df_content = df_content.reset_index().rename( - columns={ - "Total": f"Total {content}", - "Vegetal Products": f"{content.capitalize()} from vegetal products", - "Animal Products": f"{content.capitalize()} from animal products", - } - ) - dfs.append(df_content) - - # Sanity check. - error = f"The sum of animal and vegetable {content} does not add up to the total." - assert ( - 100 - * abs( - df_content[f"{content.capitalize()} from animal products"] - + df_content[f"{content.capitalize()} from vegetal products"] - - df_content[f"Total {content}"] - ) - / df_content[f"Total {content}"] - < 1 - ).all(), error - - # Combine all dataframes. - combined = multi_merge(dfs=dfs, on=["country", "year"], how="outer") - - # Daily caloric intake from fat, per person. - combined["Total energy from fat"] = combined["Total fat"] * KCAL_PER_GRAM_OF_FAT - # Daily caloric intake from protein, per person. - combined["Total energy from protein"] = combined["Total protein"] * KCAL_PER_GRAM_OF_PROTEIN - # Daily caloric intake from carbohydrates (assumed to be the rest of the daily caloric intake), per person. - # This is the difference between the total caloric intake minus the caloric intake from protein and fat. - combined["Total energy from carbohydrates"] = ( - combined["Total energy"] - combined["Total energy from fat"] - combined["Total energy from protein"] - ) - - # Daily intake of carbohydrates per person. 
- combined["Total carbohydrates"] = combined["Total energy from carbohydrates"] / KCAL_PER_GRAM_OF_CARBOHYDRATES - - # Caloric intake from fat as a percentage of the total daily caloric intake. - combined["Share of energy from fat"] = 100 * combined["Total energy from fat"] / combined["Total energy"] - # Caloric intake from protein as a percentage of the total daily caloric intake. - combined["Share of energy from protein"] = 100 * combined["Total energy from protein"] / combined["Total energy"] - # Caloric intake from carbohydrates as a percentage of the total daily caloric intake. - combined["Share of energy from carbohydrates"] = ( - 100 * combined["Total energy from carbohydrates"] / combined["Total energy"] - ) - - # Daily caloric intake from animal protein. - combined["Energy from animal protein"] = combined["Protein from animal products"] * KCAL_PER_GRAM_OF_PROTEIN - # Caloric intake from animal protein as a percentage of the total daily caloric intake. - combined["Share of energy from animal protein"] = ( - 100 * combined["Energy from animal protein"] / combined["Total energy"] - ) - # Daily caloric intake from vegetal protein. - combined["Energy from vegetal protein"] = combined["Protein from vegetal products"] * KCAL_PER_GRAM_OF_PROTEIN - # Caloric intake from vegetal protein as a percentage of the total daily caloric intake. - combined["Share of energy from vegetal protein"] = ( - 100 * combined["Energy from vegetal protein"] / combined["Total energy"] - ) - - # Create a table, set an appropriate index, and sort conveniently. - tb_combined = Table( - combined.set_index(["country", "year"], verify_integrity=True).sort_index(), - short_name="macronutrient_compositions", - underscore=True, - ) - - return tb_combined - - -def generate_fertilizers(df_ef: pd.DataFrame, df_rl: pd.DataFrame) -> Table: - # Item code for "Cropland" (which includes arable land and permanent crops). - ITEM_CODE_FOR_CROPLAND = "00006620" - - # Element code for element "Area" of faostat_rl dataset. - ELEMENT_CODE_FOR_AREA = "005110" - - # Item codes for fertilizers in faostat_ef (namely nitrogen, phosphate and potash). - ITEM_CODES_FOR_FERTILIZERS = ["00003102", "00003103", "00003104"] - - # Element code for use per area of cropland. - ELEMENT_CODE_FOR_USE_PER_AREA = "005159" - - # Convert units from kilograms to tonnes. - KG_TO_TONNES = 1e-3 - - # Select necessary element (use per area). - fertilizers = df_ef[(df_ef["element_code"] == ELEMENT_CODE_FOR_USE_PER_AREA)].reset_index(drop=True) - - # Sanity checks. - error = "Unit for use per area has changed." - assert list(fertilizers["unit"].unique()) == ["kilograms per hectare"], error - - error = "Unexpected list of item codes for fertilizers (maybe another was added to faostat_ef)." - assert set(fertilizers["item_code"]) == set(ITEM_CODES_FOR_FERTILIZERS), error - - # Transpose fertilizers data. - fertilizers = fertilizers.pivot(index=["country", "year"], columns=["item"], values=["value"]) - - # Fix spurious index names after pivoting, and rename columns conveniently. - fertilizers.columns = [column[1] for column in fertilizers.columns] - - fertilizers = fertilizers.rename( - columns={ - "Nutrient nitrogen N (total)": "nitrogen_per_cropland", - "Nutrient phosphate P2O5 (total)": "phosphate_per_cropland", - "Nutrient potash K2O (total)": "potash_per_cropland", - }, - errors="raise", - ) - - # Add column for total fertilizers per area cropland. 
- fertilizers["all_fertilizers_per_cropland"] = fertilizers[ - ["nitrogen_per_cropland", "phosphate_per_cropland", "potash_per_cropland"] - ].sum(axis=1) - - # To get total agricultural use of fertilizers, we need cropland area. - area = df_rl[ - (df_rl["element_code"] == ELEMENT_CODE_FOR_AREA) & (df_rl["item_code"] == ITEM_CODE_FOR_CROPLAND) - ].reset_index(drop=True) - - # Sanity check. - error = "Unit for area has changed." - assert list(area["unit"].unique()) == ["hectares"], error - - # Transpose area data. - area = area.pivot(index=["country", "year"], columns=["item"], values=["value"]).reset_index() - area.columns = ["country", "year", "cropland"] - - # Combine fertilizers and area. - combined = pd.merge(fertilizers, area, on=["country", "year"], how="outer", validate="one_to_one") - - # Add variables for total fertilizer use. - for fertilizer in ["nitrogen", "phosphate", "potash", "all_fertilizers"]: - combined[f"{fertilizer}_use"] = combined[f"{fertilizer}_per_cropland"] * combined["cropland"] * KG_TO_TONNES - - # Create a table, set an appropriate index, and sort conveniently. - tb_fertilizers = Table( - combined.set_index(["country", "year"], verify_integrity=True).sort_index(), - short_name="fertilizers", - underscore=True, - ) - - return tb_fertilizers - - -def generate_vegetable_oil_yields(df_qcl: pd.DataFrame, df_fbsc: pd.DataFrame) -> Table: - # Element code for "Production" in faostat_qcl. - ELEMENT_CODE_FOR_PRODUCTION_QCL = "005510" - # Element code for "Production" in faostat_fbsc. - ELEMENT_CODE_FOR_PRODUCTION_FBSC = "005511" - # Unit for "Production". - UNIT_FOR_PRODUCTION = "tonnes" - # Element code for "Area harvested". - ELEMENT_CODE_FOR_AREA = "005312" - # Unit for "Area harvested". - UNIT_FOR_AREA = "hectares" - # Item code for "Vegetable Oils" (required to get the global production of vegetable oils on a given year). - ITEM_CODE_FOR_VEGETABLE_OILS_TOTAL = "00002914" - # Item codes in faostat_qcl for the area of the crops (we don't need the production of the crops). - ITEM_CODE_FOR_EACH_CROP_AREA = { - # The item "Palm fruit oil" refers to the fruit that contains both the pulp (that leads to palm oil) - # as well as the kernel (that leads to palm kernel oil). - "palm": "00000254", # Palm fruit oil - "sunflower": "00000267", # Sunflower seed - "rapeseed": "00000270", # Rapeseed - "soybean": "00000236", # Soybeans - "olive": "00000260", # Olives - "coconut": "00000249", # Coconuts - "groundnut": "00000242", # Groundnuts - "cottonseed": "00000328", # Seed cotton - "sesame": "00000289", # Sesame seed - # Item "Maize" has the description "[...] This class includes: - maize harvested for their dry grains only" - # So it's not clear whether it includes area used for maize oil, and therefore I won't consider it. - # "maize": "00000056", # Maize - # Other vegetable oils not considered. - # "safflower": "00000280", # Safflower seed - # "linseed": "00000333", # Linseed - } - # Item codes in faostat_qcl for the production of the oils (there is no area harvested data for oils). - ITEM_CODE_FOR_EACH_CROP_PRODUCTION = { - # The item "Palm oil" doesn't have a description, but it probably refers to only the oil from the pulp of the - # palm fruit (therefore it does not include the kernel). - "palm": "00000257", # Palm oil - # The item "Palm kernel oil" clearly refers to only the oil produced from the kernel of the palm fruit. 
- # Therefore, "Palm oil" and "Palm kernel oil" will need to be combined to account for all oils produced from - # the palm fruit (item "Palm fruit oil" for which we have the area harvested). - "palm_kernel": "00000258", # Palm kernel oil - "sunflower": "00000268", # Sunflower oil - "rapeseed": "00000271", # Rapeseed oil - "soybean": "00000237", # Soybean oil - "olive": "00000261", # Olive oil - "coconut": "00000252", # Coconut oil - "groundnut": "00000244", # Groundnut oil - "cottonseed": "00000331", # Cottonseed oil - "sesame": "00000290", # Sesame oil - # Item "maize" is not included (see comment above). - # "maize": "00000060", # Maize oil - # Other vegetable oils not considered. - # "safflower": "00000281", # Safflower oil - # "linseed": "00000334", # Linseed oil - } - - # Extract the total production of vegetable oil. This is given in fbsc but not qcl. - total_production = df_fbsc[ - (df_fbsc["country"] == "World") - & (df_fbsc["item_code"] == ITEM_CODE_FOR_VEGETABLE_OILS_TOTAL) - & (df_fbsc["element_code"] == ELEMENT_CODE_FOR_PRODUCTION_FBSC) - & (df_fbsc["unit"] == UNIT_FOR_PRODUCTION) - ].reset_index(drop=True) - - # Transpose data. - total_production = total_production.pivot( - index=["country", "year"], columns=["item_code"], values=["value"] - ).rename(columns={ITEM_CODE_FOR_VEGETABLE_OILS_TOTAL: "vegetable_oils_production"}) - - # Fix column names after pivoting. - total_production.columns = [column[1] for column in total_production.columns] - total_production = total_production.reset_index().drop(columns=["country"]) - - # Select relevant items, elements and units for the production of crops. - production = df_qcl[ - (df_qcl["item_code"].isin(ITEM_CODE_FOR_EACH_CROP_PRODUCTION.values())) - & (df_qcl["unit"] == UNIT_FOR_PRODUCTION) - & (df_qcl["element_code"] == ELEMENT_CODE_FOR_PRODUCTION_QCL) - ].reset_index(drop=True) - - # Transpose data. - production = production.pivot(index=["country", "year"], columns=["item_code"], values=["value"]) - - # Fix column names after pivoting. - production.columns = np.array(production.columns.tolist())[:, 1] - - # Assign a convenient name to each crop. - CROP_NAME_FOR_ITEM_CODE = { - ITEM_CODE_FOR_EACH_CROP_PRODUCTION[item_code]: item_code for item_code in ITEM_CODE_FOR_EACH_CROP_PRODUCTION - } - production = production.rename( - columns={item_code: CROP_NAME_FOR_ITEM_CODE[item_code] + "_production" for item_code in production.columns} - ).reset_index() - - # Select relevant items, elements and units for the area of crops. - area = df_qcl[ - (df_qcl["item_code"].isin(ITEM_CODE_FOR_EACH_CROP_AREA.values())) - & (df_qcl["unit"] == UNIT_FOR_AREA) - & (df_qcl["element_code"] == ELEMENT_CODE_FOR_AREA) - ].reset_index(drop=True) - - # Transpose data. - area = area.pivot(index=["country", "year"], columns=["item_code"], values=["value"]) - - # Fix column names after pivoting. - area.columns = np.array(area.columns.tolist())[:, 1] - - # Assign a convenient name to each crop. - CROP_NAME_FOR_ITEM_CODE = { - ITEM_CODE_FOR_EACH_CROP_AREA[item_code]: item_code for item_code in ITEM_CODE_FOR_EACH_CROP_AREA - } - area = area.rename( - columns={item_code: CROP_NAME_FOR_ITEM_CODE[item_code] + "_area" for item_code in area.columns} - ).reset_index() - - # Combine production and area. - combined = pd.merge(production, area, on=["country", "year"], how="outer") - - # Add column for global vegetable oil production. 
- combined = pd.merge(combined, total_production, on=["year"], how="left") - - # Combine the production of palm oil and palm kernel oil, since we have the area harvested for the palm fruit - # (which leads to the production of both palm oil and palm kernel oil). - combined["palm_production"] += combined["palm_kernel_production"] - combined = combined.drop(columns=["palm_kernel_production"]) - - # For each crop, create three relevant metrics. - for crop in ITEM_CODE_FOR_EACH_CROP_AREA: - # Vegetable oil yield, which is the amount of oil produced per area harvested of the original crop. - combined[f"{crop}_tonnes_per_hectare"] = combined[f"{crop}_production"] / combined[f"{crop}_area"] - # Hectares of the original crop harvested per tonne of oil produced (inverse of the previous). - combined[f"{crop}_hectares_per_tonne"] = combined[f"{crop}_area"] / combined[f"{crop}_production"] - # Area required to produce the total demand of vegetable oils using only one specific crop. - combined[f"{crop}_area_to_meet_global_oil_demand"] = ( - combined[f"{crop}_hectares_per_tonne"] * combined["vegetable_oils_production"] - ) - - # Replace infinite values (obtained when dividing by a null area) by nans. - combined = combined.replace(np.inf, np.nan) - - # Create a table, set an appropriate index, and sort conveniently. - tb_vegetable_oil_yields = Table( - combined.set_index(["country", "year"], verify_integrity=True).sort_index(), - short_name="vegetable_oil_yields", - underscore=True, - ) - - return tb_vegetable_oil_yields - - -def generate_agriculture_land_evolution(df_rl: pd.DataFrame) -> Table: - # Element code for "Area". - ELEMENT_CODE_FOR_AREA = "005110" - # Unit for element of area. - UNIT_FOR_AREA = "hectares" - # Item code for "Land under perm. meadows and pastures". - ITEM_CODE_FOR_PASTURES = "00006655" - # Item code for "Cropland". - ITEM_CODE_FOR_CROPLAND = "00006620" - # Item code for "Agricultural land". - ITEM_CODE_FOR_AGRICULTURAL_LAND = "00006610" - - # Select the relevant items, elements and units. - land = df_rl[ - (df_rl["unit"] == UNIT_FOR_AREA) - & (df_rl["element_code"] == ELEMENT_CODE_FOR_AREA) - & (df_rl["item_code"].isin([ITEM_CODE_FOR_AGRICULTURAL_LAND, ITEM_CODE_FOR_CROPLAND, ITEM_CODE_FOR_PASTURES])) - ].reset_index(drop=True) - - # Transpose data and rename columns conveniently. - land = land.pivot(index=["country", "year"], columns=["item_code"], values="value").reset_index() - land.columns = list(land.columns) - land = land.rename( - columns={ - ITEM_CODE_FOR_AGRICULTURAL_LAND: "agriculture_area", - ITEM_CODE_FOR_CROPLAND: "cropland_area", - ITEM_CODE_FOR_PASTURES: "pasture_area", - }, - errors="raise", - ) - - # Add columns corresponding to the values of one decade before. - _land = land.copy() - _land["_year"] = _land["year"] + 10 - combined = pd.merge( - land, - _land, - left_on=["country", "year"], - right_on=["country", "_year"], - how="inner", - suffixes=("", "_one_decade_back"), - ).drop(columns=["_year"]) - - # For each item, add the percentage change of land use this year with respect to one decade back. - for item in ["agriculture_area", "cropland_area", "pasture_area"]: - combined[f"{item}_change"] = ( - 100 * (combined[f"{item}"] - combined[f"{item}_one_decade_back"]) / combined[f"{item}_one_decade_back"] - ) - - # Set an appropriate index and sort conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Create a table. 
- tb_land_use_evolution = Table(combined, short_name="agriculture_land_use_evolution", underscore=True) - - return tb_land_use_evolution - - -def generate_hypothetical_meat_consumption(df_qcl: pd.DataFrame) -> Table: - # Element code and unit for "Production". - ELEMENT_CODE_FOR_PRODUCTION = "005510" - UNIT_FOR_PRODUCTION = "tonnes" - # Element code and unit for per-capita "Production". - ELEMENT_CODE_FOR_PRODUCTION_PER_CAPITA = "5510pc" - UNIT_FOR_PRODUCTION_PER_CAPITA = "tonnes per capita" - # Element code and unit for "Producing or slaughtered animals". - ELEMENT_CODE_FOR_ANIMALS = "005320" - UNIT_FOR_ANIMALS = "animals" - # Element code and unit for per-capita "Producing or slaughtered animals". - ELEMENT_CODE_FOR_ANIMALS_PER_CAPITA = "5320pc" - UNIT_FOR_ANIMALS_PER_CAPITA = "animals per capita" - # Item code for "Meat, total". - ITEM_CODE_FOR_MEAT_TOTAL = "00001765" - - # Select the required items/elements/units to get national data on per-capita production and slaughtered animals. - meat = df_qcl[ - (df_qcl["item_code"] == ITEM_CODE_FOR_MEAT_TOTAL) - & (df_qcl["element_code"].isin([ELEMENT_CODE_FOR_PRODUCTION_PER_CAPITA, ELEMENT_CODE_FOR_ANIMALS_PER_CAPITA])) - & (df_qcl["unit"].isin([UNIT_FOR_PRODUCTION_PER_CAPITA, UNIT_FOR_ANIMALS_PER_CAPITA])) - ].reset_index(drop=True) - meat = meat.pivot(index=["country", "year"], columns="element_code", values="value").reset_index() - meat = meat.rename( - columns={ - ELEMENT_CODE_FOR_ANIMALS_PER_CAPITA: "animals_per_capita", - ELEMENT_CODE_FOR_PRODUCTION_PER_CAPITA: "production_per_capita", - } - ) - - # Take data for global population from the "population_with_data" column for the production of total meat. - # This should coincide with the true world population. - # Note that "population_with_data" may differ from the total population for certain items/elements for region - # aggregates (e.g. "Africa"). For slaughtered animals, population with data may also differ, since it's - # built for all countries (in the garden faostat_qcl step) by aggregating. - # But this does not happen with the production of total meat for the "World", since this data was extracted directly from FAOSTAT. - # TODO: Confirm this by checking qcl code, especially the one about animals slaughtered. - global_population = ( - df_qcl[ - (df_qcl["country"] == "World") - & (df_qcl["element_code"] == ELEMENT_CODE_FOR_PRODUCTION) - & (df_qcl["unit"] == UNIT_FOR_PRODUCTION) - & (df_qcl["item_code"] == ITEM_CODE_FOR_MEAT_TOTAL) - ][["year", "population_with_data"]] - .reset_index(drop=True) - .rename(columns={"population_with_data": "global_population"}) - ).astype({"global_population": int}) - - # Just for reference, extract global production and number of slaughtered animals. - global_production = ( - df_qcl[ - (df_qcl["country"] == "World") - & (df_qcl["element_code"] == ELEMENT_CODE_FOR_PRODUCTION) - & (df_qcl["unit"] == UNIT_FOR_PRODUCTION) - & (df_qcl["item_code"] == ITEM_CODE_FOR_MEAT_TOTAL) - ][["year", "value"]] - .reset_index(drop=True) - .rename(columns={"value": "production_global"}) - ) - global_animals = ( - df_qcl[ - (df_qcl["country"] == "World") - & (df_qcl["element_code"] == ELEMENT_CODE_FOR_ANIMALS) - & (df_qcl["unit"] == UNIT_FOR_ANIMALS) - & (df_qcl["item_code"] == ITEM_CODE_FOR_MEAT_TOTAL) - ][["year", "value"]] - .reset_index(drop=True) - .rename(columns={"value": "animals_global"}) - ) - - # Combine national with global data. - combined = multi_merge(dfs=[meat, global_population, global_production, global_animals], on=["year"], how="left") - - # Sanity check.
- error = "Rows have changed after merging national data with global data." - assert len(combined) == len(meat), error - - # Add columns for hypothetical global production and number of slaughtered animals. - # This is the production (or number of slaughtered animals) that would be needed worldwide to meet the demand of a given country. - combined["production_global_hypothetical"] = combined["production_per_capita"] * combined["global_population"] - combined["animals_global_hypothetical"] = combined["animals_per_capita"] * combined["global_population"] - - # Set an appropriate index and sort conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Create a table with the combined data. - tb_hypothetical_meat_consumption = Table(combined, short_name="hypothetical_meat_consumption", underscore=True) - - return tb_hypothetical_meat_consumption - - -def generate_cereal_allocation(df_fbsc: pd.DataFrame) -> Table: - # Item code for "Cereals - Excluding Beer". - ITEM_CODE_FOR_CEREALS = "00002905" - # Note: We disregard the contribution from "00002520" ("Cereals, Other"), which is usually negligible compared to the total. - # Element code and unit for "Food". - # Note: The element code for "Food available for consumption" is "000645"; this should be the same data, except that - # it is given in kilograms (originally it was given per capita). Therefore, we use "Food", which is more convenient. - ELEMENT_CODE_FOR_FOOD = "005142" - UNIT_FOR_FOOD = "tonnes" - # Element code and unit for "Feed". - ELEMENT_CODE_FOR_FEED = "005521" - UNIT_FOR_FEED = "tonnes" - # Element code and unit for "Other uses". - ELEMENT_CODE_FOR_OTHER_USES = "005154" - UNIT_FOR_OTHER_USES = "tonnes" - - # Select the relevant items/elements. - cereals = df_fbsc[ - (df_fbsc["item_code"] == ITEM_CODE_FOR_CEREALS) - & (df_fbsc["element_code"].isin([ELEMENT_CODE_FOR_FOOD, ELEMENT_CODE_FOR_FEED, ELEMENT_CODE_FOR_OTHER_USES])) - ].reset_index(drop=True) - - # Sanity check. - error = "Units have changed" - assert set(cereals["unit"]) == set([UNIT_FOR_FOOD, UNIT_FOR_FEED, UNIT_FOR_OTHER_USES]), error - - # Transpose data and rename columns conveniently. - cereals = ( - cereals.pivot(index=["country", "year"], columns="element_code", values="value") - .reset_index() - .rename( - columns={ - ELEMENT_CODE_FOR_FOOD: "cereals_allocated_to_food", - ELEMENT_CODE_FOR_FEED: "cereals_allocated_to_animal_feed", - ELEMENT_CODE_FOR_OTHER_USES: "cereals_allocated_to_other_uses", - } - ) - ) - - # Add variables for the share of cereals allocated to each use. - all_cereal_uses = ["food", "animal_feed", "other_uses"] - for item in all_cereal_uses: - cereals[f"share_of_cereals_allocated_to_{item}"] = ( - 100 - * cereals[f"cereals_allocated_to_{item}"] - / cereals[[f"cereals_allocated_to_{use}" for use in all_cereal_uses]].sum(axis=1) - ) - - # Set an appropriate index and sort conveniently. - cereals = cereals.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Create a table with the generated data. - tb_cereal_allocation = Table(cereals, short_name="cereal_allocation", underscore=True) - - return tb_cereal_allocation - - -def generate_maize_and_wheat(df_fbsc: pd.DataFrame) -> Table: - # Item code for "Wheat". - ITEM_CODE_FOR_WHEAT = "00002511" - # Item code for "Maize". - ITEM_CODE_FOR_MAIZE = "00002514" - # Element code for "Exports". - ELEMENT_CODE_FOR_EXPORTS = "005911" - # Element code for "Feed". 
- ELEMENT_CODE_FOR_FEED = "005521" - # Element code for "Other uses". - ELEMENT_CODE_FOR_OTHER_USES = "005154" - - # Select the relevant items/elements. - maize_and_wheat = df_fbsc[ - (df_fbsc["item_code"].isin([ITEM_CODE_FOR_MAIZE, ITEM_CODE_FOR_WHEAT])) - & (df_fbsc["element_code"].isin([ELEMENT_CODE_FOR_EXPORTS, ELEMENT_CODE_FOR_FEED, ELEMENT_CODE_FOR_OTHER_USES])) - ] - - # Sanity check. - error = "Units have changed." - assert list(maize_and_wheat["unit"].unique()) == ["tonnes"], error - - # Transpose data and rename columns conveniently. - maize_and_wheat = maize_and_wheat.pivot( - index=["country", "year"], columns=["item_code", "element_code"], values="value" - ) - maize_and_wheat = maize_and_wheat.rename( - columns={ITEM_CODE_FOR_MAIZE: "maize", ITEM_CODE_FOR_WHEAT: "wheat"}, level=0 - ).rename( - columns={ - ELEMENT_CODE_FOR_EXPORTS: "exports", - ELEMENT_CODE_FOR_FEED: "animal_feed", - ELEMENT_CODE_FOR_OTHER_USES: "other_uses", - } - ) - maize_and_wheat.columns = [column[0] + "_" + column[1] for column in maize_and_wheat.columns] - - # Set an appropriate index and sort conveniently. - maize_and_wheat = ( - maize_and_wheat.reset_index() - .set_index(["country", "year"], verify_integrity=True) - .sort_index() - .sort_index(axis=1) - ) - - # Create a table with the generated data. - tb_maize_and_wheat = Table(maize_and_wheat, short_name="maize_and_wheat", underscore=True) - - # Add minimal variable metadata (more metadata will be added at the grapher step). - for column in tb_maize_and_wheat.columns: - tb_maize_and_wheat[column].metadata.unit = "tonnes" - tb_maize_and_wheat[column].metadata.short_unit = "t" - - return tb_maize_and_wheat - - -def generate_fertilizer_exports(df_rfn: pd.DataFrame) -> Table: - # Element code for "Export Quantity". - ELEMENT_CODE_FOR_EXPORTS = "005910" - # Item code for "Nutrient nitrogen N (total)". - ITEM_CODE_FOR_NITROGEN = "00003102" - # Item code for "Nutrient phosphate P2O5 (total)". - ITEM_CODE_FOR_PHOSPHATE = "00003103" - # Item code for "Nutrient potash K2O (total)". - ITEM_CODE_FOR_POTASH = "00003104" - - # Select the relevant items and elements. - fertilizer_exports = df_rfn[ - (df_rfn["element_code"] == ELEMENT_CODE_FOR_EXPORTS) - & (df_rfn["item_code"].isin([ITEM_CODE_FOR_NITROGEN, ITEM_CODE_FOR_PHOSPHATE, ITEM_CODE_FOR_POTASH])) - ].reset_index(drop=True) - - # Sanity check. - error = "Units have changed." - assert list(fertilizer_exports["unit"].unique()) == ["tonnes"], error - - # Rename columns and items conveniently. - fertilizer_exports = fertilizer_exports[["country", "year", "item_code", "value"]].rename( - columns={"item_code": "item", "value": "exports"} - ) - fertilizer_exports["item"] = fertilizer_exports["item"].replace( - {ITEM_CODE_FOR_NITROGEN: "Nitrogen", ITEM_CODE_FOR_PHOSPHATE: "Phosphorous", ITEM_CODE_FOR_POTASH: "Potassium"} - ) - - # Add column of global exports. - global_exports = ( - fertilizer_exports[fertilizer_exports["country"] == "World"].drop(columns=["country"]).reset_index(drop=True) - ) - fertilizer_exports = pd.merge( - fertilizer_exports, global_exports, how="left", on=["year", "item"], suffixes=("", "_global") - ) - - # Create columns for the share of exports. - fertilizer_exports["share_of_exports"] = 100 * fertilizer_exports["exports"] / fertilizer_exports["exports_global"] - - # Drop column of global exports. - fertilizer_exports = fertilizer_exports.drop(columns=["exports_global"]) - - # Set an appropriate index and sort conveniently. 
- fertilizer_exports = ( - fertilizer_exports.set_index(["country", "year", "item"], verify_integrity=True).sort_index().sort_index(axis=1) - ) - - # Create a table with the generated data. - tb_fertilizer_exports = Table(fertilizer_exports, short_name="fertilizer_exports", underscore=True) - - # Add minimal variable metadata (more metadata will be added at the grapher step). - tb_fertilizer_exports["share_of_exports"].metadata.unit = "%" - tb_fertilizer_exports["share_of_exports"].metadata.short_unit = "%" - tb_fertilizer_exports["exports"].metadata.unit = "tonnes" - tb_fertilizer_exports["exports"].metadata.short_unit = "t" - - return tb_fertilizer_exports - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load dataset about land use, load its main (long-format) table, and create a convenient dataframe. - ds_rl: Dataset = paths.load_dependency(f"{NAMESPACE}_rl") - tb_rl = ds_rl[f"{NAMESPACE}_rl"] - df_rl = pd.DataFrame(tb_rl).reset_index() - - # Load dataset about production indices, load its main (long-format) table, and create a convenient dataframe. - ds_qi: Dataset = paths.load_dependency(f"{NAMESPACE}_qi") - tb_qi = ds_qi[f"{NAMESPACE}_qi"] - df_qi = pd.DataFrame(tb_qi).reset_index() - - # Load dataset about crops and livestock, load its main (long-format) table, and create a convenient dataframe. - ds_qcl: Dataset = paths.load_dependency(f"{NAMESPACE}_qcl") - tb_qcl = ds_qcl[f"{NAMESPACE}_qcl"] - df_qcl = pd.DataFrame(tb_qcl).reset_index() - - # Load dataset about SDG indicators, load its main (long-format) table, and create a convenient dataframe. - ds_sdgb: Dataset = paths.load_dependency(f"{NAMESPACE}_sdgb") - tb_sdgb = ds_sdgb[f"{NAMESPACE}_sdgb"] - df_sdgb = pd.DataFrame(tb_sdgb).reset_index() - - # Load dataset about food balances, load its main (long-format) table, and create a convenient dataframe. - ds_fbsc: Dataset = paths.load_dependency(f"{NAMESPACE}_fbsc") - tb_fbsc = ds_fbsc[f"{NAMESPACE}_fbsc"] - df_fbsc = pd.DataFrame(tb_fbsc).reset_index() - - # Load dataset about fertilizers, load its main (long-format) table, and create a convenient dataframe. - ds_ef: Dataset = paths.load_dependency(f"{NAMESPACE}_ef") - tb_ef = ds_ef[f"{NAMESPACE}_ef"] - df_ef = pd.DataFrame(tb_ef).reset_index() - - # Load dataset about fertilizers by nutrient, load its main (long-format) table, and create a convenient dataframe. - ds_rfn: Dataset = paths.load_dependency(f"{NAMESPACE}_rfn") - tb_rfn = ds_rfn[f"{NAMESPACE}_rfn"] - df_rfn = pd.DataFrame(tb_rfn).reset_index() - - # - # Process data. - # - # Create table for arable land per crop output. - tb_arable_land_per_crop_output = generate_arable_land_per_crop_output(df_rl=df_rl, df_qi=df_qi) - - # Create table for area used for production per crop type. - tb_area_by_crop_type = generate_area_used_for_production_per_crop_type(df_qcl=df_qcl) - - # Create table for the share of sustainable and overexploited fish. - tb_sustainable_and_overexploited_fish = generate_percentage_of_sustainable_and_overexploited_fish(df_sdgb=df_sdgb) - - # Create table for spared land due to increased yields. - tb_spared_land_from_increased_yields = generate_spared_land_from_increased_yields(df_qcl=df_qcl) - - # Create table for dietary compositions by commodity group. - tb_food_available_for_consumption = generate_food_available_for_consumption(df_fbsc=df_fbsc) - - # Create table for macronutrient compositions. - tb_macronutrient_compositions = generate_macronutrient_compositions(df_fbsc=df_fbsc) - - # Create table for fertilizers data. 
- tb_fertilizers = generate_fertilizers(df_ef=df_ef, df_rl=df_rl) - - # Create table for vegetable oil yields. - tb_vegetable_oil_yields = generate_vegetable_oil_yields(df_qcl=df_qcl, df_fbsc=df_fbsc) - - # Create table for peak agricultural land. - tb_agriculture_land_use_evolution = generate_agriculture_land_evolution(df_rl=df_rl) - - # Create table for hypothetical meat consumption - tb_hypothetical_meat_consumption = generate_hypothetical_meat_consumption(df_qcl=df_qcl) - - # Create table for cereal allocation. - tb_cereal_allocation = generate_cereal_allocation(df_fbsc=df_fbsc) - - # Create table for maize and wheat data (used in the context of the Ukraine war). - tb_maize_and_wheat = generate_maize_and_wheat(df_fbsc=df_fbsc) - - # Create table for fertilizer exports (used in the context of the Ukraine war). - tb_fertilizer_exports = generate_fertilizer_exports(df_rfn=df_rfn) - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = create_dataset( - dest_dir, - tables=[ - tb_arable_land_per_crop_output, - tb_area_by_crop_type, - tb_sustainable_and_overexploited_fish, - tb_spared_land_from_increased_yields, - tb_food_available_for_consumption, - tb_macronutrient_compositions, - tb_fertilizers, - tb_vegetable_oil_yields, - tb_agriculture_land_use_evolution, - tb_hypothetical_meat_consumption, - tb_cereal_allocation, - tb_maize_and_wheat, - tb_fertilizer_exports, - ], - ) - ds_garden.save() diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_ef.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_ef.py deleted file mode 100644 index 9969bfd421f..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_ef.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_ef dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_ei.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_ei.py deleted file mode 100644 index 5bd7d23db88..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_ei.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_ei dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_ek.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_ek.py deleted file mode 100644 index c6ec4c862e8..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_ek.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_ek dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_el.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_el.py deleted file mode 100644 index 43b06ade38c..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_el.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_el dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_emn.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_emn.py deleted file mode 100644 index 5f12637ec70..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_emn.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_emn dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_ep.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_ep.py deleted file mode 100644 index 0a44564deea..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_ep.py +++ /dev/null @@ 
-1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_ep dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_esb.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_esb.py deleted file mode 100644 index 2a4896b9edf..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_esb.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_esb dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_fa.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_fa.py deleted file mode 100644 index e594773a567..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_fa.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_fa dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_fbsc.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_fbsc.py deleted file mode 100644 index 7ac8c6d544c..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_fbsc.py +++ /dev/null @@ -1,219 +0,0 @@ -"""FAOSTAT garden step for faostat_fbsc dataset (Food Balances Combined). - -Combine the old and new food balances datasets: -* `faostat_fbsh`: Old (historical) dataset. -* `faostat_fbs`: Current dataset. - -A new (combined) dataset will be generated: "faostat_fbsc". - -This is because a new version of the Food Balances dataset was launched in 2014 with a slightly new methodology: -https://fenixservices.fao.org/faostat/static/documents/FBS/New%20FBS%20methodology.pdf - -NOTE: It seems that FAOSTAT is possibly extending the coverage of the new methodology. So the year of intersection of -both datasets will be earlier and earlier. The global variable `FBS_FIRST_YEAR` may have to be redefined in a future -update. - -""" - -from pathlib import Path -from typing import cast - -import pandas as pd -from owid import catalog -from owid.datautils import dataframes -from shared import ( - ADDED_TITLE_TO_WIDE_TABLE, - CURRENT_DIR, - NAMESPACE, - add_per_capita_variables, - add_regions, - clean_data, - handle_anomalies, - harmonize_elements, - harmonize_items, - log, - parse_amendments_table, - prepare_long_table, - prepare_wide_table, -) - -from etl.helpers import PathFinder, create_dataset - -# First year for which we have data in fbs dataset (it defines the first year when new methodology is used). -FBS_FIRST_YEAR = 2010 -DATASET_TITLE = f"Food Balances (old methodology before {FBS_FIRST_YEAR}, and new from {FBS_FIRST_YEAR} onwards)" - - -def combine_fbsh_and_fbs_datasets( - fbsh_dataset: catalog.Dataset, - fbs_dataset: catalog.Dataset, -) -> pd.DataFrame: - """Combine `faostat_fbsh` and `faostat_fbs` meadow datasets. - - Parameters - ---------- - fbsh_dataset : catalog.Dataset - Meadow `faostat_fbsh` dataset. - fbs_dataset : catalog.Dataset - Meadow `faostat_fbs` dataset. - - Returns - ------- - fbsc : pd.DataFrame - Combination of the tables of the two input datasets (as a dataframe, not a dataset). - - """ - # Sanity checks. - error = "Description of fbs and fbsh datasets is different." - assert fbsh_dataset.metadata.description == fbs_dataset.metadata.description, error - error = "Licenses of fbsh and fbs are different." - assert fbsh_dataset.metadata.licenses == fbs_dataset.metadata.licenses, error - - # Load dataframes for fbs and fbsh datasets. 
- fbsh = pd.DataFrame(fbsh_dataset["faostat_fbsh"]).reset_index() - fbs = pd.DataFrame(fbs_dataset["faostat_fbs"]).reset_index() - - # Harmonize items and elements in both datasets. - fbsh = harmonize_items(df=fbsh, dataset_short_name="faostat_fbsh") - fbsh = harmonize_elements(df=fbsh) - fbs = harmonize_items(df=fbs, dataset_short_name="faostat_fbs") - fbs = harmonize_elements(df=fbs) - - # Ensure there is no overlap in data between the two datasets, and that there is no gap between them. - assert fbs["year"].min() == FBS_FIRST_YEAR, f"First year of fbs dataset is not {FBS_FIRST_YEAR}" - if fbsh["year"].max() >= fbs["year"].min(): - # There is overlapping data between fbsh and fbs datasets. Prioritising fbs over fbsh. - fbsh = fbsh.loc[fbsh["year"] < fbs["year"].min()].reset_index(drop=True) - if (fbsh["year"].max() + 1) < fbs["year"].min(): - log.warning("Data is missing for one or more years between fbsh and fbs datasets.") - - # Sanity checks. - # Ensure the items that are in fbsh but not in fbs are covered by shared.ITEM_AMENDMENTS. - error = "Mismatch between items in fbsh and fbs. Redefine shared.ITEM_AMENDMENTS." - assert set(fbsh["item"]) == set(fbs["item"]), error - # Some elements are found in fbs but not in fbsh. This is understandable, since fbs is - # more recent and may have additional elements. However, ensure that there are no - # elements in fbsh that are not in fbs. - error = "There are elements in fbsh that are not in fbs." - assert set(fbsh["element"]) < set(fbs["element"]), error - - # Concatenate old and new dataframes using function that keeps categoricals. - fbsc = dataframes.concatenate([fbsh, fbs]).sort_values(["area", "year"]).reset_index(drop=True) - - # Ensure that each element has only one unit and one description. - error = "Some elements in the combined dataset have more than one unit." - assert fbsc.groupby("element")["unit"].nunique().max() == 1, error - - return cast(pd.DataFrame, fbsc) - - -def _assert_df_size(df: pd.DataFrame, size_mb: float) -> None: - """Check that dataframe is smaller than given size to prevent OOM errors.""" - real_size_mb = df.memory_usage(deep=True).sum() / 1e6 - assert real_size_mb <= size_mb, f"DataFrame size is too big: {real_size_mb} MB > {size_mb} MB" - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Fetch the dataset short name from dest_dir. - dataset_short_name = Path(dest_dir).name - - # Define path to current step file. - current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py") - - # Get paths and naming conventions for current data step. - paths = PathFinder(current_step_file.as_posix()) - - # Load fbsh and fbs. - log.info("faostat_fbsc.loading_datasets") - fbsh_dataset: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_fbsh") - fbs_dataset: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_fbs") - - # Load dataset of FAOSTAT metadata. - metadata: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_metadata") - - # Load dataset, items, element-units, and countries metadata.
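The `_assert_df_size` guard above is a small pattern worth keeping in mind when touching these steps. A standalone, runnable version (the 2000 MB budget mirrors the calls in this file):

```python
# Standalone version of the memory guard above: fail fast with a clear
# message rather than running out of memory later in the pipeline.
import pandas as pd

def assert_df_size(df: pd.DataFrame, size_mb: float) -> None:
    """Check that dataframe is smaller than given size to prevent OOM errors."""
    real_size_mb = df.memory_usage(deep=True).sum() / 1e6
    assert real_size_mb <= size_mb, f"DataFrame size is too big: {real_size_mb:.1f} MB > {size_mb} MB"

df = pd.DataFrame({"value": range(1_000_000)})  # roughly 8 MB of int64
assert_df_size(df, size_mb=2000)  # passes; raises AssertionError past the ~2 GB budget
```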
- dataset_metadata = pd.DataFrame(metadata["datasets"]).loc[dataset_short_name].to_dict() - items_metadata = pd.DataFrame(metadata["items"]).reset_index() - items_metadata = items_metadata[items_metadata["dataset"] == dataset_short_name].reset_index(drop=True) - elements_metadata = pd.DataFrame(metadata["elements"]).reset_index() - elements_metadata = elements_metadata[elements_metadata["dataset"] == dataset_short_name].reset_index(drop=True) - countries_metadata = pd.DataFrame(metadata["countries"]).reset_index() - amendments = parse_amendments_table(amendments=metadata["amendments"], dataset_short_name=dataset_short_name) - - # - # Process data. - # - # Combine fbsh and fbs datasets. - log.info( - "faostat_fbsc.combine_fbsh_and_fbs_datasets", - fbsh_shape=fbsh_dataset["faostat_fbsh"].shape, - fbs_shape=fbs_dataset["faostat_fbs"].shape, - ) - data = combine_fbsh_and_fbs_datasets(fbsh_dataset, fbs_dataset) - - _assert_df_size(data, 2000) - - # Prepare data. - data = clean_data( - data=data, - items_metadata=items_metadata, - elements_metadata=elements_metadata, - countries_metadata=countries_metadata, - amendments=amendments, - ) - - # Add data for aggregate regions. - data = add_regions(data=data, elements_metadata=elements_metadata) - - # Add per-capita variables. - data = add_per_capita_variables(data=data, elements_metadata=elements_metadata) - - # Handle detected anomalies in the data. - data, anomaly_descriptions = handle_anomalies(dataset_short_name=dataset_short_name, data=data) - - # Avoid objects as they would explode memory, use categoricals instead. - for col in data.columns: - assert data[col].dtype != object, f"Column {col} should not have object type" - - _assert_df_size(data, 2000) - - # Create a long table (with item code and element code as part of the index). - log.info("faostat_fbsc.prepare_long_table", shape=data.shape) - data_table_long = prepare_long_table(data=data) - - _assert_df_size(data_table_long, 2000) - - # Create a wide table (with only country and year as index). - log.info("faostat_fbsc.prepare_wide_table", shape=data.shape) - data_table_wide = prepare_wide_table(data=data) - - # - # Save outputs. - # - # Update tables metadata. - data_table_long.metadata.short_name = dataset_short_name - data_table_long.metadata.title = dataset_metadata["owid_dataset_title"] - data_table_wide.metadata.short_name = f"{dataset_short_name}_flat" - data_table_wide.metadata.title = dataset_metadata["owid_dataset_title"] + ADDED_TITLE_TO_WIDE_TABLE - - # Initialise new garden dataset. - ds_garden = create_dataset( - dest_dir=dest_dir, tables=[data_table_long, data_table_wide], default_metadata=fbs_dataset.metadata - ) - - # Check that the title assigned here coincides with the one in custom_datasets.csv (for consistency). - error = "Dataset title given to fbsc is different to the one in custom_datasets.csv. Update the latter file." - assert DATASET_TITLE == dataset_metadata["owid_dataset_title"], error - - # Update dataset metadata and add description of anomalies (if any) to the dataset description. - ds_garden.metadata.description = dataset_metadata["owid_dataset_description"] + anomaly_descriptions - ds_garden.metadata.title = dataset_metadata["owid_dataset_title"] - - # Update the main source's metadata description (which will be shown in charts). - ds_garden.metadata.sources[0].description = ds_garden.metadata.description - - # Create garden dataset. 
- ds_garden.save() diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_fo.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_fo.py deleted file mode 100644 index bba98a5e224..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_fo.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_fo dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_food_explorer.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_food_explorer.py deleted file mode 100644 index b620f241446..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_food_explorer.py +++ /dev/null @@ -1,555 +0,0 @@ -"""Dataset feeding the global food explorer. - -Load the qcl and fbsc (combination of fbsh and fbs) datasets, and create a combined dataset of food items (now called -products). - -The resulting dataset will later be loaded by the `explorer/food_explorer` which feeds our -[Global food explorer](https://ourworldindata.org/explorers/global-food). - -""" - -from pathlib import Path -from typing import cast - -import pandas as pd -from owid import catalog -from owid.datautils import dataframes -from shared import CURRENT_DIR, NAMESPACE - -from etl.data_helpers import geo -from etl.helpers import PathFinder, create_dataset - -# Dataset name and title. -DATASET_TITLE = "Food Explorer" -DATASET_DESCRIPTION = ( - "This dataset has been created by Our World in Data, merging existing FAOSTAT datasets. In " - "particular, we have used 'Crops and livestock products' (QCL) and 'Food Balances' (FBSH and " - "FBS) datasets. Each row contains all the metrics for a specific combination of (country, " - "product, year). The metrics may come from different datasets." -) - -# The names of the products to include in the food explorer will be further edited in owid-content, following to the -# following file: -# https://github.com/owid/owid-content/blob/master/scripts/global-food-explorer/foods.csv -ITEM_CODES_QCL = [ - "00000060", # From faostat_qcl - 'Maize oil' (previously 'Maize oil'). - "00000567", # From faostat_qcl - 'Watermelons' (previously 'Watermelons'). - "00000075", # From faostat_qcl - 'Oats' (previously 'Oats'). - "00000191", # From faostat_qcl - 'Chickpeas' (previously 'Chickpeas'). - "00001069", # From faostat_qcl - 'Meat of ducks, fresh or chilled' (previously 'Meat, duck'). - "00000957", # From faostat_qcl - 'Buffalo hides' (previously 'Buffalo hides'). - "00000176", # From faostat_qcl - 'Beans, dry' (previously 'Beans, dry'). - "00001182", # From faostat_qcl - 'Honey' (previously 'Honey'). - "00000399", # From faostat_qcl - 'Eggplants' (previously 'Eggplants'). - "00000554", # From faostat_qcl - 'Cranberries' (previously 'Cranberries'). - "00000296", # From faostat_qcl - 'Poppy seeds' (previously 'Poppy seeds'). - "00000201", # From faostat_qcl - 'Lentils, dry' (previously 'Lentils'). - "00000268", # From faostat_qcl - 'Sunflower oil' (previously 'Sunflower oil'). - "00001806", # From faostat_qcl - 'Meat, beef and buffalo' (previously 'Meat, beef and buffalo'). - "00000600", # From faostat_qcl - 'Papayas' (previously 'Papayas'). - "00000334", # From faostat_qcl - 'Linseed oil' (previously 'Linseed oil'). - "00001097", # From faostat_qcl - 'Horse meat, fresh or chilled' (previously 'Meat, horse'). - "00000165", # From faostat_qcl - 'Molasses' (previously 'Molasses'). - "00000426", # From faostat_qcl - 'Carrots and turnips' (previously 'Carrots and turnips'). 
- "00000216", # From faostat_qcl - 'Brazil nuts, in shell' (previously 'Brazil nuts, with shell'). - "00000137", # From faostat_qcl - 'Yams' (previously 'Yams'). - "00000222", # From faostat_qcl - 'Walnuts' (previously 'Walnuts'). - "00000289", # From faostat_qcl - 'Sesame seed' (previously 'Sesame seed'). - "00000122", # From faostat_qcl - 'Sweet potatoes' (previously 'Sweet potatoes'). - "00001738", # From faostat_qcl - 'Fruit' (previously 'Fruit'). - "00001780", # From faostat_qcl - 'Milk' (previously 'Milk'). - "00001804", # From faostat_qcl - 'Citrus Fruit' (previously 'Citrus Fruit'). - "00000656", # From faostat_qcl - 'Coffee, green' (previously 'Coffee, green'). - "00001019", # From faostat_qcl - 'Goat fat, unrendered' (previously 'Fat, goats'). - "00000225", # From faostat_qcl - 'Hazelnuts' (previously 'Hazelnuts'). - "00000406", # From faostat_qcl - 'Green garlic' (previously 'Garlic'). - "00000995", # From faostat_qcl - 'Skins, sheep' (previously 'Skins, sheep'). - "00000244", # From faostat_qcl - 'Groundnut oil' (previously 'Groundnut oil'). - "00000281", # From faostat_qcl - 'Safflower oil' (previously 'Safflower oil'). - "00000267", # From faostat_qcl - 'Sunflower seed' (previously 'Sunflower seed'). - "00001025", # From faostat_qcl - 'Skins, goat' (previously 'Skins, goat'). - "00000252", # From faostat_qcl - 'Coconut oil' (previously 'Coconut oil'). - "00000256", # From faostat_qcl - 'Palm kernels' (previously 'Palm kernels'). - "00000868", # From faostat_qcl - 'Offals, cattle' (previously 'Offals, cattle'). - "00000292", # From faostat_qcl - 'Mustard seed' (previously 'Mustard seed'). - "00000101", # From faostat_qcl - 'Canary seed' (previously 'Canary seed'). - "00001098", # From faostat_qcl - 'Edible offals of horses and other equines, fresh, chilled or frozen' (previously 'Offals, horses'). - "00001062", # From faostat_qcl - 'Eggs from hens' (previously 'Eggs from hens'). - "00001808", # From faostat_qcl - 'Meat, poultry' (previously 'Meat, poultry'). - "00000258", # From faostat_qcl - 'Palm kernel oil' (previously 'Palm kernel oil'). - "00000156", # From faostat_qcl - 'Sugar cane' (previously 'Sugar cane'). - "00000373", # From faostat_qcl - 'Spinach' (previously 'Spinach'). - "00000773", # From faostat_qcl - 'Flax fibre' (previously 'Flax fibre'). - "00000116", # From faostat_qcl - 'Potatoes' (previously 'Potatoes'). - "00000869", # From faostat_qcl - 'Cattle fat, unrendered' (previously 'Fat, cattle'). - "00000358", # From faostat_qcl - 'Cabbages' (previously 'Cabbages'). - "00000767", # From faostat_qcl - 'Cotton' (previously 'Cotton'). - "00000388", # From faostat_qcl - 'Tomatoes' (previously 'Tomatoes'). - "00000220", # From faostat_qcl - 'Chestnuts, in shell' (previously 'Chestnut'). - "00000027", # From faostat_qcl - 'Rice' (previously 'Rice'). - "00000367", # From faostat_qcl - 'Asparagus' (previously 'Asparagus'). - "00000977", # From faostat_qcl - 'Meat, lamb and mutton' (previously 'Meat, lamb and mutton'). - "00000015", # From faostat_qcl - 'Wheat' (previously 'Wheat'). - "00001127", # From faostat_qcl - 'Meat of camels, fresh or chilled' (previously 'Meat, camel'). - "00001183", # From faostat_qcl - 'Beeswax' (previously 'Beeswax'). - "00001720", # From faostat_qcl - 'Roots and tubers' (previously 'Roots and tubers'). - "00001186", # From faostat_qcl - 'Silk' (previously 'Silk'). - "00000826", # From faostat_qcl - 'Tobacco' (previously 'Tobacco'). - "00000978", # From faostat_qcl - 'Offals, sheep' (previously 'Offals, sheep'). 
- "00000948", # From faostat_qcl - 'Offals, buffaloes' (previously 'Offals, buffaloes'). - "00000226", # From faostat_qcl - 'Areca nuts' (previously 'Areca nuts'). - "00000417", # From faostat_qcl - 'Peas, green' (previously 'Peas, green'). - "00000407", # From faostat_qcl - 'Leeks' (previously 'Leeks'). - "00000224", # From faostat_qcl - 'Kola nuts' (previously 'Kola nuts'). - "00000079", # From faostat_qcl - 'Millet' (previously 'Millet'). - "00000568", # From faostat_qcl - 'Melon' (previously 'Melon'). - "00000900", # From faostat_qcl - 'Whey' (previously 'Whey'). - "00000544", # From faostat_qcl - 'Strawberries' (previously 'Strawberries'). - "00000333", # From faostat_qcl - 'Linseed' (previously 'Linseed'). - "00000571", # From faostat_qcl - 'Mangoes' (previously 'Mangoes'). - "00000534", # From faostat_qcl - 'Peaches and nectarines' (previously 'Peaches and nectarines'). - "00000372", # From faostat_qcl - 'Lettuce' (previously 'Lettuce'). - "00001080", # From faostat_qcl - 'Meat of turkeys, fresh or chilled' (previously 'Meat, turkey'). - "00000083", # From faostat_qcl - 'Sorghum' (previously 'Sorghum'). - "00001732", # From faostat_qcl - 'Oilcrops, Oil Equivalent' (previously 'Oilcrops, Oil Equivalent'). - "00000336", # From faostat_qcl - 'Hempseed' (previously 'Hempseed'). - "00000397", # From faostat_qcl - 'Cucumbers and gherkins' (previously 'Cucumbers and gherkins'). - "00000223", # From faostat_qcl - 'Pistachios, in shell' (previously 'Pistachios'). - "00000242", # From faostat_qcl - 'Groundnuts' (previously 'Groundnuts'). - "00000489", # From faostat_qcl - 'Plantains' (previously 'Plantains'). - "00000495", # From faostat_qcl - 'Tangerines' (previously 'Tangerines'). - "00000195", # From faostat_qcl - 'Cow peas' (previously 'Cow peas'). - "00000290", # From faostat_qcl - 'Sesame oil' (previously 'Sesame oil'). - "00000497", # From faostat_qcl - 'Lemons and limes' (previously 'Lemons and limes'). - "00000711", # From faostat_qcl - 'Herbs (e.g. fennel)' (previously 'Herbs (e.g. fennel)'). - "00001129", # From faostat_qcl - 'Fat of camels' (previously 'Fat, camels'). - "00000577", # From faostat_qcl - 'Dates' (previously 'Dates'). - "00001108", # From faostat_qcl - 'Meat of asses, fresh or chilled' (previously 'Meat, ass'). - "00000071", # From faostat_qcl - 'Rye' (previously 'Rye'). - "00001073", # From faostat_qcl - 'Meat of geese, fresh or chilled' (previously 'Meat, goose and guinea fowl'). - "00000687", # From faostat_qcl - 'Pepper' (previously 'Pepper'). - "00000280", # From faostat_qcl - 'Safflower seed' (previously 'Safflower seed'). - "00000157", # From faostat_qcl - 'Sugar beet' (previously 'Sugar beet'). - "00000271", # From faostat_qcl - 'Rapeseed oil' (previously 'Rapeseed oil'). - "00001735", # From faostat_qcl - 'Vegetables' (previously 'Vegetables'). - "00001035", # From faostat_qcl - 'Meat of pig with the bone, fresh or chilled' (previously 'Meat, pig'). - "00001128", # From faostat_qcl - 'Offals, camels' (previously 'Offals, camels'). - "00000564", # From faostat_qcl - 'Wine' (previously 'Wine'). - "00000092", # From faostat_qcl - 'Quinoa' (previously 'Quinoa'). - "00000507", # From faostat_qcl - 'Grapefruit' (previously 'Grapefruit'). - "00000089", # From faostat_qcl - 'Buckwheat' (previously 'Buckwheat'). - "00000949", # From faostat_qcl - 'Buffalo fat, unrendered' (previously 'Fat, buffaloes'). - "00000821", # From faostat_qcl - 'Fibre crops' (previously 'Fibre crops'). - "00000221", # From faostat_qcl - 'Almonds' (previously 'Almonds'). 
- "00000328", # From faostat_qcl - 'Seed cotton, unginned' (previously 'Seed cotton'). - "00001717", # From faostat_qcl - 'Cereals' (previously 'Cereals'). - "00000547", # From faostat_qcl - 'Raspberries' (previously 'Raspberries'). - "00000187", # From faostat_qcl - 'Peas, dry' (previously 'Peas, dry'). - "00000560", # From faostat_qcl - 'Grapes' (previously 'Grapes'). - "00000689", # From faostat_qcl - 'Chillies and peppers' (previously 'Chillies and peppers'). - "00001091", # From faostat_qcl - 'Eggs from other birds (excl. hens)' (previously 'Eggs from other birds (excl. hens)'). - "00001163", # From faostat_qcl - 'Game meat, fresh, chilled or frozen' (previously 'Meat, game'). - "00001807", # From faostat_qcl - 'Meat, sheep and goat' (previously 'Meat, sheep and goat'). - "00001141", # From faostat_qcl - 'Meat of rabbits and hares, fresh or chilled' (previously 'Meat, rabbit'). - "00000490", # From faostat_qcl - 'Oranges' (previously 'Oranges'). - "00001841", # From faostat_qcl - 'Oilcrops, Cake Equivalent' (previously 'Oilcrops, Cake Equivalent'). - "00000552", # From faostat_qcl - 'Blueberries' (previously 'Blueberries'). - "00001783", # From faostat_qcl - 'Eggs' (previously 'Eggs'). - "00000254", # From faostat_qcl - 'Palm fruit oil' (previously 'Palm fruit oil'). - "00000263", # From faostat_qcl - 'Karite nuts' (previously 'Karite nuts'). - "00000044", # From faostat_qcl - 'Barley' (previously 'Barley'). - "00001036", # From faostat_qcl - 'Offals, pigs' (previously 'Offals, pigs'). - "00000446", # From faostat_qcl - 'Green maize' (previously 'Green maize'). - "00001745", # From faostat_qcl - 'Cheese' (previously 'Cheese'). - "00000261", # From faostat_qcl - 'Olive oil' (previously 'Olive oil'). - "00000236", # From faostat_qcl - 'Soya beans' (previously 'Soybeans'). - "00000125", # From faostat_qcl - 'Cassava, fresh' (previously 'Cassava'). - "00000260", # From faostat_qcl - 'Olives' (previously 'Olives'). - "00000329", # From faostat_qcl - 'Cotton seed' (previously 'Cottonseed'). - "00000521", # From faostat_qcl - 'Pears' (previously 'Pears'). - "00001018", # From faostat_qcl - 'Offals, goats' (previously 'Offals, goats'). - "00001765", # From faostat_qcl - 'Meat, total' (previously 'Meat, total'). - "00000550", # From faostat_qcl - 'Currants' (previously 'Currants'). - "00001058", # From faostat_qcl - 'Meat of chickens, fresh or chilled' (previously 'Meat, chicken'). - "00000197", # From faostat_qcl - 'Pigeon peas, dry' (previously 'Pigeon peas'). - "00000270", # From faostat_qcl - 'Rape or colza seed' (previously 'Rapeseed'). - "00000526", # From faostat_qcl - 'Apricots' (previously 'Apricots'). - "00000592", # From faostat_qcl - 'Kiwi' (previously 'Kiwi'). - "00000237", # From faostat_qcl - 'Soybean oil' (previously 'Soybean oil'). - "00000947", # From faostat_qcl - 'Meat of buffalo, fresh or chilled' (previously 'Meat, buffalo'). - "00000265", # From faostat_qcl - 'Castor oil seeds' (previously 'Castor oil seed'). - "00000430", # From faostat_qcl - 'Okra' (previously 'Okra'). - "00000331", # From faostat_qcl - 'Cottonseed oil' (previously 'Cottonseed oil'). - "00000103", # From faostat_qcl - 'Mixed grains' (previously 'Mixed grains'). - "00000486", # From faostat_qcl - 'Bananas' (previously 'Bananas'). - "00000919", # From faostat_qcl - 'Cattle hides' (previously 'Cattle hides'). - "00001242", # From faostat_qcl - 'Margarine' (previously 'Margarine'). - "00000449", # From faostat_qcl - 'Mushrooms' (previously 'Mushrooms'). 
- "00001037", # From faostat_qcl - 'Fat of pigs' (previously 'Fat, pigs'). - "00001729", # From faostat_qcl - 'Treenuts' (previously 'Treenuts'). - "00000366", # From faostat_qcl - 'Artichokes' (previously 'Artichokes'). - "00000217", # From faostat_qcl - 'Cashew nuts' (previously 'Cashew nuts'). - "00000299", # From faostat_qcl - 'Melonseed' (previously 'Melonseed'). - "00000574", # From faostat_qcl - 'Pineapples' (previously 'Pineapples'). - "00000979", # From faostat_qcl - 'Sheep fat, unrendered' (previously 'Fat, sheep'). - "00000987", # From faostat_qcl - 'Wool' (previously 'Wool'). - "00000423", # From faostat_qcl - 'String beans' (previously 'String beans'). - "00000249", # From faostat_qcl - 'Coconuts, in shell' (previously 'Coconuts'). - "00000780", # From faostat_qcl - 'Jute, raw or retted' (previously 'Jute'). - "00000536", # From faostat_qcl - 'Plums' (previously 'Plums'). - "00001111", # From faostat_qcl - 'Meat of mules, fresh or chilled' (previously 'Meat, mule'). - "00001723", # From faostat_qcl - 'Sugar crops' (previously 'Sugar crops'). - "00001726", # From faostat_qcl - 'Pulses' (previously 'Pulses'). - "00000162", # From faostat_qcl - 'Sugar (raw)' (previously 'Sugar (raw)'). - "00000667", # From faostat_qcl - 'Tea leaves' (previously 'Tea'). - "00000056", # From faostat_qcl - 'Maize (corn)' (previously 'Maize'). - "00000257", # From faostat_qcl - 'Palm oil' (previously 'Palm oil'). - "00000393", # From faostat_qcl - 'Cauliflowers and broccoli' (previously 'Cauliflowers and broccoli'). - "00000531", # From faostat_qcl - 'Cherries' (previously 'Cherries'). - "00000572", # From faostat_qcl - 'Avocados' (previously 'Avocados'). - "00000403", # From faostat_qcl - 'Onions' (previously 'Onions'). - "00000515", # From faostat_qcl - 'Apples' (previously 'Apples'). - "00000414", # From faostat_qcl - 'Other beans, green' (previously 'Beans, green'). - "00001017", # From faostat_qcl - 'Meat of goat, fresh or chilled' (previously 'Meat, goat'). - "00000181", # From faostat_qcl - 'Broad beans' (previously 'Broad beans'). -] - -ITEM_CODES_FBSC = [ - "00002576", # From faostat_fbsc - 'Palm kernel oil' (previously 'Palm kernel oil'). - "00002516", # From faostat_fbsc - 'Oats' (previously 'Oats'). - "00002562", # From faostat_fbsc - 'Palm kernels' (previously 'Palm kernels'). - "00002551", # From faostat_fbsc - 'Nuts' (previously 'Nuts'). - "00002913", # From faostat_fbsc - 'Oilcrops' (previously 'Oilcrops'). - "00002533", # From faostat_fbsc - 'Sweet potatoes' (previously 'Sweet potatoes'). - "00002560", # From faostat_fbsc - 'Coconuts' (previously 'Coconuts'). - "00002511", # From faostat_fbsc - 'Wheat' (previously 'Wheat'). - "00002557", # From faostat_fbsc - 'Sunflower seed' (previously 'Sunflower seed'). - "00002602", # From faostat_fbsc - 'Onions' (previously 'Onions'). - "00002734", # From faostat_fbsc - 'Meat, poultry' (previously 'Meat, poultry'). - "00002572", # From faostat_fbsc - 'Groundnut oil' (previously 'Groundnut oil'). - "00002736", # From faostat_fbsc - 'Offals' (previously 'Offals'). - "00002579", # From faostat_fbsc - 'Sesame oil' (previously 'Sesame oil'). - "00002552", # From faostat_fbsc - 'Groundnuts' (previously 'Groundnuts'). - "00002943", # From faostat_fbsc - 'Meat, total' (previously 'Meat, total'). - "00002912", # From faostat_fbsc - 'Treenuts' (previously 'Treenuts'). - "00002611", # From faostat_fbsc - 'Oranges' (previously 'Oranges'). - "00002616", # From faostat_fbsc - 'Plantains' (previously 'Plantains'). 
- "00002617", # From faostat_fbsc - 'Apples' (previously 'Apples'). - "00002563", # From faostat_fbsc - 'Olives' (previously 'Olives'). - "00002513", # From faostat_fbsc - 'Barley' (previously 'Barley'). - "00002532", # From faostat_fbsc - 'Cassava' (previously 'Cassava'). - "00002918", # From faostat_fbsc - 'Vegetables' (previously 'Vegetables'). - "00002948", # From faostat_fbsc - 'Milk' (previously 'Milk'). - "00002613", # From faostat_fbsc - 'Grapefruit' (previously 'Grapefruit'). - "00002555", # From faostat_fbsc - 'Soybeans' (previously 'Soybeans'). - "00002537", # From faostat_fbsc - 'Sugar beet' (previously 'Sugar beet'). - "00002640", # From faostat_fbsc - 'Pepper' (previously 'Pepper'). - "00002536", # From faostat_fbsc - 'Sugar cane' (previously 'Sugar cane'). - "00002633", # From faostat_fbsc - 'Cocoa beans' (previously 'Cocoa beans'). - "00002561", # From faostat_fbsc - 'Sesame seed' (previously 'Sesame seed'). - "00002546", # From faostat_fbsc - 'Beans, dry' (previously 'Beans, dry'). - "00002740", # From faostat_fbsc - 'Butter and ghee' (previously 'Butter and ghee'). - "00002514", # From faostat_fbsc - 'Maize' (previously 'Maize'). - "00002575", # From faostat_fbsc - 'Cottonseed oil' (previously 'Cottonseed oil'). - "00002641", # From faostat_fbsc - 'Chillies and peppers' (previously 'Chillies and peppers'). - "00002733", # From faostat_fbsc - 'Pork' (previously 'Pork'). - "00002919", # From faostat_fbsc - 'Fruit' (previously 'Fruit'). - "00002655", # From faostat_fbsc - 'Wine' (previously 'Wine'). - "00002618", # From faostat_fbsc - 'Pineapples' (previously 'Pineapples'). - "00002612", # From faostat_fbsc - 'Lemons and limes' (previously 'Lemons and limes'). - "00002580", # From faostat_fbsc - 'Olive oil' (previously 'Olive oil'). - "00002515", # From faostat_fbsc - 'Rye' (previously 'Rye'). - "00002582", # From faostat_fbsc - 'Maize oil' (previously 'Maize oil'). - "00002731", # From faostat_fbsc - 'Meat, beef' (previously 'Meat, beef'). - "00002518", # From faostat_fbsc - 'Sorghum' (previously 'Sorghum'). - "00002949", # From faostat_fbsc - 'Eggs' (previously 'Eggs'). - "00002531", # From faostat_fbsc - 'Potatoes' (previously 'Potatoes'). - "00002615", # From faostat_fbsc - 'Bananas' (previously 'Bananas'). - "00002573", # From faostat_fbsc - 'Sunflower oil' (previously 'Sunflower oil'). - "00002578", # From faostat_fbsc - 'Coconut oil' (previously 'Coconut oil'). - "00002601", # From faostat_fbsc - 'Tomatoes' (previously 'Tomatoes'). - "00002571", # From faostat_fbsc - 'Soybean oil' (previously 'Soybean oil'). - "00002559", # From faostat_fbsc - 'Cottonseed' (previously 'Cottonseed'). - "00002732", # From faostat_fbsc - 'Meat, sheep and goat' (previously 'Meat, sheep and goat'). - "00002901", # From faostat_fbsc - 'Total' (previously 'Total'). - "00002619", # From faostat_fbsc - 'Dates' (previously 'Dates'). - "00002911", # From faostat_fbsc - 'Pulses' (previously 'Pulses'). - "00002535", # From faostat_fbsc - 'Yams' (previously 'Yams'). - "00002745", # From faostat_fbsc - 'Honey' (previously 'Honey'). - "00002737", # From faostat_fbsc - 'Animal fats' (previously 'Animal fats'). - "00002517", # From faostat_fbsc - 'Millet' (previously 'Millet'). - "00002547", # From faostat_fbsc - 'Peas, dry' (previously 'Peas, dry'). - "00002807", # From faostat_fbsc - 'Rice' (previously 'Rice'). - "00002960", # From faostat_fbsc - 'Fish and seafood' (previously 'Fish and seafood'). - "00002908", # From faostat_fbsc - 'Sugar crops' (previously 'Sugar crops'). 
-] - -# OWID item name, element name, and unit name for population (as given in faostat_qcl and faostat_fbsc datasets). -FAO_POPULATION_ITEM_NAME = "Population" -FAO_POPULATION_ELEMENT_NAME = "Total Population - Both sexes" -FAO_POPULATION_UNIT = "1000 persons" - -# List of element codes to consider from faostat_qcl. -ELEMENT_CODES_QCL = [ - "005312", - "005313", - "005314", - "005318", - "005320", - "005321", - "005410", - "005413", - "005417", - "005419", - "005420", - "005422", - "005424", - "005510", - "005513", - "5312pc", - "5320pc", - "5321pc", - "5510pc", -] -# List of element codes to consider from faostat_fbsc. -ELEMENT_CODES_FBSC = [ - "000645", - "000664", - "000674", - "000684", - "005072", - "005123", - "005131", - "005142", - "005154", - "005170", - "005171", - "005301", - # Element 'Production' (in tonnes, originally given in 1000 tonnes) is taken from qcl. - # Although fbsc has items for this element that are not in qcl, they overlap in a number of items with slightly - # different values. To avoid this issue, we ignore the element from fbsc and use only the one in qcl. - # '005511', - "005521", - "005527", - "005611", - "005911", - "0645pc", - "0664pc", - "0674pc", - "0684pc", - "5123pc", - "5142pc", - "5154pc", - "5301pc", - "5521pc", - "5611pc", - "5911pc", - # The following element code is for population. - "000511", -] - - -def combine_qcl_and_fbsc(qcl_table: catalog.Table, fbsc_table: catalog.Table) -> pd.DataFrame: - """Combine garden `faostat_qcl` and `faostat_fbsc` datasets. - - Parameters - ---------- - qcl_table : catalog.Table - Main table (in long format) of the `faostat_qcl` dataset. - fbsc_table : catalog.Table - Main table (in long format) of the `faostat_fbsc` dataset. - - Returns - ------- - combined : pd.DataFrame - Combined data (as a dataframe, not a table). - - """ - columns = [ - "country", - "year", - "item_code", - "element_code", - "item", - "element", - "unit", - "unit_short_name", - "value", - "population_with_data", - ] - qcl = pd.DataFrame(qcl_table).reset_index()[columns] - # Select relevant element codes. - qcl = qcl[qcl["element_code"].isin(ELEMENT_CODES_QCL)].reset_index(drop=True) - qcl["value"] = qcl["value"].astype(float) - qcl["element"] = [element for element in qcl["element"]] - qcl["unit"] = [unit for unit in qcl["unit"]] - qcl["item"] = [item for item in qcl["item"]] - fbsc = pd.DataFrame(fbsc_table).reset_index()[columns] - # Select relevant element codes. - fbsc = fbsc[fbsc["element_code"].isin(ELEMENT_CODES_FBSC)].reset_index(drop=True) - fbsc["value"] = fbsc["value"].astype(float) - fbsc["element"] = [element for element in fbsc["element"]] - fbsc["unit"] = [unit for unit in fbsc["unit"]] - fbsc["item"] = [item for item in fbsc["item"]] - - rename_columns = {"item": "product"} - combined = ( - dataframes.concatenate([qcl, fbsc], ignore_index=True).rename(columns=rename_columns).reset_index(drop=True) - ) - - # Sanity checks. - assert len(combined) == (len(qcl) + len(fbsc)), "Unexpected number of rows after combining qcl and fbsc datasets." - - assert len(combined[combined["value"].isnull()]) == 0, "Unexpected nan values." - - n_items_per_item_code = combined.groupby("item_code")["product"].transform("nunique") - assert combined[n_items_per_item_code > 1].empty, "There are item codes with multiple items." - - n_elements_per_element_code = combined.groupby("element_code")["element"].transform("nunique") - assert combined[n_elements_per_element_code > 1].empty, "There are element codes with multiple elements." 
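A note on the no-op-looking list comprehensions above (`qcl["element"] = [element for element in qcl["element"]]` and friends): iterating a categorical column and reassigning it casts it to a plain object column. The reason categoricals need this kind of care around concatenation is shown below on toy data; `union_categoricals` is standard pandas, while the step itself relies on `owid.datautils.dataframes.concatenate`.

```python
# Why concatenating categoricals needs care: pandas silently upcasts to
# object when the categories differ, losing the memory-efficient dtype.
import pandas as pd
from pandas.api.types import union_categoricals

a = pd.DataFrame({"item": pd.Categorical(["Wheat"])})
b = pd.DataFrame({"item": pd.Categorical(["Rice"])})

print(pd.concat([a, b])["item"].dtype)  # object: the categorical dtype is lost

# Preserving the dtype requires unioning the categories explicitly:
merged = union_categoricals([a["item"], b["item"]])
print(merged.dtype)  # category
```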
- - n_units_per_element_code = combined.groupby("element_code")["unit"].transform("nunique") - assert combined[n_units_per_element_code > 1].empty, "There are element codes with multiple units." - - error = "There are unexpected duplicate rows. Rename items in custom_items.csv to avoid clashes." - assert combined[combined.duplicated(subset=["product", "country", "year", "element", "unit"])].empty, error - - return cast(pd.DataFrame, combined) - - -def get_fao_population(combined: pd.DataFrame) -> pd.DataFrame: - """Extract the FAO population data from data (in long format). - - Parameters - ---------- - combined : pd.DataFrame - Combination of `faostat_qcl` and `faostat_fbsc` data (although this function could also be applied to just - `faostat_fbsc` data, since `faostat_qcl` does not contain FAO population data). - - Returns - ------- - fao_population : pd.DataFrame - Population (by country and year) according to FAO, extracted from the `faostat_fbsc` dataset. - - """ - # Select the item and element that correspond to population values. - fao_population = combined[ - (combined["product"] == FAO_POPULATION_ITEM_NAME) & (combined["element"] == FAO_POPULATION_ELEMENT_NAME) - ].reset_index(drop=True) - - # Check that population is given in "1000 persons" and convert to persons. - error = "FAOSTAT population changed item, element, or unit." - assert list(fao_population["unit"].unique()) == [FAO_POPULATION_UNIT], error - fao_population["value"] *= 1000 - - # Drop missing values and prepare output dataframe. - fao_population = ( - fao_population[["country", "year", "value"]].dropna(how="any").rename(columns={"value": "fao_population"}) - ) - - return fao_population - - -def process_combined_data(combined: pd.DataFrame) -> pd.DataFrame: - """Process combined data (combination of `faostat_qcl` and `faostat_fbsc` data) to have the content and format - required by the food explorer. - - Parameters - ---------- - combined : pd.DataFrame - Combination of `faostat_qcl` and `faostat_fbsc` data. - - Returns - ------- - data_wide : pd.DataFrame - Processed data (in wide format). - - """ - combined = combined.copy() - - # Get FAO population from data (it is given as another item). - fao_population = get_fao_population(combined=combined) - - # List of all item codes to select. - selected_item_codes = sorted(set(ITEM_CODES_FBSC).union(ITEM_CODES_QCL)) - - # Check that all expected products are included in the data. - missing_products = sorted(set(selected_item_codes) - set(combined["item_code"])) - assert len(missing_products) == 0, f"{len(missing_products)} missing products for food explorer." - - # Select relevant products for the food explorer. - combined = combined[combined["item_code"].isin(selected_item_codes)].reset_index(drop=True) - - # Join element and unit into one title column. - combined["title"] = combined["element"] + " (" + combined["unit"] + ")" - - # Pivot to a wide table with one column per element-unit title, indexed by product, country, and year. - index_columns = ["product", "country", "year"] - data_wide = combined.pivot(index=index_columns, columns=["title"], values="value").reset_index() - - # Add column for FAO population. - data_wide = pd.merge(data_wide, fao_population, on=["country", "year"], how="left") - - # Add column for OWID population. - data_wide = geo.add_population_to_dataframe(df=data_wide, warn_on_missing_countries=False) - - # Fill gaps in OWID population with FAO population (for "* (FAO)" countries, i.e. countries that were not - harmonized and for which there is no OWID population).
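`process_combined_data` above makes two reshaping moves that are easier to see in isolation: pivoting the long table into one column per "element (unit)" title, and, just below, falling back to FAO population where OWID population is missing. A toy version with invented numbers:

```python
# Toy version (invented numbers) of the explorer reshape and the population
# fallback that follows it in the step.
import numpy as np
import pandas as pd

combined = pd.DataFrame({
    "product": ["Wheat"] * 4,
    "country": ["Spain", "Spain", "Micronesia (FAO)", "Micronesia (FAO)"],
    "year": [2020] * 4,
    "title": ["Production (tonnes)", "Area harvested (hectares)"] * 2,
    "value": [100.0, 20.0, 1.0, 0.5],
})

# One column per title, indexed by product/country/year.
wide = combined.pivot(index=["product", "country", "year"], columns="title", values="value").reset_index()

pop = pd.DataFrame({
    "country": ["Spain", "Micronesia (FAO)"],
    "population": [47e6, np.nan],           # OWID population; missing for "* (FAO)" countries
    "fao_population": [47.3e6, 113_000.0],  # FAO population, extracted from the data itself
})
wide = pd.merge(wide, pop, on="country", how="left")

# Prefer OWID population, fall back to FAO population, then drop the helper column.
wide["population"] = wide["population"].fillna(wide["fao_population"])
wide = wide.drop(columns="fao_population")
print(wide)
```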
- # Then drop "fao_population", since it is no longer needed. - data_wide["population"] = data_wide["population"].fillna(data_wide["fao_population"]) - data_wide = data_wide.drop(columns="fao_population") - - assert len(data_wide.columns[data_wide.isnull().all(axis=0)]) == 0, "Unexpected columns with only nan values." - - # Set a reasonable index. - data_wide = data_wide.set_index(index_columns, verify_integrity=True) - - return data_wide - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Fetch the dataset short name from dest_dir. - dataset_short_name = Path(dest_dir).name - - # Define path to current step file. - current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py") - - # Get paths and naming conventions for current data step. - paths = PathFinder(current_step_file.as_posix()) - - # Load latest qcl and fbsc datasets from garden. - qcl_dataset: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_qcl") - fbsc_dataset: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_fbsc") - - # Get main long tables from qcl and fbsc datasets. - qcl_table = qcl_dataset[f"{NAMESPACE}_qcl"] - fbsc_table = fbsc_dataset[f"{NAMESPACE}_fbsc"] - - # - # Process data. - # - # Combine qcl and fbsc data. - data = combine_qcl_and_fbsc(qcl_table=qcl_table, fbsc_table=fbsc_table) - - # Prepare data in the format required by the food explorer. - data = process_combined_data(combined=data) - - # Create table of products. - table = catalog.Table(data, short_name=dataset_short_name) - - # - # Save outputs. - # - # Initialise new garden dataset. - ds_garden = create_dataset(dest_dir=dest_dir, tables=[table], default_metadata=fbsc_dataset.metadata) - - # Update dataset metadata and combine sources from qcl and fbsc datasets. - ds_garden.metadata.title = DATASET_TITLE - ds_garden.metadata.description = DATASET_DESCRIPTION - ds_garden.metadata.sources = fbsc_dataset.metadata.sources + qcl_dataset.metadata.sources - - # Create new dataset in garden. 
- ds_garden.save() diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_fs.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_fs.py deleted file mode 100644 index a836381fb94..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_fs.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_fs dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_gn.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_gn.py deleted file mode 100644 index 4d77b41c597..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_gn.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_gn dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_ic.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_ic.py deleted file mode 100644 index 5e86234ddde..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_ic.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_ic dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_lc.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_lc.py deleted file mode 100644 index ab508fd95ad..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_lc.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_lc dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_metadata.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_metadata.py deleted file mode 100644 index 4c571988e9a..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_metadata.py +++ /dev/null @@ -1,1052 +0,0 @@ -"""FAOSTAT garden step for faostat_metadata dataset. - -This step reads from: -* The (additional) metadata dataset. The only crucial ingredients from here (that will be used later on in other garden - steps) are element, item and unit descriptions, and country groups (used to check that we do not double count - countries when aggregating data for regions). -* Custom datasets file ('./custom_datasets.csv'). -* Custom elements and units file ('./custom_elements_and_units.csv'). -* Custom items file ('./custom_items.csv'). -* Value amendments file ('./value_amendments.csv'). -* Each of the individual meadow datasets. They are loaded to extract their countries, items, elements and units, and - some sanity checks are performed. - -This step will: -* Output a dataset (to be loaded by all garden datasets) with tables 'countries', 'datasets', 'elements', 'items' - and 'amendments'. -* Apply sanity checks to countries, elements, items, and units. -* Apply custom names and descriptions to datasets, elements, items and units. -* Check that spurious values in value_amendments.csv are in the data, and whether there are new spurious values. -* Harmonize country names. -* Find countries that correspond to aggregates of other countries (e.g. 'Melanesia'). -* Ensure there are no degeneracies within a dataset (i.e. ensure each index is unique). -* Ensure there are no degeneracies between datasets (using dataset, item_code, element_code as keys). - -There are some non-trivial issues with the definitions of items at FAOSTAT: -* Some item codes in the data are missing in the metadata, and vice versa. -* The mapping item_code -> item in the data files is sometimes different from the mapping item_code -> item - in the (additional) metadata dataset.
Some examples: - * In dataset qv, item code 221 in the data corresponds to item "Almonds, in shell", whereas in the metadata, - item code 221 corresponds to item "Almonds, with shell", which is the same item, but with a slightly different - name. This happens with many items. On the website (https://www.fao.org/faostat/en/?#data/QV) they seem to be - using the naming from the metadata. We can safely ignore this issue, and stick to the names in the data. - -There are several cases in which one or a few item codes in the data are missing in the metadata. Also, there are -several cases in which an item code in the data has an item name slightly different in the metadata. But these are not -important issues (since we use item_code to merge different datasets, and we use metadata only to fetch descriptions). - -""" - -import json -import sys -from copy import deepcopy -from pathlib import Path -from typing import Dict, List, Tuple, cast - -import pandas as pd -from owid import catalog -from owid.datautils import dataframes, io -from shared import ( - CURRENT_DIR, - FAOSTAT_METADATA_SHORT_NAME, - FLAGS_RANKING, - N_CHARACTERS_ELEMENT_CODE, - N_CHARACTERS_ITEM_CODE, - NAMESPACE, - harmonize_elements, - harmonize_items, - log, - optimize_table_dtypes, - prepare_dataset_description, -) -from tqdm.auto import tqdm - -from etl.helpers import PathFinder - -# Minimum number of issues in the comparison of items and item codes from data and metadata to raise a warning. -N_ISSUES_ON_ITEMS_FOR_WARNING = 1 - - -def create_dataset_descriptions_dataframe_for_domain(table: catalog.Table, dataset_short_name: str) -> pd.DataFrame: - """Create a single row dataframe with the dataset name, title and description, for a given domain. - - Parameters - ---------- - table : catalog.Table - Latest table for considered domain. - dataset_short_name : str - Dataset short name (e.g. 'faostat_qcl'). - - Returns - ------- - dataset_descriptions_df : pd.DataFrame - Dataframe of name, title and description of a domain. - - """ - dataset_descriptions_df = pd.DataFrame( - { - "dataset": [dataset_short_name], - "fao_dataset_title": [table.metadata.dataset.title], - "fao_dataset_description": [table.metadata.dataset.description], - } - ) - - return dataset_descriptions_df - - -def clean_global_dataset_descriptions_dataframe( - datasets_df: pd.DataFrame, custom_datasets: pd.DataFrame -) -> pd.DataFrame: - """Apply sanity checks to the dataframe gathered from the data of each individual datasets, and add custom dataset - titles and descriptions. - - Parameters - ---------- - datasets_df : pd.DataFrame - Dataframe of descriptions gathered from the data of each individual dataset. - custom_datasets : pd.DataFrame - Data from the custom_datasets.csv file. - - Returns - ------- - datasets_df : pd.Dataframe - Clean dataframe of dataset titles and descriptions (customized and original FAO ones). - - """ - datasets_df = datasets_df.copy() - - # Check that the dataset descriptions of fbsh and fbs are identical. - error = ( - "Datasets fbsh and fbs have different descriptions. " - "This may happen in the future: Simply check that nothing significant has changed and remove this assertion." - ) - assert ( - datasets_df[datasets_df["dataset"] == "faostat_fbsh"]["fao_dataset_description"].item() - == datasets_df[datasets_df["dataset"] == "faostat_fbs"]["fao_dataset_description"].item() - ), error - # Drop row for fbsh, and rename "fbs" to "fbsc" (since this will be the name for the combined dataset). 
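The compare-and-warn pattern that follows (merge fresh values against the custom CSV with `_new`/`_old` suffixes, then warn on drift) recurs for datasets, items, and elements throughout this file. Stripped down, with toy data and `warnings.warn` standing in for the step's shared `log`:

```python
# Stripped-down version of the compare-and-warn pattern: merge freshly
# gathered values against the custom CSV, then warn when anything drifted.
import warnings

import pandas as pd

fresh = pd.DataFrame({"dataset": ["faostat_qcl"], "fao_dataset_title": ["Crops and livestock products"]})
custom = pd.DataFrame({"dataset": ["faostat_qcl"], "fao_dataset_title": ["Crops and livestock"]})

compared = pd.merge(fresh, custom, on="dataset", how="left", suffixes=("_new", "_old"))
changed = compared[compared["fao_dataset_title_new"].fillna("") != compared["fao_dataset_title_old"].fillna("")]
if len(changed) > 0:
    warnings.warn(f"{len(changed)} domains have changed titles, consider updating custom_datasets.csv.")
```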
- datasets_df = datasets_df[datasets_df["dataset"] != "faostat_fbsh"].reset_index(drop=True) - datasets_df.loc[datasets_df["dataset"] == "faostat_fbs", "dataset"] = "faostat_fbsc" - - # Add custom dataset titles. - datasets_df = pd.merge( - datasets_df, - custom_datasets, - on="dataset", - how="left", - suffixes=("_new", "_old"), - ) - - changed_titles = datasets_df[ - datasets_df["fao_dataset_title_old"].fillna("") != datasets_df["fao_dataset_title_new"].fillna("") - ] - changed_descriptions = datasets_df[ - datasets_df["fao_dataset_description_old"].fillna("") != datasets_df["fao_dataset_description_new"].fillna("") - ] - - if len(changed_titles) > 0: - log.warning(f"{len(changed_titles)} domains have changed titles, consider updating custom_datasets.csv.") - if len(changed_descriptions) > 0: - log.warning( - f"{len(changed_descriptions)} domains have changed descriptions. " f"Consider updating custom_datasets.csv." - ) - datasets_df = datasets_df.drop(columns=["fao_dataset_title_old", "fao_dataset_description_old"]).rename( - columns={ - "fao_dataset_title_new": "fao_dataset_title", - "fao_dataset_description_new": "fao_dataset_description", - } - ) - - datasets_df["owid_dataset_title"] = datasets_df["owid_dataset_title"].fillna(datasets_df["fao_dataset_title"]) - error = "Custom titles for different datasets are equal. Edit custom_datasets.csv file." - assert len(set(datasets_df["dataset"])) == len(set(datasets_df["owid_dataset_title"])), error - - # The final description will be the owid description (if there is any) followed by the original FAO description - # (if there is any). - datasets_df["owid_dataset_description"] = [ - prepare_dataset_description( - fao_description=dataset["fao_dataset_description"], - owid_description=dataset["owid_dataset_description"], - ) - for _, dataset in datasets_df.fillna("").iterrows() - ] - - # Reorder columns. - datasets_df = datasets_df[ - [ - "dataset", - "fao_dataset_title", - "owid_dataset_title", - "fao_dataset_description", - "owid_dataset_description", - ] - ] - - return datasets_df - - -def check_that_item_and_element_harmonization_does_not_trim_codes(data: pd.DataFrame, category: str) -> None: - # Ensure that the number of digits of all item and element codes is smaller than the limits defined - # at the beginning of the garden shared module, by N_CHARACTERS_ITEM_CODE and N_CHARACTERS_ELEMENT_CODE, - # respectively. - n_characters = {"element": N_CHARACTERS_ELEMENT_CODE, "item": N_CHARACTERS_ITEM_CODE} - error = ( - f"{category.capitalize()} codes found with more than N_CHARACTERS_{category.upper()}_CODE digits. " - f"This parameter is defined in garden shared module and may need to be increased. " - f"This would change how {category} codes are harmonized, increasing the length of variable names. " - f"It may have further unwanted consequences, so do it with caution." - ) - assert all([len(str(code)) <= n_characters[category] for code in data[f"{category}_code"].unique()]), error - - -def create_items_dataframe_for_domain( - table: catalog.Table, metadata: catalog.Dataset, dataset_short_name: str -) -> pd.DataFrame: - """Apply sanity checks to the items of a table in a dataset, and to the items from the metadata, harmonize all item - codes and items, and add item descriptions. - - Parameters - ---------- - table : catalog.Table - Data for a given domain. - metadata: catalog.Dataset - Metadata dataset from meadow. - dataset_short_name : str - Dataset short name (e.g. 'faostat_qcl'). 
- - Returns - ------- - items_from_data : pd.Dataframe - Item names and descriptions (customized ones and FAO original ones) for a particular domain. - - """ - df = pd.DataFrame(table).reset_index() - - # Load items from data. - items_from_data = ( - df.rename(columns={"item": "fao_item"})[["item_code", "fao_item"]].drop_duplicates().reset_index(drop=True) - ) - # Sanity check. - check_that_item_and_element_harmonization_does_not_trim_codes(data=df, category="item") - # Ensure items are well constructed and amend already known issues (defined in shared.ITEM_AMENDMENTS). - items_from_data = harmonize_items(df=items_from_data, dataset_short_name=dataset_short_name, item_col="fao_item") - - # Load items from metadata. - items_columns = { - "item_code": "item_code", - "item": "fao_item", - "description": "fao_item_description", - } - _items_df = ( - metadata[f"{dataset_short_name}_item"] - .reset_index()[list(items_columns)] - .rename(columns=items_columns) - .drop_duplicates() - .sort_values(list(items_columns.values())) - .reset_index(drop=True) - ) - _items_df = harmonize_items(df=_items_df, dataset_short_name=dataset_short_name, item_col="fao_item") - _items_df["fao_item_description"] = _items_df["fao_item_description"].astype("string") - - # Add descriptions (from metadata) to items (from data). - items_from_data = ( - pd.merge(items_from_data, _items_df, on=["item_code", "fao_item"], how="left") - .sort_values(["item_code", "fao_item"]) - .reset_index(drop=True) - ) - items_from_data["dataset"] = dataset_short_name - items_from_data["fao_item_description"] = items_from_data["fao_item_description"].fillna("") - - # Sanity checks for items in current dataset: - - # Check that in data, there is only one item per item code. - n_items_per_item_code = items_from_data.groupby("item_code")["fao_item"].transform("nunique") - error = f"Multiple items for a given item code in dataset {dataset_short_name}." - assert items_from_data[n_items_per_item_code > 1].empty, error - - # Check that all item codes in data are defined in metadata, and check that the mapping item code -> item in - # the data is the same as in the metadata (which often is not the case). - compared = pd.merge( - items_from_data[["item_code", "fao_item"]], - _items_df[["item_code", "fao_item"]], - on="item_code", - how="left", - suffixes=("_in_data", "_in_metadata"), - ) - different_items = compared[compared["fao_item_in_data"] != compared["fao_item_in_metadata"]] - missing_item_codes = set(items_from_data["item_code"]) - set(_items_df["item_code"]) - if (len(different_items) + len(missing_item_codes)) > N_ISSUES_ON_ITEMS_FOR_WARNING: - log.warning( - f"{len(missing_item_codes)} item codes in {dataset_short_name} missing in metadata. " - f"{len(different_items)} item codes in data mapping to different items in metadata." - ) - - return items_from_data - - -def clean_global_items_dataframe(items_df: pd.DataFrame, custom_items: pd.DataFrame) -> pd.DataFrame: - """Apply global sanity checks to items gathered from all datasets, and create a clean global items dataframe. - - Parameters - ---------- - items_df : pd.DataFrame - Items dataframe gathered from all domains. - custom_items : pd.DataFrame - Data from custom_items.csv file. - - Returns - ------- - items_df : pd.DataFrame - Clean global items dataframe. - - """ - items_df = items_df.copy() - - # Check that fbs and fbsh have the same contributions, remove one of them, and rename the other to fbsc. 
- check = pd.merge( - items_df[items_df["dataset"] == "faostat_fbsh"].reset_index(drop=True)[["item_code", "fao_item"]], - items_df[items_df["dataset"] == "faostat_fbs"].reset_index(drop=True)[["item_code", "fao_item"]], - how="outer", - on=["item_code"], - suffixes=("_fbsh", "_fbs"), - ) - assert (check["fao_item_fbsh"] == check["fao_item_fbs"]).all() - # Drop all rows for fbsh, and rename "fbs" to "fbsc" (since this will be the name for the combined dataset). - items_df = items_df[items_df["dataset"] != "faostat_fbsh"].reset_index(drop=True) - items_df.loc[items_df["dataset"] == "faostat_fbs", "dataset"] = "faostat_fbsc" - - # Add custom item names. - items_df = pd.merge( - items_df, - custom_items.rename(columns={"fao_item": "fao_item_check"}), - on=["dataset", "item_code"], - how="left", - suffixes=("_new", "_old"), - ) - - changed_descriptions = items_df[ - (items_df["fao_item_description_old"] != items_df["fao_item_description_new"]) - & (items_df["fao_item_description_old"].notnull()) - ] - if len(changed_descriptions) > 0: - log.warning( - f"{len(changed_descriptions)} domains have changed item descriptions. " - f"Consider updating custom_items.csv." - ) - - items_df = items_df.drop(columns="fao_item_description_old").rename( - columns={"fao_item_description_new": "fao_item_description"} - ) - - # Check that item names have not changed. - # NOTE: This condition used to raise an error if not fulfilled. Consider making it an assertion. - if not ( - items_df[items_df["fao_item_check"].notnull()]["fao_item_check"] - == items_df[items_df["fao_item_check"].notnull()]["fao_item"] - ).all(): - log.warning("Item names may have changed with respect to custom items file. Update custom items file.") - items_df = items_df.drop(columns=["fao_item_check"]) - - # Assign original FAO name to all owid items that do not have a custom name. - items_df["owid_item"] = items_df["owid_item"].fillna(items_df["fao_item"]) - - # Add custom item descriptions, and assign original FAO descriptions to items that do not have a custom description. - items_df["owid_item_description"] = items_df["owid_item_description"].fillna(items_df["fao_item_description"]) - - # Check that we have not introduced ambiguities when assigning custom item names. - n_owid_items_per_item_code = items_df.groupby(["dataset", "item_code"])["owid_item"].transform("nunique") - error = "Multiple owid items for a given item code in a dataset." - assert items_df[n_owid_items_per_item_code > 1].empty, error - - items_df = ( - items_df[ - [ - "dataset", - "item_code", - "fao_item", - "owid_item", - "fao_item_description", - "owid_item_description", - ] - ] - .sort_values(["dataset", "item_code"]) - .reset_index(drop=True) - ) - - return items_df - - -def create_elements_dataframe_for_domain( - table: catalog.Table, metadata: catalog.Dataset, dataset_short_name: str -) -> pd.DataFrame: - """Apply sanity checks to the elements and units of a table in a dataset, and to the elements and units from the - metadata, harmonize all element code, and add descriptions. - - Parameters - ---------- - table : catalog.Table - Data for a given domain. - metadata: catalog.Dataset - Additional metadata dataset from meadow. - dataset_short_name : str - Dataset short name (e.g. 'faostat_qcl'). - - Returns - ------- - elements_from_data : pd.Dataframe - Element names and descriptions and unit names and descriptions (customized ones and FAO original ones) for a - particular domain. - - """ - - df = pd.DataFrame(table).reset_index() - # Load elements from data. 
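`harmonize_items` and `harmonize_elements` come from the step's shared module, which this diff does not show. Judging from the fixed-width codes seen in these files (e.g. "00000060", "005312"), the core of the harmonization is plausibly zero-padding codes to a fixed width so they join reliably across datasets; the sketch below is a guess at that core, clearly labeled as such, and the real helpers do more (e.g. the known item amendments mentioned above).

```python
# A guess at the core of code harmonization (the real helpers live in the
# shared module): cast codes to zero-padded strings of a fixed width.
N_CHARACTERS_ITEM_CODE = 8     # width assumed from item codes like "00000060"
N_CHARACTERS_ELEMENT_CODE = 6  # width assumed from element codes like "005312"

def harmonize_code(code, n_characters: int) -> str:
    """Cast a code to a zero-padded string of fixed width (assumed behavior)."""
    code_str = str(code).strip()
    assert len(code_str) <= n_characters, f"Code {code_str} exceeds {n_characters} characters."
    return code_str.zfill(n_characters)

print(harmonize_code(60, N_CHARACTERS_ITEM_CODE))         # "00000060"
print(harmonize_code("5312", N_CHARACTERS_ELEMENT_CODE))  # "005312"
```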
- elements_from_data = ( - df.rename(columns={"element": "fao_element", "unit": "fao_unit_short_name"})[ - ["element_code", "fao_element", "fao_unit_short_name"] - ] - .drop_duplicates() - .reset_index(drop=True) - ) - # Sanity check. - check_that_item_and_element_harmonization_does_not_trim_codes(data=df, category="element") - # Ensure element_code is always a string of a fixed number of characters. - elements_from_data = harmonize_elements(df=elements_from_data, element_col="fao_element") - - # Load elements from metadata. - elements_columns = { - "element_code": "element_code", - "element": "fao_element", - "description": "fao_element_description", - } - _elements_df = ( - metadata[f"{dataset_short_name}_element"] - .reset_index()[list(elements_columns)] - .rename(columns=elements_columns) - .drop_duplicates() - .sort_values(list(elements_columns.values())) - .reset_index(drop=True) - ) - _elements_df = harmonize_elements(df=_elements_df, element_col="fao_element") - _elements_df["fao_element_description"] = _elements_df["fao_element_description"].astype("string") - - # Load units metadata. - units_columns = { - "unit_name": "fao_unit_short_name", - "description": "fao_unit", - } - _units_df = ( - metadata[f"{dataset_short_name}_unit"] - .reset_index()[list(units_columns)] - .rename(columns=units_columns) - .drop_duplicates() - .sort_values(list(units_columns.values())) - .reset_index(drop=True) - ) - _units_df["fao_unit"] = _units_df["fao_unit"].astype("string") - - # Add element descriptions (from metadata). - elements_from_data = ( - pd.merge( - elements_from_data, - _elements_df, - on=["element_code", "fao_element"], - how="left", - ) - .sort_values(["element_code", "fao_element"]) - .reset_index(drop=True) - ) - elements_from_data["dataset"] = dataset_short_name - elements_from_data["fao_element_description"] = elements_from_data["fao_element_description"].fillna("") - - # Add unit descriptions (from metadata). - elements_from_data = ( - pd.merge(elements_from_data, _units_df, on=["fao_unit_short_name"], how="left") - .sort_values(["fao_unit_short_name"]) - .reset_index(drop=True) - ) - elements_from_data["fao_unit"] = elements_from_data["fao_unit"].fillna(elements_from_data["fao_unit_short_name"]) - - # Sanity checks: - - # Check that in data, there is only one unit per element code. - n_units_per_element_code = df.groupby("element_code")["unit"].transform("nunique") - error = f"Multiple units for a given element code in dataset {dataset_short_name}." - assert df[n_units_per_element_code > 1].empty, error - - # Check that in data, there is only one element per element code. - n_elements_per_element_code = elements_from_data.groupby("element_code")["fao_element"].transform("nunique") - error = f"Multiple elements for a given element code in dataset {dataset_short_name}." - assert elements_from_data[n_elements_per_element_code > 1].empty, error - - return elements_from_data - - -def clean_global_elements_dataframe(elements_df: pd.DataFrame, custom_elements: pd.DataFrame) -> pd.DataFrame: - """Apply global sanity checks to elements and units gathered from all datasets, and create a clean global elements - and units dataframe. - - Parameters - ---------- - elements_df : pd.DataFrame - Elements and units dataframe gathered from all domains. - custom_elements : pd.DataFrame - Data from custom_elements_and_units.csv file. - - Returns - ------- - elements_df : pd.DataFrame - Clean global elements and units dataframe.
- - """ - elements_df = elements_df.copy() - - # Check that all elements of fbsh are in fbs (although fbs may contain additional elements). - assert set(elements_df[elements_df["dataset"] == "faostat_fbsh"]["element_code"]) <= set( - elements_df[elements_df["dataset"] == "faostat_fbs"]["element_code"] - ) - # Drop all rows for fbsh, and rename "fbs" to "fbsc" (since this will be the name for the combined dataset). - elements_df = elements_df[elements_df["dataset"] != "faostat_fbsh"].reset_index(drop=True) - elements_df.loc[elements_df["dataset"] == "faostat_fbs", "dataset"] = "faostat_fbsc" - - elements_df = pd.merge( - elements_df, - custom_elements.rename( - columns={ - "fao_element": "fao_element_check", - "fao_unit_short_name": "fao_unit_short_name_check", - } - ), - on=["dataset", "element_code"], - how="left", - suffixes=("_new", "_old"), - ) - - changed_units = elements_df[ - (elements_df["fao_unit_new"] != elements_df["fao_unit_old"]) & (elements_df["fao_unit_old"].notnull()) - ] - if len(changed_units) > 0: - log.warning(f"{len(changed_units)} domains have changed units, consider updating custom_elements.csv.") - - changed_descriptions = elements_df[ - (elements_df["fao_element_description_new"] != elements_df["fao_element_description_old"]) - & (elements_df["fao_element_description_old"].notnull()) - ] - if len(changed_descriptions) > 0: - log.warning( - f"{len(changed_descriptions)} domains have changed element descriptions. " - f"Consider updating custom_elements.csv." - ) - - elements_df = elements_df.drop(columns=["fao_unit_old", "fao_element_description_old"]).rename( - columns={ - "fao_element_description_new": "fao_element_description", - "fao_unit_new": "fao_unit", - } - ) - - error = "Element names have changed with respect to custom elements file. Update custom elements file." - assert ( - elements_df[elements_df["fao_element_check"].notnull()]["fao_element_check"] - == elements_df[elements_df["fao_element_check"].notnull()]["fao_element"] - ).all(), error - elements_df = elements_df.drop(columns=["fao_element_check"]) - - error = "Unit names have changed with respect to custom elements file. Update custom elements file." - assert ( - elements_df[elements_df["fao_unit_short_name_check"].notnull()]["fao_unit_short_name_check"] - == elements_df[elements_df["fao_unit_short_name_check"].notnull()]["fao_unit_short_name"] - ).all(), error - elements_df = elements_df.drop(columns=["fao_unit_short_name_check"]) - - # Assign original FAO names where there is no custom one. - elements_df["owid_element"] = elements_df["owid_element"].fillna(elements_df["fao_element"]) - elements_df["owid_unit"] = elements_df["owid_unit"].fillna(elements_df["fao_unit"]) - elements_df["owid_element_description"] = elements_df["owid_element_description"].fillna( - elements_df["fao_element_description"] - ) - elements_df["owid_unit_short_name"] = elements_df["owid_unit_short_name"].fillna(elements_df["fao_unit_short_name"]) - - # Assume variables were not per capita, if was_per_capita is not informed, and make boolean. - elements_df["was_per_capita"] = elements_df["was_per_capita"].fillna("0").replace({"0": False, "1": True}) - - # Idem for variables to make per capita. - elements_df["make_per_capita"] = elements_df["make_per_capita"].fillna("0").replace({"0": False, "1": True}) - - # Check that we have not introduced ambiguities when assigning custom element or unit names. 
- n_owid_elements_per_element_code = elements_df.groupby(["dataset", "element_code"])["owid_element"].transform( - "nunique" - ) - error = "Multiple owid elements for a given element code in a dataset." - assert elements_df[n_owid_elements_per_element_code > 1].empty, error - - # Check that we have not introduced ambiguities when assigning custom unit names. - n_owid_units_per_element_code = elements_df.groupby(["dataset", "element_code"])["owid_unit"].transform("nunique") - error = "Multiple owid units for a given element code in a dataset." - assert elements_df[n_owid_units_per_element_code > 1].empty, error - - # NOTE: We assert that there is one element for each element code. But the opposite may not be true: there can be - # multiple element codes with the same element. And idem for items. - - return elements_df - - -def check_countries_to_exclude_or_harmonize( - countries_in_data: pd.DataFrame, excluded_countries: List[str], countries_harmonization: Dict[str, str] -) -> None: - # Check that all excluded countries are in the data. - unknown_excluded_countries = set(excluded_countries) - set(countries_in_data["fao_country"]) - error = ( - f"Unknown excluded countries (to be removed from faostat.excluded_countries.json): {unknown_excluded_countries}" - ) - assert len(unknown_excluded_countries) == 0, error - - # Check that all countries to be harmonized are in the data. - unknown_countries_to_harmonize = set(countries_harmonization) - set(countries_in_data["fao_country"]) - error = f"Unknown countries to be harmonized (to be removed or edited in faostat.countries.json): {unknown_countries_to_harmonize}" - assert len(unknown_countries_to_harmonize) == 0, error - - # Check that all countries in the data are either to be excluded or to be harmonized. - unknown_countries = set(countries_in_data["fao_country"]) - set(excluded_countries) - set(countries_harmonization) - error = f"Unknown countries in the data (to be added either to faostat.excluded_countries.json or to faostat.countries.json): {unknown_countries}" - assert len(unknown_countries) == 0, error - - -def clean_global_countries_dataframe( - countries_in_data: pd.DataFrame, - country_groups: Dict[str, List[str]], - countries_harmonization: Dict[str, str], - excluded_countries: List[str], -) -> pd.DataFrame: - """Clean dataframe of countries gathered from the data of the individual domains, harmonize country names (and - country names of members of regions), and create a clean global countries dataframe. - - Parameters - ---------- - countries_in_data : pd.DataFrame - Countries gathered from the data of all domains. - country_groups : dict - Countries and their members, gathered from the data. - countries_harmonization : dict - Mapping of country names (from FAO names to OWID names). - excluded_countries : list - Country names to be ignored. - - Returns - ------- - countries_df : pd.DataFrame - Clean global countries dataframe. - - """ - countries_df = countries_in_data.copy() - - # Sanity checks. - check_countries_to_exclude_or_harmonize( - countries_in_data=countries_in_data, - excluded_countries=excluded_countries, - countries_harmonization=countries_harmonization, - ) - - # Harmonize country groups and members. - country_groups_harmonized = { - countries_harmonization[group]: sorted([countries_harmonization[member] for member in country_groups[group]]) - for group in country_groups - if group in countries_harmonization - } - - # Harmonize country names.
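`dataframes.map_series` comes from `owid.datautils`; with the flags used below it behaves roughly like a plain `Series.map` that leaves unmapped values as NaN (the extra keyword arguments only control warnings). An approximate pandas-only equivalent, with made-up values, for intuition:

```python
import pandas as pd

fao_country = pd.Series(["Czechia", "Bolivia (Plurinational State of)", "Narnia"])
mapping = {"Czechia": "Czechia", "Bolivia (Plurinational State of)": "Bolivia"}
harmonized = fao_country.map(mapping)  # unmapped values ("Narnia") become NaN
print(harmonized.tolist())  # ['Czechia', 'Bolivia', nan]
```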
- countries_df["country"] = dataframes.map_series( - series=countries_df["fao_country"], - mapping=countries_harmonization, - warn_on_missing_mappings=False, - warn_on_unused_mappings=False, - make_unmapped_values_nan=True, - show_full_warning=True, - ) - - # Add country members to countries dataframe. - countries_df["members"] = dataframes.map_series( - series=countries_df["country"], - mapping=country_groups_harmonized, - make_unmapped_values_nan=True, - ) - - # Feather does not support object types, so convert column of lists to column of strings. - countries_df["members"] = [ - json.dumps(members) if isinstance(members, list) else members for members in countries_df["members"] - ] - - return countries_df - - -def create_table(df: pd.DataFrame, short_name: str, index_cols: List[str]) -> catalog.Table: - """Create a table with optimal format and basic metadata, out of a dataframe. - - Parameters - ---------- - df : pd.DataFrame - Input dataframe. - short_name : str - Short name to add in the metadata of the new table. - index_cols : list - Columns to use as indexes of the new table. - - Returns - ------- - table : catalog.Table - New table. - - """ - table = catalog.Table(df).copy() - - # Optimize column dtypes before storing feather file, and ensure codes are categories (instead of ints). - table = optimize_table_dtypes(table) - - # Set indexes and other necessary metadata. - table = table.set_index(index_cols, verify_integrity=True) - - table.metadata.short_name = short_name - table.metadata.primary_key = index_cols - - return cast(catalog.Table, table) - - -def check_that_flag_definitions_in_dataset_agree_with_those_in_flags_ranking( - metadata: catalog.Dataset, -) -> None: - """Check that the definition of flags in the additional metadata for current dataset agree with the ones we have - manually written down in our flags ranking (raise error otherwise). - - Parameters - ---------- - metadata : catalog.Dataset - Additional metadata dataset (that must contain one table for current dataset). - - """ - for table_name in metadata.table_names: - if "flag" in table_name: - flag_df = metadata[table_name].reset_index() - comparison = pd.merge(FLAGS_RANKING, flag_df, on="flag", how="inner") - error_message = ( - f"Flag definitions in file {table_name} are different to those in our flags ranking. " - f"Redefine shared.FLAGS_RANKING." - ) - assert (comparison["description"] == comparison["flags"]).all(), error_message - - -def check_that_all_flags_in_dataset_are_in_ranking(table: catalog.Table, metadata_for_flags: catalog.Table) -> None: - """Check that all flags found in current dataset are defined in our flags ranking (raise error otherwise). - - Parameters - ---------- - table : pd.DataFrame - Data table for current dataset. - metadata_for_flags : catalog.Table - Flags for current dataset, as defined in dataset of additional metadata. - - """ - if not set(table["flag"]) < set(FLAGS_RANKING["flag"]): - missing_flags = set(table["flag"]) - set(FLAGS_RANKING["flag"]) - flags_data = pd.DataFrame(metadata_for_flags).reset_index() - if set(missing_flags) < set(flags_data["flag"]): - message = "Missing flags. Copy the following lines to FLAGS_RANKING (and put them in the right order):" - for i, j in pd.DataFrame(metadata_for_flags).loc[list(missing_flags)].iterrows(): - message += f"\n{(i, j['flags'])}," - log.warning(message) - else: - log.warning( - f"Missing flags. {missing_flags} are not defined in additional metadata. 
Get definition from " - f"https://www.fao.org/faostat/en/#definitions" - ) - raise AssertionError("Flags in dataset not found in FLAGS_RANKING. Manually add those flags.") - - -def check_definitions_in_value_amendments( - table: catalog.Table, dataset_short_name: str, value_amendments: pd.DataFrame -) -> None: - """Check definitions in the value_amendments.csv file. - - This function will assert that: - * All spurious values defined in the file are still found in the data. - * There are no unexpected spurious values in the data. - - Spurious values are only searched for in the "value" column if it has "category" dtype. - See the regular expression below, which is used to search for spurious values. - - Parameters - ---------- - table : catalog.Table - Data table for current dataset. - dataset_short_name : str - Dataset short name (e.g. 'faostat_qcl'). - value_amendments : pd.DataFrame - Data from value_amendments.csv file. - """ - # Regular expression used to search for spurious values in the "value" column. - regex_spurious_values = "<|,|N" - - # Select value amendments for the specified dataset. - _value_amendments = value_amendments[value_amendments["dataset"] == dataset_short_name] - if not _value_amendments.empty: - # Check that spurious values defined in value_amendments.csv are indeed found in the data. - expected_spurious_values_not_found = set(_value_amendments["spurious_value"]) - set(table["value"]) - error = ( - f"Expected spurious values {expected_spurious_values_not_found} not found in {dataset_short_name}. " - f"Remove them from value_amendments.csv." - ) - assert len(expected_spurious_values_not_found) == 0, error - - # Search for additional spurious values (only if data values are of "category" type). - if table["value"].dtype == "category": - # Find any possible spurious values in the data. - spurious_values = ( - table[table["value"].astype(str).str.contains(regex_spurious_values, regex=True)]["value"].unique().tolist() - ) - # Find if any of those were not accounted for already in value_amendments. - new_spurious_values = set(spurious_values) - set(_value_amendments["spurious_value"]) - error = f"Unexpected spurious values found in {dataset_short_name}. Add the following values to value_amendments.csv: {new_spurious_values}" - assert len(new_spurious_values) == 0, error - - -def process_metadata( - paths: PathFinder, - metadata: catalog.Dataset, - custom_datasets: pd.DataFrame, - custom_elements: pd.DataFrame, - custom_items: pd.DataFrame, - countries_harmonization: Dict[str, str], - excluded_countries: List[str], - value_amendments: pd.DataFrame, -) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """Apply various sanity checks, gather data (about dataset, item, element and unit names and descriptions) from all - domains, compare with data from its corresponding metadata file, and create clean dataframes of metadata about - dataset, elements, units, items, and countries. - - Parameters - ---------- - paths : PathFinder - Paths and naming conventions for the current data step. - metadata : catalog.Dataset - Additional metadata dataset from meadow. - custom_datasets : pd.DataFrame - Data from custom_datasets.csv file. - custom_elements : pd.DataFrame - Data from custom_elements_and_units.csv file. - custom_items : pd.DataFrame - Data from custom_items.csv file. - countries_harmonization : dict - Data from faostat.countries.json file. - excluded_countries : list - Data from faostat.excluded_countries.json file. - value_amendments : pd.DataFrame - Data from value_amendments.csv file. - - Returns - ------- - countries_df : pd.DataFrame - Clean dataframe of global countries.
- datasets_df : pd.DataFrame - Clean dataframe of global dataset names and descriptions. - elements_df : pd.DataFrame - Clean dataframe of global element and unit names and descriptions. - items_df : pd.DataFrame - Clean dataframe of global item names and descriptions. - - """ - # Check if flags definitions need to be updated. - check_that_flag_definitions_in_dataset_agree_with_those_in_flags_ranking(metadata) - - # List all FAOSTAT dataset short names. - dataset_short_names = sorted( - set([NAMESPACE + "_" + table_name.split("_")[1] for table_name in metadata.table_names]) - ) - - # Initialise dataframe of dataset descriptions, items, and element-units. - # We cannot remove "dataset" from the items and elements dataframes, because it can happen that, for a given - # item code, the item name is slightly different in two different datasets. - datasets_df = pd.DataFrame({"dataset": [], "fao_dataset_title": [], "fao_dataset_description": []}) - items_df = pd.DataFrame({"dataset": [], "item_code": [], "fao_item": [], "fao_item_description": []}) - elements_df = pd.DataFrame( - { - "dataset": [], - "element_code": [], - "fao_element": [], - "fao_element_description": [], - "fao_unit": [], - "fao_unit_short_name": [], - } - ) - - # Initialise list of all countries in all datasets, and all country groups. - countries_in_data = pd.DataFrame({"area_code": [], "fao_country": []}).astype({"area_code": "Int64"}) - country_groups_in_data: Dict[str, List[str]] = {} - - # Gather all variables from the latest version of each meadow dataset. - for dataset_short_name in tqdm(dataset_short_names, file=sys.stdout): - # Load latest meadow table for current dataset. - ds_latest: catalog.Dataset = paths.load_dependency(dataset_short_name) - table = ds_latest[dataset_short_name] - df = pd.DataFrame(table.reset_index()).rename( - columns={ - "area": "fao_country", - "recipient_country": "fao_country", - "recipient_country_code": "area_code", - } - )[["area_code", "fao_country"]] - - df["area_code"] = df["area_code"].astype("Int64") - - # Temporary patch. - if dataset_short_name == "faostat_wcad": - error = ( - "Dataset faostat_wcad had 'French Guiana' for area code 69 (unlike other datasets, which had " - "'French Guyana'). But this may no longer be the case, so this patch in the code can be removed." - ) - assert "French Guiana" in df["fao_country"].unique(), error - df["fao_country"] = dataframes.map_series(df["fao_country"], mapping={"French Guiana": "French Guyana"}) - - if f"{dataset_short_name}_flag" in metadata.table_names: - check_that_all_flags_in_dataset_are_in_ranking( - table=table, metadata_for_flags=metadata[f"{dataset_short_name}_flag"] - ) - - # Check if spurious values defined in value_amendments.csv are still in the data, - # and whether there are new spurious values to be amended. - check_definitions_in_value_amendments( - table=table, dataset_short_name=dataset_short_name, value_amendments=value_amendments - ) - - # Gather dataset descriptions, items, and element-units for current domain. - datasets_from_data = create_dataset_descriptions_dataframe_for_domain( - table, dataset_short_name=dataset_short_name - ) - - items_from_data = create_items_dataframe_for_domain( - table=table, metadata=metadata, dataset_short_name=dataset_short_name - ) - - elements_from_data = create_elements_dataframe_for_domain( - table=table, metadata=metadata, dataset_short_name=dataset_short_name - ) - - # Add countries in this dataset to the list of all countries.
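A side note on `check_definitions_in_value_amendments`, called just above: the regex `"<|,|N"` flags values like `"<0.5"`, `"1,000"` or `"N"`, which can only occur when the value column is categorical rather than numeric. A small standalone illustration of the scan:

```python
import pandas as pd

regex_spurious_values = "<|,|N"
values = pd.Series(["12.3", "<0.5", "1,000", "N"], dtype="category")
# Cast to str before the regex search, exactly as the check above does.
spurious = values[values.astype(str).str.contains(regex_spurious_values, regex=True)]
print(spurious.tolist())  # ['<0.5', '1,000', 'N']
```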
- countries_in_data = pd.concat([countries_in_data, df]).drop_duplicates() - - # Get country groups in this dataset. - area_group_table_name = f"{dataset_short_name}_area_group" - if area_group_table_name in metadata: - country_groups = ( - metadata[f"{dataset_short_name}_area_group"] - .reset_index() - .drop_duplicates(subset=["country_group", "country"]) - .groupby("country_group") - .agg({"country": list}) - .to_dict()["country"] - ) - # Add new groups to country_groups_in_data; if they are already there, ensure they contain all members. - for group in list(country_groups): - # NOTE: Check membership against the column values (not the Series index). - if group not in countries_in_data["fao_country"].values: - # This should not happen, but skip just in case. - continue - if group in list(country_groups_in_data): - all_members = set(country_groups_in_data[group]) | set(country_groups[group]) - country_groups_in_data[group] = list(all_members) - else: - country_groups_in_data[group] = country_groups[group] - - # Add dataset descriptions, items, and element-units from current dataset to global dataframes. - datasets_df = dataframes.concatenate([datasets_df, datasets_from_data], ignore_index=True) - items_df = dataframes.concatenate([items_df, items_from_data], ignore_index=True) - elements_df = dataframes.concatenate([elements_df, elements_from_data], ignore_index=True) - - datasets_df = clean_global_dataset_descriptions_dataframe(datasets_df=datasets_df, custom_datasets=custom_datasets) - items_df = clean_global_items_dataframe(items_df=items_df, custom_items=custom_items) - - elements_df = clean_global_elements_dataframe(elements_df=elements_df, custom_elements=custom_elements) - countries_df = clean_global_countries_dataframe( - countries_in_data=countries_in_data, - country_groups=country_groups_in_data, - countries_harmonization=countries_harmonization, - excluded_countries=excluded_countries, - ) - - return countries_df, datasets_df, elements_df, items_df - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Fetch the dataset short name from dest_dir. - dataset_short_name = Path(dest_dir).name - - # Define path to current step file. - current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py") - - # Get paths and naming conventions for current data step. - paths = PathFinder(current_step_file.as_posix()) - - # Path to file with custom dataset titles and descriptions. - custom_datasets_file = paths.directory / "custom_datasets.csv" - # Path to file with custom item names and descriptions. - custom_items_file = paths.directory / "custom_items.csv" - # Path to file with custom element and unit names and descriptions. - custom_elements_and_units_file = paths.directory / "custom_elements_and_units.csv" - # Path to file with mapping from FAO names to OWID harmonized country names. - countries_file = paths.directory / f"{NAMESPACE}.countries.json" - # Path to file with list of excluded countries and regions. - excluded_countries_file = paths.directory / f"{NAMESPACE}.excluded_countries.json" - # Path to file with spurious values and amendments. - value_amendments_file = paths.directory / "value_amendments.csv" - - # Load metadata from meadow. - metadata: catalog.Dataset = paths.load_dependency(FAOSTAT_METADATA_SHORT_NAME) - - # Load custom dataset names, items, element-unit names, and value amendments.
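The group-membership bookkeeping a few lines above, in isolation: when a region reappears in a new domain, its member list is united with the members already collected. A toy example of the same logic:

```python
country_groups_in_data = {"Africa": ["Algeria", "Angola"]}
country_groups = {"Africa": ["Algeria", "Benin"], "Asia": ["India"]}
for group, members in country_groups.items():
    if group in country_groups_in_data:
        # Union with previously seen members, keeping a stable order.
        country_groups_in_data[group] = sorted(set(country_groups_in_data[group]) | set(members))
    else:
        country_groups_in_data[group] = members
print(country_groups_in_data)
# {'Africa': ['Algeria', 'Angola', 'Benin'], 'Asia': ['India']}
```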
- custom_datasets = pd.read_csv(custom_datasets_file, dtype=str) - custom_elements = pd.read_csv(custom_elements_and_units_file, dtype=str) - custom_items = pd.read_csv(custom_items_file, dtype=str) - value_amendments = pd.read_csv(value_amendments_file, dtype=str) - - # Load country mapping and excluded countries files. - countries_harmonization = io.load_json(countries_file) - excluded_countries = io.load_json(excluded_countries_file) - - # - # Process data. - # - countries_df, datasets_df, elements_df, items_df = process_metadata( - paths=paths, - metadata=metadata, - custom_datasets=custom_datasets, - custom_elements=custom_elements, - custom_items=custom_items, - countries_harmonization=countries_harmonization, - excluded_countries=excluded_countries, - value_amendments=value_amendments, - ) - - # - # Save outputs. - # - # Initialize new garden dataset. - dataset_garden = catalog.Dataset.create_empty(dest_dir) - dataset_garden.short_name = FAOSTAT_METADATA_SHORT_NAME - # Keep original dataset's metadata from meadow. - dataset_garden.metadata = deepcopy(metadata.metadata) - # Create new dataset in garden. - dataset_garden.save() - - # Create new garden dataset with all dataset descriptions, items, element-units, and countries. - datasets_table = create_table(df=datasets_df, short_name="datasets", index_cols=["dataset"]) - items_table = create_table(df=items_df, short_name="items", index_cols=["dataset", "item_code"]) - elements_table = create_table(df=elements_df, short_name="elements", index_cols=["dataset", "element_code"]) - countries_table = create_table(df=countries_df, short_name="countries", index_cols=["area_code"]) - amendments_table = catalog.Table(value_amendments, short_name="amendments").set_index( - ["dataset", "spurious_value"], verify_integrity=True - ) - - # Add tables to dataset. - dataset_garden.add(datasets_table, repack=False) - dataset_garden.add(items_table, repack=False) - dataset_garden.add(elements_table, repack=False) - dataset_garden.add(countries_table, repack=False) - dataset_garden.add(amendments_table, repack=False) diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_qcl.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_qcl.py deleted file mode 100644 index 3fde76098ce..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_qcl.py +++ /dev/null @@ -1,532 +0,0 @@ -"""FAOSTAT garden step for faostat_qcl dataset.""" - -from pathlib import Path - -import numpy as np -import pandas as pd -from owid import catalog -from owid.datautils import dataframes -from shared import ( - ADDED_TITLE_TO_WIDE_TABLE, - CURRENT_DIR, - FLAG_MULTIPLE_FLAGS, - NAMESPACE, - REGIONS_TO_ADD, - add_per_capita_variables, - add_regions, - clean_data, - handle_anomalies, - harmonize_elements, - harmonize_items, - log, - parse_amendments_table, - prepare_long_table, - prepare_wide_table, -) - -from etl.helpers import PathFinder, create_dataset - -# Item and item code for 'Meat, poultry'. -ITEM_POULTRY = "Meat, poultry" -ITEM_CODE_MEAT_POULTRY = "00001808" -# Item code for 'Meat, chicken'. -ITEM_CODE_MEAT_CHICKEN = "00001058" -# List item codes to sum as part of "Meat, total" (avoiding double-counting items). 
-MEAT_TOTAL_ITEM_CODES = [ - "00000977", # 'Meat, lamb and mutton' (previously 'Meat, lamb and mutton') - "00001035", # 'Meat of pig with the bone, fresh or chilled' (previously 'Meat, pig') - "00001097", # 'Horse meat, fresh or chilled' (previously 'Meat, horse') - "00001108", # 'Meat of asses, fresh or chilled' (previously 'Meat, ass') - "00001111", # 'Meat of mules, fresh or chilled' (previously 'Meat, mule') - "00001127", # 'Meat of camels, fresh or chilled' (previously 'Meat, camel') - "00001141", # 'Meat of rabbits and hares, fresh or chilled' (previously 'Meat, rabbit') - "00001806", # 'Meat, beef and buffalo' (previously 'Meat, beef and buffalo') - "00001807", # 'Meat, sheep and goat' (previously 'Meat, sheep and goat') - ITEM_CODE_MEAT_POULTRY, # 'Meat, poultry' (previously 'Meat, poultry') -] - -# List of element codes for "Producing or slaughtered animals" (they have different items assigned). -SLAUGHTERED_ANIMALS_ELEMENT_CODES = ["005320", "005321"] -# For the resulting dataframe, we arbitrarily assign the first of those codes. -SLAUGHTERED_ANIMALS_ELEMENT_CODE = SLAUGHTERED_ANIMALS_ELEMENT_CODES[0] -# Item code for 'Meat, total'. -TOTAL_MEAT_ITEM_CODE = "00001765" -# OWID item name for total meat. -TOTAL_MEAT_ITEM = "Meat, total" -# OWID element name, unit name, and unit short name for number of slaughtered animals. -SLAUGHTERED_ANIMALS_ELEMENT = "Producing or slaughtered animals" -SLAUGHTERED_ANIMALS_UNIT = "animals" -SLAUGHTERED_ANIMALS_UNIT_SHORT_NAME = "animals" -# Text to be added to the dataset description (after the description of anomalies). -SLAUGHTERED_ANIMALS_ADDITIONAL_DESCRIPTION = ( - "\n\nFAO does not provide data for the total number of slaughtered animals " - "to produce meat. We calculate this metric by adding up the number of slaughtered animals of all meat groups. " - "However, when data for slaughtered poultry (which usually outnumbers other meat groups) is not provided, we do " - "not calculate the total (to avoid spurious dips in the data)." -) - - -def fill_slaughtered_poultry_with_slaughtered_chicken(data: pd.DataFrame) -> pd.DataFrame: - """Fill missing data on slaughtered poultry with slaughtered chicken. - - Most of poultry meat comes from chicken. However, sometimes chicken is informed, but the rest of poultry isn't, - which causes poultry data to be empty (e.g. Spain in 2018). - Therefore, we fill missing data for poultry with chicken data. - """ - data = data.copy() - - # Prepare a slice of the data to extract additional data fields. - additional_fields = ( - data[(data["item_code"] == ITEM_CODE_MEAT_POULTRY) & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT)][ - ["fao_item", "item_description", "fao_unit_short_name"] - ] - .drop_duplicates() - .iloc[0] - ) - - # Select data for the number of slaughtered chicken. - chickens_slaughtered = data[ - (data["item_code"] == ITEM_CODE_MEAT_CHICKEN) - & (data["element"] == SLAUGHTERED_ANIMALS_ELEMENT) - & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT) - ] - - # Select data for the number of slaughtered poultry. - poultry_slaughtered = data[ - (data["item_code"] == ITEM_CODE_MEAT_POULTRY) - & (data["element"] == SLAUGHTERED_ANIMALS_ELEMENT) - & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT) - ][["country", "year", "value"]] - - # Combine poultry and chicken data. 
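The merge that follows relies on pandas' `indicator=True`, which labels every row `left_only`, `right_only` or `both`; the assertions then turn those labels into invariants: poultry never appears without chicken, and chicken rows without poultry are the gaps to fill. Distilled, with toy numbers:

```python
import pandas as pd

chicken = pd.DataFrame({"country": ["Spain", "France"], "year": [2018, 2018], "value": [600, 800]})
poultry = pd.DataFrame({"country": ["France"], "year": [2018], "value": [900]})
compared = pd.merge(chicken, poultry, on=["country", "year"], how="outer",
                    indicator=True, suffixes=("_chicken", "_poultry"))
assert compared[compared["_merge"] == "right_only"].empty  # no poultry without chicken
gaps = compared[compared["_merge"] == "left_only"]          # chicken reported, poultry missing
print(gaps[["country", "year", "value_chicken"]])           # Spain, 2018, 600
```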
- compared = pd.merge( - chickens_slaughtered, - poultry_slaughtered, - on=["country", "year"], - how="outer", - indicator=True, - suffixes=("_chicken", "_poultry"), - ) - - error = "There are cases where slaughtered poultry is informed, but slaughtered chicken is not." - assert compared[compared["_merge"] == "right_only"].empty, error - - error = "There are rows where there is more slaughtered poultry than slaughtered chicken." - assert compared[compared["value_poultry"] < compared["value_chicken"]].empty, error - - # Prepare a replacement dataframe for missing data on slaughtered poultry. - poultry_slaughtered_missing_data = ( - compared[compared["_merge"] == "left_only"] - .assign( - **{ - "item_code": ITEM_CODE_MEAT_POULTRY, - "item": ITEM_POULTRY, - "fao_item": additional_fields["fao_item"], - "fao_unit_short_name": additional_fields["fao_unit_short_name"], - "item_description": additional_fields["item_description"], - } - ) - .drop(columns=["_merge", "value_poultry"]) - .rename(columns={"value_chicken": "value"}) - ) - - log.info( - f"Filling {len(poultry_slaughtered_missing_data)} rows of missing data for slaughtered poultry with " - "slaughtered chicken." - ) - # Add chicken data to the full dataframe. - data = pd.concat([data, poultry_slaughtered_missing_data], ignore_index=True) - - return data - - -def add_slaughtered_animals_to_meat_total(data: pd.DataFrame) -> pd.DataFrame: - """Add number of slaughtered animals to meat total. - - There is no FAOSTAT data on slaughtered animals for total meat. We construct this data by aggregating that element - for the items specified in MEAT_TOTAL_ITEM_CODES (which corresponds to all meat items after removing redundancies). - - If the number of slaughtered poultry is not informed, we remove the number of total animals slaughtered - (since poultry are by far the most commonly slaughtered animals). - - Parameters - ---------- - data : pd.DataFrame - Processed data where meat total does not have number of slaughtered animals. - - Returns - ------- - combined_data : pd.DataFrame - Data after adding the new variable. - - """ - data = data.copy() - - error = f"Some items required to get the aggregate '{TOTAL_MEAT_ITEM}' are missing in data." - assert set(MEAT_TOTAL_ITEM_CODES) < set(data["item_code"]), error - assert SLAUGHTERED_ANIMALS_ELEMENT in data["element"].unique() - assert SLAUGHTERED_ANIMALS_UNIT in data["unit"].unique() - - # Check that, indeed, the number of slaughtered animals for total meat is not given in the original data. - assert data[ - (data["item"] == TOTAL_MEAT_ITEM) - & (data["element"] == SLAUGHTERED_ANIMALS_ELEMENT) - & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT) - ].empty - - # There are two element codes for the same element (they have different items assigned). - error = "Element codes for 'Producing or slaughtered animals' may have changed." - assert ( - data[(data["element"] == SLAUGHTERED_ANIMALS_ELEMENT) & ~(data["element_code"].str.contains("pc"))][ - "element_code" - ] - .unique() - .tolist() - == SLAUGHTERED_ANIMALS_ELEMENT_CODES - ), error - - # Check that the items assigned to each of the two element codes do not overlap. - error = "Element codes for 'Producing or slaughtered animals' have overlapping items."
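The overlap check below boils down to a set intersection over the item lists of the two element codes; in miniature:

```python
items_for_different_elements = {
    "005320": ["00000977", "00001035"],  # toy item codes for illustration
    "005321": ["00001808"],
}
# Require that the two element codes share no item codes.
assert set.intersection(*[set(x) for x in items_for_different_elements.values()]) == set()
```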
- items_for_different_elements = ( - data[(data["element_code"].isin(SLAUGHTERED_ANIMALS_ELEMENT_CODES))] - .groupby("element_code", observed=True) - .agg({"item_code": lambda x: list(x.unique())}) - .to_dict()["item_code"] - ) - assert set.intersection(*[set(x) for x in items_for_different_elements.values()]) == set(), error - - # Confirm the item code for total meat. - error = f"Item code for '{TOTAL_MEAT_ITEM}' may have changed." - assert list(data[data["item"] == TOTAL_MEAT_ITEM]["item_code"].unique()) == [TOTAL_MEAT_ITEM_CODE], error - - # Select the subset of data to aggregate. - data_to_aggregate = ( - data[ - (data["element"] == SLAUGHTERED_ANIMALS_ELEMENT) - & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT) - & (data["item_code"].isin(MEAT_TOTAL_ITEM_CODES)) - ] - .dropna(subset="value") - .reset_index(drop=True) - ) - - # Create a dataframe with the total number of animals used for meat. - animals = dataframes.groupby_agg( - data_to_aggregate, - groupby_columns=[ - "area_code", - "fao_country", - "fao_element", - "country", - "year", - "population_with_data", - ], - aggregations={ - "value": "sum", - "flag": lambda x: x if len(x) == 1 else FLAG_MULTIPLE_FLAGS, - }, - ).reset_index() - - # Get element description for selected element code (so far it's always been an empty string). - _slaughtered_animals_element_description = data[data["element_code"].isin(SLAUGHTERED_ANIMALS_ELEMENT_CODES)][ - "element_description" - ].unique() - assert len(_slaughtered_animals_element_description) == 1 - slaughtered_animals_element_description = _slaughtered_animals_element_description[0] - - # Get item description for selected item code. - _total_meat_item_description = data[data["item_code"] == TOTAL_MEAT_ITEM_CODE]["item_description"].unique() - assert len(_total_meat_item_description) == 1 - total_meat_item_description = _total_meat_item_description[0] - - # Get FAO item name for selected item code. - _total_meat_fao_item = data[data["item_code"] == TOTAL_MEAT_ITEM_CODE]["fao_item"].unique() - assert len(_total_meat_fao_item) == 1 - total_meat_fao_item = _total_meat_fao_item[0] - - # Get FAO unit for selected item code. - _total_meat_fao_unit = data[data["item_code"] == TOTAL_MEAT_ITEM_CODE]["fao_unit_short_name"].unique() - assert len(_total_meat_fao_unit) == 1 - total_meat_fao_unit = _total_meat_fao_unit[0] - - # Manually include the rest of columns. - animals["element"] = SLAUGHTERED_ANIMALS_ELEMENT - animals["element_description"] = slaughtered_animals_element_description - animals["unit"] = SLAUGHTERED_ANIMALS_UNIT - animals["unit_short_name"] = SLAUGHTERED_ANIMALS_UNIT_SHORT_NAME - # We arbitrarily assign the first element code (out of the two available) to the resulting variables. - animals["element_code"] = SLAUGHTERED_ANIMALS_ELEMENT_CODE - animals["item_code"] = TOTAL_MEAT_ITEM_CODE - animals["item"] = TOTAL_MEAT_ITEM - animals["item_description"] = total_meat_item_description - animals["fao_item"] = total_meat_fao_item - animals["fao_unit_short_name"] = total_meat_fao_unit - - log.info(f"Adding {len(animals)} rows with the total number of slaughtered animals for meat.") - - # For each year, we are adding up the number of animals slaughtered to compute the total, regardless of how many - # of those animals have data. - # However, some years do not have data for a particular animal; this is acceptable except if the animal is poultry, - # which is the most commonly slaughtered animal. 
Therefore, if data is missing for poultry, the total will show a - # significant (and spurious) decrease (this happens, e.g. in Estonia in 2019). - # Therefore, we remove data points for which poultry is not informed. - - # Find country-years for which we have the number of poultry slaughtered. - country_years_with_poultry_data = ( - data[ - (data["item_code"] == ITEM_CODE_MEAT_POULTRY) - & (data["element"] == SLAUGHTERED_ANIMALS_ELEMENT) - & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT) - ] - .dropna(subset="value")[["country", "year"]] - .drop_duplicates() - .reset_index(drop=True) - ) - - # Add a column to inform of all those rows for which we don't have poultry data. - compared = pd.merge(animals, country_years_with_poultry_data, how="outer", indicator=True) - - assert compared[compared["_merge"] == "right_only"].empty, "Expected 'left_only' or 'both', not 'right_only'." - - log.info( - f"Removed {len(compared[compared['_merge'] == 'left_only'])} rows for which we don't have the number of " - "poultry slaughtered." - ) - - animals_corrected = compared[compared["_merge"] == "both"].reset_index(drop=True).drop(columns=["_merge"]) - - # Check that we are not missing any column. - assert set(data.columns) == set(animals_corrected.columns) - - # Add animals data to the original dataframe. - combined_data = ( - pd.concat([data, animals_corrected], ignore_index=True) - .reset_index(drop=True) - .astype( - { - "element_code": "category", - "item_code": "category", - "fao_item": "category", - "fao_unit_short_name": "category", - "flag": "category", - "item": "category", - "item_description": "category", - "element": "category", - "unit": "category", - "element_description": "category", - "unit_short_name": "category", - } - ) - ) - - return combined_data - - -def add_yield_to_aggregate_regions(data: pd.DataFrame) -> pd.DataFrame: - """Add yield (production / area harvested) to data for aggregate regions (i.e. continents and income groups). - - This data is not included in aggregate regions because it cannot be aggregated by simply summing the contribution of - the individual countries. Instead, we need to aggregate production, then aggregate area harvested, and then divide - one by the other. - - Note: Here, we divide production (the sum of the production from a list of countries in a region) by area (the sum - of the area from a list of countries in a region) to obtain yield. But the list of countries that contributed to - production may not be the same as the list of countries that contributed to area. We could impose that they must be - the same, but this causes the resulting series to have gaps. Additionally, it seems that FAO also constructs yield - in the same way. This was checked by comparing the resulting yield curves for 'Almonds' for all aggregate regions - with their corresponding *(FAO) regions; they were identical. - - Parameters - ---------- - data : pd.DataFrame - Data that does not contain yield for aggregate regions. - - Returns - ------- - combined_data : pd.DataFrame - Data after adding yield. - - """ - # Element code of production, area harvested, and yield. - production_element_code = "005510" - area_element_code = "005312" - yield_element_code = "005419" - - # Check that indeed regions do not contain any data for yield. - assert data[(data["country"].isin(REGIONS_TO_ADD)) & (data["element_code"] == yield_element_code)].empty - - # Gather all fields that should stay the same. 
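The reason this function exists, in two lines of arithmetic: a region's yield must be sum(production) / sum(area), which is generally not the mean (or sum) of the country-level yields. With toy numbers:

```python
production = {"Spain": 10.0, "France": 30.0}  # tonnes (toy values)
area = {"Spain": 5.0, "France": 10.0}         # hectares (toy values)
region_yield = sum(production.values()) / sum(area.values())
print(round(region_yield, 2))  # 2.67 t/ha, whereas the mean of country yields (2.0, 3.0) is 2.5
```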
- additional_fields = data[data["element_code"] == yield_element_code][ - [ - "element", - "element_description", - "fao_element", - "fao_unit_short_name", - "unit", - "unit_short_name", - ] - ].drop_duplicates() - assert len(additional_fields) == 1 - - # Create a dataframe of production of regions. - data_production = data[(data["country"].isin(REGIONS_TO_ADD)) & (data["element_code"] == production_element_code)] - - # Create a dataframe of area of regions. - data_area = data[(data["country"].isin(REGIONS_TO_ADD)) & (data["element_code"] == area_element_code)] - - # Merge the two dataframes and create the new yield variable. - merge_cols = [ - "area_code", - "year", - "item_code", - "fao_country", - "fao_item", - "item", - "item_description", - "country", - ] - combined = pd.merge( - data_production, - data_area[merge_cols + ["flag", "value"]], - on=merge_cols, - how="inner", - suffixes=("_production", "_area"), - ) - - combined["value"] = combined["value_production"] / combined["value_area"] - - # Replace infinities (caused by dividing by zero) by nan. - combined["value"] = combined["value"].replace(np.inf, np.nan) - - # If both fields have the same flag, use that, otherwise use the flag of multiple flags. - combined["flag"] = [ - flag_production if flag_production == flag_area else FLAG_MULTIPLE_FLAGS - for flag_production, flag_area in zip(combined["flag_production"], combined["flag_area"]) - ] - - # Drop rows of nan and unnecessary columns. - combined = combined.drop(columns=["flag_production", "flag_area", "value_production", "value_area"]) - combined = combined.dropna(subset="value").reset_index(drop=True) - - # Replace fields appropriately. - combined["element_code"] = yield_element_code - # Replace all other fields from the corresponding fields in yield (tonnes per hectare) variable. - for field in additional_fields.columns: - combined[field] = additional_fields[field].item() - assert set(data.columns) == set(combined.columns) - combined_data = ( - pd.concat([data, combined], ignore_index=True) - .reset_index(drop=True) - .astype( - { - "element_code": "category", - "fao_element": "category", - "fao_unit_short_name": "category", - "flag": "category", - "element": "category", - "unit": "category", - "element_description": "category", - "unit_short_name": "category", - } - ) - ) - - return combined_data - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Fetch the dataset short name from dest_dir. - dataset_short_name = Path(dest_dir).name - - # Define path to current step file. - current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py") - - # Get paths and naming conventions for current data step. - paths = PathFinder(current_step_file.as_posix()) - - # Load latest meadow dataset and keep its metadata. - ds_meadow: catalog.Dataset = paths.load_dependency(dataset_short_name) - # Load main table from dataset. - tb_meadow = ds_meadow[dataset_short_name] - data = pd.DataFrame(tb_meadow).reset_index() - - # Load dataset of FAOSTAT metadata. - metadata: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_metadata") - - # Load dataset, items, element-units, and countries metadata. 
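For orientation: later in this run() step, `prepare_long_table` and `prepare_wide_table` (both in `shared.py`, not shown in this hunk) produce the two output shapes of every FAOSTAT garden step. Roughly speaking, the wide table is a pivot of the long one; this is a sketch of the shape transformation, not the actual column-naming logic:

```python
import pandas as pd

long = pd.DataFrame({
    "country": ["Spain", "Spain"],
    "year": [2018, 2018],
    "item": ["Almonds", "Almonds"],
    "element": ["Production", "Area harvested"],
    "value": [339033.0, 657763.0],  # toy values
})
# One row per (country, year); one column per item-element combination.
wide = long.pivot(index=["country", "year"], columns=["item", "element"], values="value")
print(wide)
```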
- dataset_metadata = pd.DataFrame(metadata["datasets"]).loc[dataset_short_name].to_dict() - items_metadata = pd.DataFrame(metadata["items"]).reset_index() - items_metadata = items_metadata[items_metadata["dataset"] == dataset_short_name].reset_index(drop=True) - elements_metadata = pd.DataFrame(metadata["elements"]).reset_index() - elements_metadata = elements_metadata[elements_metadata["dataset"] == dataset_short_name].reset_index(drop=True) - countries_metadata = pd.DataFrame(metadata["countries"]).reset_index() - amendments = parse_amendments_table(amendments=metadata["amendments"], dataset_short_name=dataset_short_name) - - # - # Process data. - # - # Harmonize items and elements, and clean data. - data = harmonize_items(df=data, dataset_short_name=dataset_short_name) - data = harmonize_elements(df=data) - - # Prepare data. - data = clean_data( - data=data, - items_metadata=items_metadata, - elements_metadata=elements_metadata, - countries_metadata=countries_metadata, - amendments=amendments, - ) - - # Fill missing data for slaughtered poultry with slaughtered chicken. - data = fill_slaughtered_poultry_with_slaughtered_chicken(data=data) - - # Include number of slaughtered animals in total meat (which is missing). - data = add_slaughtered_animals_to_meat_total(data=data) - - # Add data for aggregate regions. - data = add_regions(data=data, elements_metadata=elements_metadata) - - # Add per-capita variables. - data = add_per_capita_variables(data=data, elements_metadata=elements_metadata) - - # Add yield (production per area) to aggregate regions. - data = add_yield_to_aggregate_regions(data) - - # Handle detected anomalies in the data. - data, anomaly_descriptions = handle_anomalies(dataset_short_name=dataset_short_name, data=data) - - # Create a long table (with item code and element code as part of the index). - data_table_long = prepare_long_table(data=data) - - # Create a wide table (with only country and year as index). - data_table_wide = prepare_wide_table(data=data) - - # - # Save outputs. - # - # Update tables metadata. - data_table_long.metadata.short_name = dataset_short_name - data_table_long.metadata.title = dataset_metadata["owid_dataset_title"] - data_table_wide.metadata.short_name = f"{dataset_short_name}_flat" - data_table_wide.metadata.title = dataset_metadata["owid_dataset_title"] + ADDED_TITLE_TO_WIDE_TABLE - - # Initialise new garden dataset. - ds_garden = create_dataset( - dest_dir=dest_dir, tables=[data_table_long, data_table_wide], default_metadata=ds_meadow.metadata - ) - - # Update dataset metadata and add description of anomalies (if any) to the dataset description. - ds_garden.metadata.description = ( - dataset_metadata["owid_dataset_description"] + anomaly_descriptions + SLAUGHTERED_ANIMALS_ADDITIONAL_DESCRIPTION - ) - ds_garden.metadata.title = dataset_metadata["owid_dataset_title"] - - # Update the main source's metadata description (which will be shown in charts). - ds_garden.metadata.sources[0].description = ds_garden.metadata.description - - # Create garden dataset. 
- ds_garden.save() diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_qi.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_qi.py deleted file mode 100644 index 8c271f07bc2..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_qi.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_qi dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_qv.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_qv.py deleted file mode 100644 index f564688376e..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_qv.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_qv dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_rfb.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_rfb.py deleted file mode 100644 index 68669b4cbd4..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_rfb.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_rfb dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_rfn.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_rfn.py deleted file mode 100644 index 4ebfe341728..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_rfn.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_rfn dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_rl.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_rl.py deleted file mode 100644 index f43cbe31912..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_rl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_rl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_rp.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_rp.py deleted file mode 100644 index f15e468d920..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_rp.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_rp dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_rt.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_rt.py deleted file mode 100644 index 8b7a9257526..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_rt.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_rt dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_scl.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_scl.py deleted file mode 100644 index 00d0d6eb376..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_scl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_scl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_sdgb.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_sdgb.py deleted file mode 100644 index 67932fa7aaf..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_sdgb.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_sdgb dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_tcl.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_tcl.py deleted file mode 100644 index 2df286d3992..00000000000 --- 
a/etl/steps/archive/garden/faostat/2023-02-22/faostat_tcl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_tcl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_ti.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_ti.py deleted file mode 100644 index 682199d79d9..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_ti.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_ti dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/faostat/2023-02-22/faostat_wcad.py b/etl/steps/archive/garden/faostat/2023-02-22/faostat_wcad.py deleted file mode 100644 index 77ba310b738..00000000000 --- a/etl/steps/archive/garden/faostat/2023-02-22/faostat_wcad.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT garden step for faostat_wcad dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/garden/gcp/2022-09-29/global_carbon_budget_additional.countries.json b/etl/steps/archive/garden/gcp/2022-09-29/global_carbon_budget_additional.countries.json deleted file mode 100644 index 2efa5c8a073..00000000000 --- a/etl/steps/archive/garden/gcp/2022-09-29/global_carbon_budget_additional.countries.json +++ /dev/null @@ -1,232 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Africa": "Africa (GCP)", - "Albania": "Albania", - "Algeria": "Algeria", - "Andorra": "Andorra", - "Angola": "Angola", - "Anguilla": "Anguilla", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Asia": "Asia (GCP)", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Bonaire, Saint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "British Virgin Islands": "British Virgin Islands", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cape Verde": "Cape Verde", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo": "Congo", - "Cook Islands": "Cook Islands", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cura\u00e7ao": "Curacao", - "Cyprus": "Cyprus", - "Czech Republic": "Czechia", - "C\u00f4te d'Ivoire": "Cote d'Ivoire", - "Democratic Republic of the Congo": "Democratic Republic of Congo", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Ethiopia": "Ethiopia", - "Europe": "Europe (GCP)", - "Faeroe Islands": "Faroe Islands", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Guiana": "French Guiana", - "French Polynesia": "French Polynesia", - "Gabon": "Gabon", - "Gambia": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Greece": 
"Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hong Kong": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Kosovo": "Kosovo", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Laos": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Liechtenstein": "Liechtenstein", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macao": "Macao", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mayotte": "Mayotte", - "Mexico": "Mexico", - "Micronesia (Federated States of)": "Micronesia (country)", - "Moldova": "Moldova", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "North America": "North America (GCP)", - "North Korea": "North Korea", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "Occupied Palestinian Territory": "Palestine", - "Oceania": "Oceania (GCP)", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Qatar": "Qatar", - "Romania": "Romania", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "R\u00e9union": "Reunion", - "Saint Helena": "Saint Helena", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South America": "South America (GCP)", - "South Korea": "South Korea", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Swaziland": "Eswatini", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syria": "Syria", - "Taiwan": "Taiwan", - "Tajikistan": "Tajikistan", - "Tanzania": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and 
Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Turks and Caicos Islands": "Turks and Caicos Islands", - "Tuvalu": "Tuvalu", - "USA": "United States", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela": "Venezuela", - "Viet Nam": "Vietnam", - "Wallis and Futuna Islands": "Wallis and Futuna", - "World": "World", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "Central America": "Central America (GCP)", - "Middle East": "Middle East (GCP)", - "Non-OECD": "Non-OECD (GCP)", - "OECD": "OECD (GCP)" -} diff --git a/etl/steps/archive/garden/gcp/2022-09-29/global_carbon_budget_additional.meta.yml b/etl/steps/archive/garden/gcp/2022-09-29/global_carbon_budget_additional.meta.yml deleted file mode 100644 index d2f2462dfac..00000000000 --- a/etl/steps/archive/garden/gcp/2022-09-29/global_carbon_budget_additional.meta.yml +++ /dev/null @@ -1,78 +0,0 @@ -dataset: - title: Global Carbon Budget (Global Carbon Project, v2021b) - sources: - - name: Global Carbon Project (2021) - published_by: Global Carbon Budget - Global Carbon Project (2021) - description: | - The Global Carbon Budget dataset is available [here](https://www.icos-cp.eu/science-and-impact/global-carbon-budget/2021) and [here](https://doi.org/10.5281/zenodo.5569235). - - Variables include each country, region and World Bank income group's share of the global population; production-based (territorial); and consumption-based (trade-adjusted) carbon dioxide emissions. - - This was calculated by Our World in Data based on CO₂ figures produced by the Global Carbon Project. This is given as production (territorial) emissions in addition to trade-adjusted consumption-based emissions. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. - - Note that consumption-based emissions are not available for all countries; although those without complete data are a small fraction (3%) of the global total. Each country's share of world emissions are based on the share of the global total minus categories termed 'bunkers' and 'statistical differences' (which include cross-boundary emissions such as international travel and shipping. - - Calculation of each country's share of the global population is calculated using our population dataset, based on [different sources](https://ourworldindata.org/population-sources)). - - Data on global emissions has been converted by Our World in Data from tonnes of carbon to tonnes of carbon dioxide (CO₂) using a conversion factor of 3.664. - - The full reference for the Carbon Budget 2021 is: - [Global Carbon Budget 2021](https://doi.org/10.5194/essd-14-1917-2022), by Pierre Friedlingstein, Matthew W. Jones, Michael O'Sullivan, Robbie M. Andrew, Dorothee C. E. Bakker, Judith Hauck, Corinne Le Quéré, Glen P. Peters, Wouter Peters, Julia Pongratz, Stephen Sitch, Josep G. Canadell, Philippe Ciais, Rob B. Jackson, Simone R. Alin, Peter Anthoni, Nicholas R. Bates, Meike Becker, Nicolas Bellouin, Laurent Bopp, Thi Tuyet Trang Chau, Frédéric Chevallier, Louise P. Chini, Margot Cronin, Kim I. 
Currie, Bertrand Decharme, Laique M. Djeutchouang, Xinyu Dou, Wiley Evans, Richard A. Feely, Liang Feng, Thomas Gasser, Dennis Gilfillan, Thanos Gkritzalis, Giacomo Grassi, Luke Gregor, Nicolas Gruber, Özgür Gürses, Ian Harris, Richard A. Houghton, George C. Hurtt, Yosuke Iida, Tatiana Ilyina, Ingrid T. Luijkx, Atul Jain, Steve D. Jones, Etsushi Kato, Daniel Kennedy, Kees Klein Goldewijk, Jürgen Knauer, Jan Ivar Korsbakken, Arne Körtzinger, Peter Landschützer, Siv K. Lauvset, Nathalie Lefèvre, Sebastian Lienert, Junjie Liu, Gregg Marland, Patrick C. McGuire, Joe R. Melton, David R. Munro, Julia E. M. S. Nabel, Shin-Ichiro Nakaoka, Yosuke Niwa, Tsuneo Ono, Denis Pierrot, Benjamin Poulter, Gregor Rehder, Laure Resplandy, Eddy Robertson, Christian Rödenbeck, Thais M. Rosan, Jörg Schwinger, Clemens Schwingshackl, Roland Séférian, Adrienne J. Sutton, Colm Sweeney, Toste Tanhua, Pieter P. Tans, Hanqin Tian, Bronte Tilbrook, Francesco Tubiello, Guido R. van der Werf, Nicolas Vuichard, Chisato Wada, Rik Wanninkhof, Andrew J. Watson, David Willis, Andrew J. Wiltshire, Wenping Yuan, Chao Yue, Xu Yue, Sönke Zaehle and Jiye Zeng (2022), Earth System Science Data, 14, 1917–2005, 2022, DOI: 10.5194/essd-14-1917-2022. -tables: - global_carbon_budget_additional: - variables: - consumption_emissions: - title: "Consumption-based CO₂ emissions" - unit: tonnes - short_unit: t - consumption_emissions_as_share_of_global: - title: "Consumption-based CO₂ emissions (% of global total)" - unit: '%' - short_unit: '%' - consumption_emissions_per_capita: - title: "Consumption-based CO₂ per capita" - unit: tonnes of CO₂ per capita - short_unit: t - global_bunker_emissions: - title: Global bunker emissions - unit: tonnes - short_unit: t - global_fossil_emissions: - title: "CO₂ emissions from fossil fuels and industry" - unit: tonnes - short_unit: t - description: "Global CO₂ emissions from fossil fuels and industry (which includes coal, oil, gas, cement and flaring)." - global_fossil_and_land_use_change_emissions: - title: "CO₂ emissions from fossil fuels and land use change" - unit: tonnes - short_unit: t - description: "Global CO₂ emissions from fossil fuels and industry (which includes coal, oil, gas, cement and flaring) plus land-use change." - global_land_use_change_emissions: - title: "CO₂ emissions from land use change" - unit: tonnes - short_unit: t - description: "Global CO₂ emissions from land use change." - global_population: - title: Global population - unit: persons - short_unit: persons - population: - title: Population - unit: persons - short_unit: persons - population_as_share_of_global: - title: Share of global population - unit: '%' - short_unit: '%' - production_emissions: - title: "Production-based CO₂ emissions" - unit: tonnes - short_unit: t - production_emissions_as_share_of_global: - title: Production-based CO₂ emissions (% of global total) - unit: "%" - short_unit: "%" - production_emissions_per_capita: - title: "Production-based CO₂ per capita" - unit: tonnes of CO₂ per capita - short_unit: t diff --git a/etl/steps/archive/garden/gcp/2022-09-29/global_carbon_budget_additional.py b/etl/steps/archive/garden/gcp/2022-09-29/global_carbon_budget_additional.py deleted file mode 100644 index b33ba08379d..00000000000 --- a/etl/steps/archive/garden/gcp/2022-09-29/global_carbon_budget_additional.py +++ /dev/null @@ -1,279 +0,0 @@ -"""This step creates a dataset that has additional variables that are currently not included in the Global Carbon -Budget (GCB) dataset (which was created in importers). 
- -In the future (next time GCB dataset is updated and moved to ETL), a newer version of this step should create the -entire GCB dataset. - -""" - -from typing import Tuple, cast - -import pandas as pd -from owid.catalog import Dataset, Table -from shared import CURRENT_DIR - -from etl.data_helpers import geo -from etl.helpers import PathFinder - -# Regions and income groups to create (by aggregating), following OWID definitions. -REGIONS = [ - "Africa", - "Asia", - "Europe", - "European Union (27)", - "North America", - "Oceania", - "South America", - "Low-income countries", - "Upper-middle-income countries", - "Lower-middle-income countries", - "High-income countries", -] -# Variables to aggregate and type of aggregation to apply. -AGGREGATES = {"production_emissions": "sum", "consumption_emissions": "sum"} - -# Naming conventions. -N = PathFinder(str(CURRENT_DIR / "global_carbon_budget_additional")) - - -def prepare_national_and_global_data( - production_df: pd.DataFrame, consumption_df: pd.DataFrame, historical_df: pd.DataFrame -) -> Tuple[pd.DataFrame, pd.DataFrame]: - """Separate and prepare national and global GCB data. - - Parameters - ---------- - production_df : pd.DataFrame - Production-based emissions (from the national data file). - consumption_df : pd.DataFrame - Consumption-based emissions (from the national data file). - historical_df : pd.DataFrame - Historical budget emissions (from the global data file). - - Returns - ------- - national_df : pd.DataFrame - Prepared national emissions data. - global_df : pd.DataFrame - Prepared global emissions data. - - """ - production_df = production_df.copy() - consumption_df = consumption_df.copy() - historical_df = historical_df.copy() - - # In the original data, Bunkers was included in the national data file, as another country. - # But I suppose it should be considered as another kind of global emission. - # In fact, bunker emissions should coincide for production and consumption emissions. - global_bunkers_emissions = ( - production_df[production_df["country"] == "Bunkers"][["year", "production_emissions"]] - .reset_index(drop=True) - .rename(columns={"production_emissions": "global_bunker_emissions"}) - ) - - # Check that we get exactly the same array of bunker emissions from the consumption emissions dataframe. - check = ( - consumption_df[consumption_df["country"] == "Bunkers"][["year", "consumption_emissions"]] - .reset_index(drop=True) - .rename(columns={"consumption_emissions": "global_bunker_emissions"}) - ) - error = "Bunker emissions were expected to coincide in production and consumption emissions dataframes." - assert global_bunkers_emissions.equals(check), error - - # Now remove rows for bunker emissions from both production and consumption emissions. - production_df = production_df[production_df["country"] != "Bunkers"].reset_index(drop=True) - consumption_df = consumption_df[consumption_df["country"] != "Bunkers"].reset_index(drop=True) - - # Combine production and consumption dataframes. - national_df = pd.merge(production_df, consumption_df, how="outer", on=["country", "year"]) - - # Check that, for the World, production emissions coincide with consumption emissions. - error = "Production and consumption emissions for the world were expected to be identical." 
- assert ( - production_df[production_df["country"] == "World"] - .reset_index(drop=True)["production_emissions"] - .equals(consumption_df[consumption_df["country"] == "World"].reset_index(drop=True)["consumption_emissions"]) - ), error - - # Check that production emissions for the World coincide with global fossil emissions (from the historical dataframe). - check = pd.merge( - production_df[production_df["country"] == "World"][["year", "production_emissions"]].reset_index(drop=True), - historical_df[["year", "global_fossil_emissions"]], - how="inner", - on="year", - ) - error = "Production emissions for the world were expected to coincide with global fossil emissions." - assert check[check["production_emissions"] != check["global_fossil_emissions"]].empty, error - - # Given that, we can ignore production and consumption emissions for the world, and take it from - # the global fossil emissions (which has data since 1750 instead of 1959). - complete_world_emissions = historical_df[["country", "year", "global_fossil_emissions"]].rename( - columns={"global_fossil_emissions": "production_emissions"} - ) - # Create an additional column of global consumption emissions (which, as we just checked, should be identical to - # production emissions). - complete_world_emissions["consumption_emissions"] = complete_world_emissions["production_emissions"] - national_df = pd.concat( - [national_df[national_df["country"] != "World"].reset_index(drop=True), complete_world_emissions], - ignore_index=True, - ) - - # Add bunker emissions to the rest of global emissions. - global_df = pd.merge(historical_df, global_bunkers_emissions, how="outer", on="year") - - # Add variable of total emissions including fossil fuels and land use change. - global_df["global_fossil_and_land_use_change_emissions"] = ( - global_df["global_fossil_emissions"] + global_df["global_land_use_change_emissions"] - ) - - # Add global population. - global_df = geo.add_population_to_dataframe(df=global_df, population_col="global_population") - - return cast(pd.DataFrame, national_df), cast(pd.DataFrame, global_df) - - -def add_per_capita_variables(national_df: pd.DataFrame) -> pd.DataFrame: - """Add per capita variables to national emissions data. - - Parameters - ---------- - national_df : pd.DataFrame - National emissions data, after selecting variables and preparing them. - - Returns - ------- - national_df : pd.DataFrame - National emissions data, after adding per capita variables. - - """ - national_df = national_df.copy() - - # Add population to each country and year. - national_df = geo.add_population_to_dataframe(df=national_df, warn_on_missing_countries=False) - - # Create per capita variables. - national_df["consumption_emissions_per_capita"] = national_df["consumption_emissions"] / national_df["population"] - national_df["production_emissions_per_capita"] = national_df["production_emissions"] / national_df["population"] - - return national_df - - -def add_share_variables(combined_df: pd.DataFrame) -> pd.DataFrame: - """Add "share variables" (e.g. national emissions as share of global emissions). - - Parameters - ---------- - combined_df : pd.DataFrame - Combined dataframe of production and consumption based emissions (national data). - - Returns - ------- - combined_df : pd.DataFrame - Combined dataframe after adding share variables. - - """ - combined_df = combined_df.copy() - - # Create variables of production and consumption emissions as a share of global emissions. 
- combined_df["production_emissions_as_share_of_global"] = ( - combined_df["production_emissions"] / combined_df["global_fossil_emissions"] * 100 - ) - combined_df["consumption_emissions_as_share_of_global"] = ( - combined_df["consumption_emissions"] / combined_df["global_fossil_emissions"] * 100 - ) - - # Create variable of population as a share of global population. - combined_df["population_as_share_of_global"] = combined_df["population"] / combined_df["global_population"] * 100 - - # Sanity checks. - error = "Production emissions as a share of global emissions should be 100% for 'World'." - assert combined_df[ - (combined_df["country"] == "World") & (combined_df["production_emissions_as_share_of_global"] != 100) - ].empty, error - error = "Consumption emissions as a share of global emissions should be 100% for 'World'." - assert combined_df[ - (combined_df["country"] == "World") & (combined_df["consumption_emissions_as_share_of_global"] != 100) - ].empty, error - error = "Population as a share of global population should be 100% for 'World'." - assert combined_df[ - (combined_df["country"] == "World") & (combined_df["population_as_share_of_global"].fillna(100) != 100) - ].empty, error - - return combined_df - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load dataset from meadow. - ds_meadow = N.meadow_dataset - # Load required tables with additional variables. - consumption_tb = ds_meadow["consumption_emissions"] - production_tb = ds_meadow["production_emissions"] - historical_tb = ds_meadow["historical_emissions"] - # Create a convenient dataframe for each table. - production_df = pd.DataFrame(production_tb).reset_index() - consumption_df = pd.DataFrame(consumption_tb).reset_index() - historical_df = pd.DataFrame(historical_tb).reset_index() - - # - # Process data. - # - # Separate national data (at the country level, although it includes "World") and global data. - national_df, global_df = prepare_national_and_global_data( - production_df=production_df, consumption_df=consumption_df, historical_df=historical_df - ) - - # Harmonize country names. - national_df = ( - geo.harmonize_countries( - df=national_df, - countries_file=N.country_mapping_path, - warn_on_missing_countries=False, - make_missing_countries_nan=True, - ) - .dropna(subset="country") - .reset_index(drop=True) - ) - - # Add contributions from regions. - for region in REGIONS: - national_df = geo.add_region_aggregates( - df=national_df, - region=region, - countries_that_must_have_data=[], - num_allowed_nans_per_year=None, - frac_allowed_nans_per_year=0.9, - aggregations=AGGREGATES, - ) - - # Add per capita variables. - national_df = add_per_capita_variables(national_df=national_df) - - # Combine national and global variables. - combined_df = pd.merge(national_df, global_df.drop(columns="country"), how="inner", on="year") - - # Add production and consumption emissions as a share of global emissions. - combined_df = add_share_variables(combined_df=combined_df) - - # Set an index and sort conveniently. - combined_df = combined_df.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # - # Save outputs. - # - # Create a new garden dataset and use metadata from meadow dataset. - ds_garden = Dataset.create_empty(dest_dir) - ds_garden.metadata = ds_meadow.metadata - ds_garden.metadata.short_name = N.short_name - # Update metadata using the information in the yaml file. 
- ds_garden.metadata.update_from_yaml(N.metadata_path, if_source_exists="replace") - - # Create a table with the combined data. - tb_garden = Table(combined_df) - # Use metadata from yaml file. - tb_garden.update_metadata_from_yaml(N.metadata_path, "global_carbon_budget_additional") - # Add combined table to garden dataset and save dataset. - ds_garden.add(tb_garden) - ds_garden.save() diff --git a/etl/steps/archive/garden/gcp/2022-09-29/shared.py b/etl/steps/archive/garden/gcp/2022-09-29/shared.py deleted file mode 100644 index 9e4c1438f7b..00000000000 --- a/etl/steps/archive/garden/gcp/2022-09-29/shared.py +++ /dev/null @@ -1,4 +0,0 @@ -from pathlib import Path - -# Naming conventions. -CURRENT_DIR = Path(__file__).parent diff --git a/etl/steps/archive/garden/gcp/2022-11-11/global_carbon_budget.meta.yml b/etl/steps/archive/garden/gcp/2022-11-11/global_carbon_budget.meta.yml deleted file mode 100644 index 4668cd55152..00000000000 --- a/etl/steps/archive/garden/gcp/2022-11-11/global_carbon_budget.meta.yml +++ /dev/null @@ -1,488 +0,0 @@ -dataset: - namespace: gcp - short_name: global_carbon_budget - title: Global Carbon Budget (Global Carbon Project, 2022) - description: | - The Global Carbon Budget dataset is available [here](https://globalcarbonbudget.org/archive/). - - Full reference: - Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Gregor, L., Hauck, J., Le Quéré, C., Luijkx, I. T., Olsen, A., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Alkama, R., Arneth, A., Arora, V. K., Bates, N. R., Becker, M., Bellouin, N., Bittig, H. C., Bopp, L., Chevallier, F., Chini, L. P., Cronin, M., Evans, W., Falk, S., Feely, R. A., Gasser, T., Gehlen, M., Gkritzalis, T., Gloege, L., Grassi, G., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jain, A. K., Jersild, A., Kadono, K., Kato, E., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Landschützer, P., Lefèvre, N., Lindsay, K., Liu, J., Liu, Z., Marland, G., Mayot, N., McGrath, M. J., Metzl, N., Monacci, N. M., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K., Ono, T., Palmer, P. I., Pan, N., Pierrot, D., Pocock, K., Poulter, B., Resplandy, L., Robertson, E., Rödenbeck, C., Rodriguez, C., Rosan, T. M., Schwinger, J., Séférian, R., Shutler, J. D., Skjelvan, I., Steinhoff, T., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tanhua, T., Tans, P. P., Tian, X., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., Walker, A. P., Wanninkhof, R., Whitehead, C., Willstrand Wranne, A., Wright, R., Yuan, W., Yue, C., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2022, Earth Syst. Sci. Data, 14, 4811-4900, https://doi.org/10.5194/essd-14-4811-2022, 2022. - - Variables include each country, region and World Bank income group's share of the global population; production-based (territorial); and consumption-based (trade-adjusted) carbon dioxide emissions. - - This was calculated by Our World in Data based on CO₂ figures produced by the Global Carbon Project. This is given as production (territorial) emissions in addition to trade-adjusted consumption-based emissions. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). 
If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. - - Note that consumption-based emissions are not available for all countries, although those without complete data are a small fraction (3%) of the global total. Each country's share of world emissions is based on the share of the global total minus categories termed 'bunkers' and 'statistical differences' (which include cross-boundary emissions such as international travel and shipping). - - Each country's share of the global population is calculated using our population dataset, based on [different sources](https://ourworldindata.org/population-sources). - - Data on global emissions has been converted by Our World in Data from tonnes of carbon to tonnes of carbon dioxide (CO₂) using a conversion factor of 3.664. - - Our World in Data have renamed bunker fuels as "International transport" for improved clarity, which includes emissions from international aviation and shipping. - Emissions from the Kuwaiti oil fires in 1991 have been included as part of Kuwait's emissions for that year. - - licenses: - - name: Creative Commons Attribution 4.0 International - url: https://zenodo.org/record/7215364 - version: '2022-11-11' - sources: - - name: Our World in Data based on the Global Carbon Project (2022) - published_by: "Our World in Data based on Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Gregor, L., Hauck, J., Le Quéré, C., Luijkx, I. T., Olsen, A., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Alkama, R., Arneth, A., Arora, V. K., Bates, N. R., Becker, M., Bellouin, N., Bittig, H. C., Bopp, L., Chevallier, F., Chini, L. P., Cronin, M., Evans, W., Falk, S., Feely, R. A., Gasser, T., Gehlen, M., Gkritzalis, T., Gloege, L., Grassi, G., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jain, A. K., Jersild, A., Kadono, K., Kato, E., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Landschützer, P., Lefèvre, N., Lindsay, K., Liu, J., Liu, Z., Marland, G., Mayot, N., McGrath, M. J., Metzl, N., Monacci, N. M., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K., Ono, T., Palmer, P. I., Pan, N., Pierrot, D., Pocock, K., Poulter, B., Resplandy, L., Robertson, E., Rödenbeck, C., Rodriguez, C., Rosan, T. M., Schwinger, J., Séférian, R., Shutler, J. D., Skjelvan, I., Steinhoff, T., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tanhua, T., Tans, P. P., Tian, X., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., Walker, A. P., Wanninkhof, R., Whitehead, C., Willstrand Wranne, A., Wright, R., Yuan, W., Yue, C., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2022, Earth Syst. Sci. Data, 14, 4811-4900, https://doi.org/10.5194/essd-14-4811-2022, 2022." - url: https://www.globalcarbonproject.org/ - date_accessed: 2022-11-11 - -tables: - global_carbon_budget: - variables: - consumption_emissions: - title: "Annual consumption-based CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). 
If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. Data has been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664." - consumption_emissions_as_share_of_global: - title: "Share of global annual CO₂ consumption-based emissions" - unit: "%" - short_unit: "%" - description: "Annual consumption-based emissions of carbon dioxide (CO₂), measured as a percentage of global consumption-based emissions of CO₂ in the same year. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide." - consumption_emissions_per_capita: - title: "Annual consumption-based CO₂ emissions (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes per person. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. Consumption-based CO₂ emissions have been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664." - consumption_emissions_per_gdp: - title: "Annual consumption-based CO₂ emissions per GDP (kg per international-$)" - unit: "kilograms per international-$" - short_unit: "kg/$" - description: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in kilograms per dollar of GDP (2011 international-$). Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. Consumption-based CO₂ emissions have been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664." - cumulative_consumption_emissions: - title: "Cumulative CO₂ consumption-based emissions" - unit: "tonnes" - short_unit: "t" - description: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of data availability, measured in tonnes. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. Consumption-based CO₂ emissions have been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664." 
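The conversion factor of 3.664 quoted throughout these descriptions is the molar-mass ratio of CO₂ to carbon (44.01 / 12.011 ≈ 3.664). As a minimal sketch of the conversion, with an illustrative function name rather than anything taken from the step itself:

import pandas as pd

# Molar-mass ratio of CO2 to C: 44.01 / 12.011 ~= 3.664.
CARBON_TO_CO2 = 3.664

def tonnes_of_carbon_to_tonnes_of_co2(emissions: pd.Series) -> pd.Series:
    """Convert a series of emissions from tonnes of carbon to tonnes of CO2."""
    return emissions * CARBON_TO_CO2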
- cumulative_consumption_emissions_as_share_of_global: - title: "Share of global cumulative CO₂ consumption-based emissions" - unit: "%" - short_unit: "%" - description: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of data availability, measured as a percentage of global cumulative consumption-based emissions of CO₂ since the first year of data availability. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide." - cumulative_emissions_from_cement: - title: "Cumulative CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from cement since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_cement_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from cement" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from cement since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from cement since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from cement has been calculated by Our World in Data using global CO₂ emissions from cement provided in the Global Carbon Budget dataset." - cumulative_emissions_from_coal: - title: "Cumulative CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from coal since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_coal_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from coal" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from coal since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from coal since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from coal has been calculated by Our World in Data using global CO₂ emissions from coal provided in the Global Carbon Budget dataset." - cumulative_emissions_from_flaring: - title: "Cumulative CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from flaring since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." 
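Every "cumulative" variable in this table is the same construction: a per-country running sum of the corresponding annual series, starting from the first year with data. A hedged sketch of that pattern in pandas (toy data; the dataframe and column names are assumptions, not the step's actual code):

import pandas as pd

df = pd.DataFrame(
    {
        "country": ["Sweden", "Sweden", "Sweden", "Norway", "Norway"],
        "year": [2000, 2001, 2002, 2000, 2001],
        "emissions_from_gas": [1.0, 2.0, 3.0, 10.0, 20.0],
    }
)
# Sort chronologically within each country, then accumulate.
df = df.sort_values(["country", "year"])
df["cumulative_emissions_from_gas"] = df.groupby("country")["emissions_from_gas"].cumsum()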
- cumulative_emissions_from_flaring_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from flaring" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from flaring since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from flaring since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from flaring has been calculated by Our World in Data using global CO₂ emissions from flaring provided in the Global Carbon Budget dataset." - cumulative_emissions_from_gas: - title: "Cumulative CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from gas since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_gas_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from gas" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from gas since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from gas since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from gas has been calculated by Our World in Data using global CO₂ emissions from gas provided in the Global Carbon Budget dataset." - cumulative_emissions_from_land_use_change: - title: "Cumulative CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from land-use change since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_land_use_change_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from land-use change" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from land-use change since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from land-use change since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset." - cumulative_emissions_from_oil: - title: "Cumulative CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from oil since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." 
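Likewise, every "share of global" variable divides a national series (annual or cumulative) by the corresponding global series for the same year. A small sketch under assumed column names:

import pandas as pd

df_national = pd.DataFrame({"country": ["Sweden"], "year": [2000], "emissions_total": [50.0]})
df_global = pd.DataFrame({"year": [2000], "global_emissions_total": [25_000.0]})

# Attach the global total for the same year, then express the national value as a percentage of it.
df = df_national.merge(df_global, on="year", how="left")
df["emissions_total_as_share_of_global"] = df["emissions_total"] / df["global_emissions_total"] * 100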
- cumulative_emissions_from_oil_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from oil" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from oil since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from oil since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from oil has been calculated by Our World in Data using global CO₂ emissions from oil provided in the Global Carbon Budget dataset. Global oil emissions include all country emissions as well as emissions from international aviation and shipping." - cumulative_emissions_from_other_industry: - title: "Cumulative CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from other industry sources since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_other_industry_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from other industry" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from other industry sources since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from other industry sources since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from other industry sources has been calculated by Our World in Data using global CO₂ emissions from other industry sources provided in the Global Carbon Budget dataset. Global emissions from other industry sources include all country emissions." - cumulative_emissions_total: - title: "Cumulative CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description: "Total cumulative production-based emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_total_as_share_of_global: - title: "Share of global cumulative CO₂ emissions" - unit: "%" - short_unit: "%" - description: "Total cumulative production-based emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of data availability, measured as a percentage of global total cumulative production-based emissions of CO₂ since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset. Global emissions include all country emissions as well as emissions from international aviation and shipping." - cumulative_emissions_total_including_land_use_change: - title: "Cumulative CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "Total cumulative production-based emissions of carbon dioxide (CO₂), including land-use change, since the first year of data availability, measured in tonnes. 
This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_total_including_land_use_change_as_share_of_global: - title: "Share of global cumulative CO₂ emissions including land-use change" - unit: "%" - short_unit: "%" - description: "Total cumulative production-based emissions of carbon dioxide (CO₂), including land-use change, since the first year of data availability, measured as a percentage of global total cumulative production-based emissions of CO₂ (including land-use change) since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset. Global emissions include all country emissions as well as emissions from international aviation and shipping." - emissions_from_cement: - title: "Annual CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from cement, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_cement_as_share_of_global: - title: "Share of global annual CO₂ emissions from cement" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from cement, measured as a percentage of global production-based emissions of CO₂ from cement in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from cement has been calculated by Our World in Data using global CO₂ emissions from cement provided in the Global Carbon Budget dataset." - emissions_from_cement_per_capita: - title: "Annual CO₂ emissions from cement (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from cement, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_coal: - title: "Annual CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from coal, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_coal_as_share_of_global: - title: "Share of global annual CO₂ emissions from coal" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from coal, measured as a percentage of global production-based emissions of CO₂ from coal in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from coal has been calculated by Our World in Data using global CO₂ emissions from coal provided in the Global Carbon Budget dataset." - emissions_from_coal_per_capita: - title: "Annual CO₂ emissions from coal (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from coal, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." 
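The per-capita variables are plain ratios of national emissions to population in the same country and year, the same construction as add_per_capita_variables in the deleted 2022-09-29 step above. Illustratively, with toy values and assumed column names:

import pandas as pd

df = pd.DataFrame({"emissions_from_coal": [1_200_000.0], "population": [6_000_000]})
# Tonnes of CO2 divided by persons gives tonnes per capita.
df["emissions_from_coal_per_capita"] = df["emissions_from_coal"] / df["population"]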
- emissions_from_flaring: - title: "Annual CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from flaring, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_flaring_as_share_of_global: - title: "Share of global annual CO₂ emissions from flaring" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from flaring, measured as a percentage of global production-based emissions of CO₂ from flaring in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from flaring has been calculated by Our World in Data using global CO₂ emissions from flaring provided in the Global Carbon Budget dataset." - emissions_from_flaring_per_capita: - title: "Annual CO₂ emissions from flaring (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from flaring, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_gas: - title: "Annual CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from gas, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_gas_as_share_of_global: - title: "Share of global annual CO₂ emissions from gas" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from gas, measured as a percentage of global production-based emissions of CO₂ from gas in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from gas has been calculated by Our World in Data using global CO₂ emissions from gas provided in the Global Carbon Budget dataset. Global gas emissions include all country emissions as well as emissions from international aviation and shipping." - emissions_from_gas_per_capita: - title: "Annual CO₂ emissions from gas (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from gas, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_land_use_change: - title: "Annual CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_land_use_change_as_share_of_global: - title: "Share of global annual CO₂ emissions from land-use change" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from land-use change, measured as a percentage of global production-based emissions of CO₂ from land-use change in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. 
Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset." - emissions_from_land_use_change_per_capita: - title: "Annual CO₂ emissions from land-use change per capita" - unit: "tonnes of CO₂ per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_oil: - title: "Annual CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from oil, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_oil_as_share_of_global: - title: "Share of global annual CO₂ emissions from oil" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from oil, measured as a percentage of global production-based emissions of CO₂ from oil in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from oil has been calculated by Our World in Data using global CO₂ emissions from oil provided in the Global Carbon Budget dataset. Global oil emissions include all country emissions as well as emissions from international aviation and shipping." - emissions_from_oil_per_capita: - title: "Annual CO₂ emissions from oil (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from oil, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_other_industry: - title: "Annual CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_other_industry_as_share_of_global: - title: "Share of global annual CO₂ emissions from other industry" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from other industry sources, measured as a percentage of global production-based emissions of CO₂ from other industry sources in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from other industry sources has been calculated by Our World in Data using global CO₂ emissions from other industry sources provided in the Global Carbon Budget dataset. Global emissions from other industry sources include all country emissions." - emissions_from_other_industry_per_capita: - title: "Annual CO₂ emissions from other industry (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." 
- emissions_total: - title: "Annual CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_as_share_of_global: - title: "Share of global annual CO₂ emissions" - unit: "%" - short_unit: "%" - description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured as a percentage of global production-based emissions of CO₂ in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset. Global emissions include all country emissions as well as emissions from international aviation and shipping." - emissions_total_including_land_use_change: - title: "Annual CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "Annual total production-based emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_including_land_use_change_as_share_of_global: - title: "Share of global annual CO₂ emissions including land-use change" - unit: "%" - short_unit: "%" - description: "Annual total production-based emissions of carbon dioxide (CO₂), including land-use change, measured as a percentage of global total production-based emissions of CO₂ in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset. Global emissions include all country emissions as well as emissions from international aviation and shipping." - emissions_total_including_land_use_change_per_capita: - title: "Annual CO₂ emissions including land-use change per capita" - unit: "tonnes of CO₂ per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_including_land_use_change_per_gdp: - title: "Annual CO₂ emissions including land-use change per GDP" - unit: "kilograms per international-$" - short_unit: "kg/$" - description: "Annual total production-based emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per dollar of GDP (2011 international-$). Production-based emissions are based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_including_land_use_change_per_unit_energy: - title: "Annual CO₂ emissions including land-use change per unit energy" - unit: "kilograms per kilowatt-hour" - short_unit: "kg/kWh" - description: "Annual total production-based emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per kilowatt-hour of primary energy consumption. Production-based emissions are based on territorial emissions, which do not account for emissions embedded in traded goods." 
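The two intensity variables (kg/$ and kg/kWh) each need a unit conversion: emissions go from tonnes to kilograms, and primary energy from terawatt-hours to kilowatt-hours via the TWH_TO_KWH = 1e9 constant defined in the deleted step further below. A sketch with toy values; the tonnes-to-kilograms factor is spelled out here as an assumption rather than copied from the file:

# TWH_TO_KWH matches the constant in the deleted step below; TONNES_TO_KG is assumed.
TONNES_TO_KG = 1_000
TWH_TO_KWH = 1e9

emissions_tonnes = 36.0e9       # annual CO2 emissions, in tonnes (toy value)
gdp = 120.0e12                  # GDP, in 2011 international-$ (toy value)
primary_energy_twh = 160_000.0  # primary energy consumption, in TWh (toy value)

emissions_per_gdp = emissions_tonnes * TONNES_TO_KG / gdp  # kg of CO2 per $
emissions_per_unit_energy = emissions_tonnes * TONNES_TO_KG / (primary_energy_twh * TWH_TO_KWH)  # kg of CO2 per kWh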
- emissions_total_per_capita: - title: "Annual CO₂ emissions (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_per_gdp: - title: "Annual CO₂ emissions per GDP (kg per international-$)" - unit: "kilograms per international-$" - short_unit: "kg/$" - description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per dollar of GDP (2011 international-$). Production-based emissions are based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_per_unit_energy: - title: "Annual CO₂ emissions per unit energy (kg per kilowatt-hour)" - unit: "kilograms per kilowatt-hour" - short_unit: "kg/kWh" - description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per kilowatt-hour of primary energy consumption. Production-based emissions are based on territorial emissions, which do not account for emissions embedded in traded goods." - gdp: - title: "GDP" - unit: "2011 international-$" - short_unit: "$" - description: >- - Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over time (inflation) - and price differences between countries. - global_cumulative_emissions_from_cement: - title: "Global cumulative CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_coal: - title: "Global cumulative CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_flaring: - title: "Global cumulative CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_gas: - title: "Global cumulative CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_land_use_change: - title: "Global cumulative CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_oil: - title: "Global cumulative CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_other_industry: - title: "Global cumulative CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_total: - title: "Global cumulative CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_total_including_land_use_change: - title: "Global cumulative CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_cement: - title: "Global annual CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_coal: - title: "Global annual CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_flaring: - title: "Global annual CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_gas: - title: "Global annual CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_international_transport: - title: "Global annual CO₂ emissions 
from international transport" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_land_use_change: - title: "Global annual CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_oil: - title: "Global annual CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_other_industry: - title: "Global annual CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_total: - title: "Global annual CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_total_including_land_use_change: - title: "Global annual CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "" - global_population: - title: "Global population" - unit: "persons" - short_unit: "persons" - description: "World population." - growth_emissions_total: - title: "Annual CO₂ emissions growth (abs)" - unit: "tonnes" - short_unit: "t" - description: "Annual growth in total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - growth_emissions_total_including_land_use_change: - title: "Growth rate of emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "Annual growth in total production-based emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - land_use_change_quality_flag: - title: "Land-use change quality flag" - unit: "" - short_unit: "" - description: "Carbon dioxide emissions from land use change vary significantly in their degree of certainty. The quality flag is 1 if the different estimates of land-use change emissions considered by the Global Carbon Project have a reasonable agreement. Otherwise the quality flag is 0. The flag is also set to zero if not all estimates have data for a given country. For a more detailed definition, see the original paper." - pct_growth_emissions_total: - title: "Annual CO₂ emissions growth (%)" - unit: "%" - short_unit: "%" - description: "Annual percentage growth in total production-based emissions of carbon dioxide (CO₂), excluding land-use change. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - pct_growth_emissions_total_including_land_use_change: - title: "Growth rate of emissions including land-use change (%)" - unit: "%" - short_unit: "%" - description: "Annual percentage growth in total production-based emissions of carbon dioxide (CO₂), including land-use change. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - pct_traded_emissions: - title: "Share of annual CO₂ emissions embedded in trade" - unit: "%" - short_unit: "%" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured as a percentage of production-based emissions of CO₂. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." 
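Net emissions embedded in trade, as these descriptions define it, is simply consumption-based minus production-based emissions, so a positive value marks a net importer and a negative value a net exporter. A hedged sketch (assumed column names):

import pandas as pd

df = pd.DataFrame({"production_emissions": [100.0, 80.0], "consumption_emissions": [110.0, 60.0]})
# Consumption minus production: positive = net importer of CO2, negative = net exporter.
df["traded_emissions"] = df["consumption_emissions"] - df["production_emissions"]
# And as a share of production-based emissions (pct_traded_emissions above).
df["pct_traded_emissions"] = df["traded_emissions"] / df["production_emissions"] * 100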
- pct_traded_emissions_including_land_use_change: - title: "Traded emissions including land-use change (%)" - unit: "%" - short_unit: "%" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, including land-use change, measured as a percentage of production-based emissions of CO₂, including land-use change. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." - population: - title: "Population" - unit: "persons" - short_unit: "persons" - description: "" - population_as_share_of_global: - title: "Share of population" - unit: "%" - short_unit: "%" - description: "Population, measured as a percentage of global total population in the same year." - primary_energy_consumption: - title: "Primary energy consumption" - unit: "terawatt-hours" - short_unit: "TWh" - description: "Primary energy consumption, measured in terawatt-hours per year." - traded_emissions: - title: "Annual CO₂ emissions embedded in trade" - unit: "tonnes" - short_unit: "t" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." - traded_emissions_including_land_use_change: - title: "Traded emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, including land-use change, measured in tonnes. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." - traded_emissions_including_land_use_change_per_capita: - title: "Traded emissions including land-use change per capita" - unit: "tonnes of CO₂ per capita" - short_unit: "t" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, including land-use change, measured in tonnes per person. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." - traded_emissions_per_capita: - title: "Annual CO₂ emissions embedded in trade (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes per person. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." diff --git a/etl/steps/archive/garden/gcp/2022-11-11/global_carbon_budget.py b/etl/steps/archive/garden/gcp/2022-11-11/global_carbon_budget.py deleted file mode 100644 index 703a4c962af..00000000000 --- a/etl/steps/archive/garden/gcp/2022-11-11/global_carbon_budget.py +++ /dev/null @@ -1,1045 +0,0 @@ -"""This step creates the Global Carbon Budget (GCB) dataset, by the Global Carbon Project (GCP). 
- -It combines the following datasets: -- GCP's Fossil CO2 emissions (long-format csv). -- GCP's official GCB global emissions (excel file) containing global bunker fuel and land-use change emissions. -- GCP's official GCB national emissions (excel file) containing consumption-based emissions for each country. - - Production-based emissions from this file are also used, but just to include total emissions of regions - according to GCP (e.g. "Africa (GCP)") and for sanity checks. -- GCP's official GCB national land-use change emissions (excel file) with land-use change emissions for each country. -And additionally: -- GGDC's Maddison dataset on GDP, used to calculate emissions per GDP. -- Primary Energy Consumption (mix of sources from the 'energy' namespace), to calculate emissions per unit energy. -- Population (mix of sources from the 'owid' namespace), to calculate emissions per capita. -- Countries-regions (mix of sources from the 'reference' namespace), to generate aggregates for different continents. -- World Bank's Income groups, to generate aggregates for different income groups. - -""" - -from typing import Dict, List, Optional - -import numpy as np -import pandas as pd -from owid import catalog -from owid.datautils import dataframes - -from etl.data_helpers import geo -from etl.paths import DATA_DIR, STEP_DIR - -# Define inputs. -MEADOW_VERSION = "2022-11-11" -# Country names harmonization file for fossil CO2 emissions data. -FOSSIL_CO2_EMISSIONS_COUNTRIES_FILE = ( - STEP_DIR / f"data/garden/gcp/{MEADOW_VERSION}/global_carbon_budget_fossil_co2_emissions.countries.json" -) -# Country names harmonization file for national emissions data. -NATIONAL_EMISSIONS_COUNTRIES_FILE = ( - STEP_DIR / f"data/garden/gcp/{MEADOW_VERSION}/global_carbon_budget_national_emissions.countries.json" -) -# Country names harmonization file for national land-use change emissions data. -LAND_USE_EMISSIONS_COUNTRIES_FILE = ( - STEP_DIR / f"data/garden/gcp/{MEADOW_VERSION}/global_carbon_budget_land_use_change_emissions.countries.json" -) -# Meadow dataset on GCB fossil CO2 emissions. -MEADOW_CO2_DATASET_PATH = DATA_DIR / f"meadow/gcp/{MEADOW_VERSION}/global_carbon_budget_fossil_co2_emissions" -# Meadow dataset on global emissions. -MEADOW_GLOBAL_EMISSIONS_DATASET_PATH = DATA_DIR / f"meadow/gcp/{MEADOW_VERSION}/global_carbon_budget_global_emissions" -# Meadow dataset on national emissions. -MEADOW_NATIONAL_EMISSIONS_DATASET_PATH = ( - DATA_DIR / f"meadow/gcp/{MEADOW_VERSION}/global_carbon_budget_national_emissions" -) -# Meadow dataset on GCB national land-use change emissions. -MEADOW_LAND_USE_EMISSIONS_DATASET_PATH = ( - DATA_DIR / f"meadow/gcp/{MEADOW_VERSION}/global_carbon_budget_land_use_change_emissions" -) -# Garden dataset on primary energy consumption. -GARDEN_PRIMARY_ENERGY_DATASET_PATH = DATA_DIR / "garden/energy/2022-07-29/primary_energy_consumption" -# Garden dataset on GDP. -GARDEN_GDP_DATASET_PATH = DATA_DIR / "garden/ggdc/2020-10-01/ggdc_maddison" -# Additionally, population dataset and income groups are also used (through datautils.geo functions). - -# Define outputs. -# Name of output dataset. -VERSION = MEADOW_VERSION -DATASET_NAME = "global_carbon_budget" -# Path to metadata file. -METADATA_PATH = STEP_DIR / f"data/garden/gcp/{MEADOW_VERSION}/global_carbon_budget.meta.yml" - -# Expected outliers in consumption-based emissions (with negative emissions in the original data, which will be removed).
-OUTLIERS_IN_CONSUMPTION_DF = [ - ("Panama", 2003), - ("Panama", 2004), - ("Panama", 2005), - ("Panama", 2006), - ("Panama", 2011), - ("Panama", 2012), - ("Panama", 2013), - ("Venezuela", 2018), -] - -# Label used for international transport (emissions from oil in bunker fuels), included as a country in the -# fossil CO2 emissions dataset. -INTERNATIONAL_TRANSPORT_LABEL = "International Transport" - -# Regions and income groups to create by aggregating contributions from member countries. -# In the following dictionary, if nothing is stated, the region is supposed to be a default continent/income group. -# Otherwise, the dictionary can have "regions_included", "regions_excluded", "countries_included", and -# "countries_excluded". The aggregates will be calculated on the resulting countries. -REGIONS = { - # Default continents. - "Africa": {}, - "Asia": {}, - "Europe": {}, - "European Union (27)": {}, - "North America": {}, - "Oceania": {}, - "South America": {}, - # Income groups. - "Low-income countries": {}, - "Upper-middle-income countries": {}, - "Lower-middle-income countries": {}, - "High-income countries": {}, - # Additional composite regions. - "Asia (excl. China and India)": { - "regions_included": ["Asia"], - "countries_excluded": ["China", "India"], - }, - "Europe (excl. EU-27)": {"regions_included": ["Europe"], "regions_excluded": ["European Union (27)"]}, - "Europe (excl. EU-28)": { - "regions_included": ["Europe"], - "regions_excluded": ["European Union (27)"], - "countries_excluded": ["United Kingdom"], - }, - "European Union (28)": { - "regions_included": ["European Union (27)"], - "countries_included": ["United Kingdom"], - }, - "North America (excl. USA)": { - "regions_included": ["North America"], - "countries_excluded": ["United States"], - }, -} - -# Columns to use from GCB fossil CO2 emissions data and how to rename them. -CO2_COLUMNS = { - "country": "country", - "year": "year", - "cement": "emissions_from_cement", - "coal": "emissions_from_coal", - "flaring": "emissions_from_flaring", - "gas": "emissions_from_gas", - "oil": "emissions_from_oil", - "other": "emissions_from_other_industry", - "total": "emissions_total", -} - -# List all sources of emissions considered. -EMISSION_SOURCES = [column for column in CO2_COLUMNS.values() if column not in ["country", "year"]] - -# Columns to use from primary energy consumption data and how to rename them. -PRIMARY_ENERGY_COLUMNS = { - "country": "country", - "year": "year", - "primary_energy_consumption__twh": "primary_energy_consumption", -} - -# Columns to use from GDP data and how to rename them. -GDP_COLUMNS = { - "country": "country", - "year": "year", - "gdp": "gdp", -} - -# Columns to use from primary energy consumption data and how to rename them. -HISTORICAL_EMISSIONS_COLUMNS = { - "country": "country", - "year": "year", - # Global fossil emissions are used only for sanity checks. - "global_fossil_emissions": "global_fossil_emissions", - "global_land_use_change_emissions": "global_emissions_from_land_use_change", -} - -# Columns to use from consumption-based emissions data and how to rename them. -CONSUMPTION_EMISSIONS_COLUMNS = { - "country": "country", - "year": "year", - "consumption_emissions": "consumption_emissions", -} - -# Conversion from terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 - -# Conversion from million tonnes of CO2 to tonnes of CO2. -MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 = 1e6 - -# Conversion from tonnes of CO2 to kg of CO2 (used for emissions per GDP and per unit energy). 
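The REGIONS mapping above is consumed by get_countries_in_region, deleted further below. A minimal sketch of its set arithmetic, with a hypothetical REGION_MEMBERS table standing in for OWID's countries-regions dataset (toy values):

```python
# Sketch of the region-resolution logic, assuming a toy region -> members table.
REGION_MEMBERS = {
    "Europe": ["France", "Germany", "Poland", "United Kingdom", "Norway"],
    "European Union (27)": ["France", "Germany", "Poland"],
}

def countries_in_region(region, modifications=None):
    mods = modifications or {}
    # Start from the regions whose members are included (defaults to the region itself).
    countries = set()
    for r in mods.get("regions_included", [region]):
        countries |= set(REGION_MEMBERS[r])
    # Remove members of excluded regions, then apply individual inclusions/exclusions.
    for r in mods.get("regions_excluded", []):
        countries -= set(REGION_MEMBERS[r])
    countries |= set(mods.get("countries_included", []))
    countries -= set(mods.get("countries_excluded", []))
    return sorted(countries)

# "Europe (excl. EU-27)" resolves to Europe minus the EU-27 members.
print(countries_in_region("Europe", {"regions_included": ["Europe"], "regions_excluded": ["European Union (27)"]}))
# -> ['Norway', 'United Kingdom']
```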
-TONNES_OF_CO2_TO_KG_OF_CO2 = 1000 - -# In order to remove uninformative columns, keep only rows where at least one of the following columns has data. -# All other columns are either derived variables, or global variables, or auxiliary variables from other datasets. -COLUMNS_THAT_MUST_HAVE_DATA = [ - "emissions_from_cement", - "emissions_from_coal", - "emissions_from_flaring", - "emissions_from_gas", - "emissions_from_oil", - "emissions_from_other_industry", - "emissions_total", - "consumption_emissions", - "emissions_from_land_use_change", - # 'land_use_change_quality_flag', -] - - -def get_countries_in_region( - region: str, region_modifications: Optional[Dict[str, Dict[str, List[str]]]] = None -) -> List[str]: - """Get countries in a region, both for known regions (e.g. "Africa") and custom ones (e.g. "Europe (excl. EU-27)"). - - Parameters - ---------- - region : str - Region name (e.g. "Africa", or "Europe (excl. EU-27)"). - region_modifications : dict or None - If None (or an empty dictionary), the region should be in OWID's countries-regions dataset. - If not None, it should be a dictionary with any (or all) of the following keys: - - "regions_included": List of regions whose countries will be included. - - "regions_excluded": List of regions whose countries will be excluded. - - "countries_included": List of additional individual countries to be included. - - "countries_excluded": List of additional individual countries to be excluded. - NOTE: All regions and countries defined in this dictionary should be in OWID's countries-regions dataset. - - Returns - ------- - countries : list - List of countries in the specified region. - - """ - if region_modifications is None: - region_modifications = {} - - # Check that the fields in the regions_modifications dictionary are well defined. - expected_fields = ["regions_included", "regions_excluded", "countries_included", "countries_excluded"] - assert all([field in expected_fields for field in region_modifications]) - - # Get lists of regions whose countries will be included and excluded. - regions_included = region_modifications.get("regions_included", [region]) - regions_excluded = region_modifications.get("regions_excluded", []) - # Get lists of additional individual countries to include and exclude. - countries_included = region_modifications.get("countries_included", []) - countries_excluded = region_modifications.get("countries_excluded", []) - - # List countries from the list of regions included. - countries_set = set( - sum([geo.list_countries_in_region(region_included) for region_included in regions_included], []) - ) - - # Remove all countries from the list of regions excluded. - countries_set -= set( - sum([geo.list_countries_in_region(region_excluded) for region_excluded in regions_excluded], []) - ) - - # Add the list of individual countries to be included. - countries_set |= set(countries_included) - - # Remove the list of individual countries to be excluded. - countries_set -= set(countries_excluded) - - # Convert set of countries into a sorted list. - countries = sorted(countries_set) - - return countries - - -def sanity_checks_on_input_data( - production_df: pd.DataFrame, consumption_df: pd.DataFrame, historical_df: pd.DataFrame, co2_df: pd.DataFrame -) -> None: - """Run sanity checks on input data files. - - These checks should be used prior to country harmonization, but after basic processing of the dataframes. 
- - Parameters - ---------- - production_df : pd.DataFrame - Production-based emissions from GCP's official national emissions dataset (excel file). - consumption_df : pd.DataFrame - Consumption-based emissions from GCP's official national emissions dataset (excel file). - historical_df : pd.DataFrame - Historical emissions from GCP's official global emissions dataset (excel file). - co2_df : pd.DataFrame - Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). - - """ - production_df = production_df.copy() - consumption_df = consumption_df.copy() - historical_df = historical_df.copy() - co2_df = co2_df.copy() - - # In the original data, Bunkers was included in the national data file, as another country. - # But I suppose it should be considered as another kind of global emission. - # In fact, bunker emissions should coincide for production and consumption emissions. - global_bunkers_emissions = ( - production_df[production_df["country"] == "Bunkers"][["year", "production_emissions"]] - .reset_index(drop=True) - .rename(columns={"production_emissions": "global_bunker_emissions"}, errors="raise") - ) - - # Check that we get exactly the same array of bunker emissions from the consumption emissions dataframe - # (on years where there is data for bunker emissions in both datasets). - comparison = pd.merge( - global_bunkers_emissions, - consumption_df[consumption_df["country"] == "Bunkers"][["year", "consumption_emissions"]] - .reset_index(drop=True) - .rename(columns={"consumption_emissions": "global_bunker_emissions"}, errors="raise"), - how="inner", - on="year", - suffixes=("", "_check"), - ) - - error = "Bunker emissions were expected to coincide in production and consumption emissions dataframes." - assert (comparison["global_bunker_emissions"] == comparison["global_bunker_emissions_check"]).all(), error - - # Check that all production-based emissions are non-negative. - error = "There are negative emissions in production_df (from the additional variables dataset)." - assert (production_df.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error - - # Check that all production-based emissions from the fossil CO2 dataset are non-negative. - error = "There are negative emissions in co2_df (from the fossil CO2 dataset)." - assert (co2_df.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error - - # Check that all consumption-based emissions are non-negative. - error = "There are negative emissions in consumption_df (from the national emissions dataset)." - assert (consumption_df.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error - - # Check that, for the World, production emissions coincide with consumption emissions (on common years). - error = "Production and consumption emissions for the world were expected to be identical." - comparison = pd.merge( - production_df[production_df["country"] == "World"].reset_index(drop=True), - consumption_df[consumption_df["country"] == "World"].reset_index(drop=True), - how="inner", - on="year", - ) - assert (comparison["production_emissions"] == comparison["consumption_emissions"]).all(), error - - # Check that production emissions for the World coincide with global (historical) emissions (on common years).
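The comment above introduces the next check, which follows the merge-then-assert pattern used throughout this function, this time with a relative tolerance. A condensed sketch of the idea on toy frames (names and values hypothetical):

```python
import pandas as pd

# Two independent sources that should agree on overlapping years (toy values).
a = pd.DataFrame({"year": [2000, 2001, 2002], "emissions": [10.0, 11.0, 12.0]})
b = pd.DataFrame({"year": [2001, 2002, 2003], "emissions": [11.0, 12.0, 13.0]})

# Inner-join on year so only common years are compared; suffixes disambiguate the columns.
comparison = pd.merge(a, b, on="year", how="inner", suffixes=("", "_check"))

# Assert agreement within a small relative tolerance, as the deleted checks do.
error = "Sources were expected to coincide on common years."
assert ((comparison["emissions"] - comparison["emissions_check"]).abs()
        / comparison["emissions_check"] < 0.001).all(), error
```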
- comparison = pd.merge( - production_df[production_df["country"] == "World"][["year", "production_emissions"]].reset_index(drop=True), - historical_df[["year", "global_fossil_emissions"]], - how="inner", - on="year", - ) - error = "Production emissions for the world were expected to coincide with global fossil emissions." - assert ( - abs(comparison["production_emissions"] - comparison["global_fossil_emissions"]) - / (comparison["global_fossil_emissions"]) - < 0.001 - ).all(), error - - # Check that emissions in production_df (emissions from the national excel file) coincide with emissions in co2_df - # (from the Fossil CO2 emissions csv file). - # Given that country names have not yet been harmonized, rename the only countries that are present in both datasets. - comparison = pd.merge( - co2_df[["country", "year", "emissions_total"]], - production_df.replace({"Bunkers": "International Transport", "World": "Global"}), - on=["country", "year"], - how="inner", - ).dropna(subset=["emissions_total", "production_emissions"], how="any") - # Since we included the emissions from the Kuwaiti oil fires in Kuwait (and they are not included in production_df), - # omit that row in the comparison. - comparison = comparison.drop( - comparison[(comparison["country"] == "Kuwait") & (comparison["year"] == 1991)].index - ).reset_index(drop=True) - - error = "Production emissions from national file were expected to coincide with the Fossil CO2 emissions dataset." - assert ( - ( - 100 - * abs(comparison["production_emissions"] - comparison["emissions_total"]) - / (comparison["emissions_total"]) - ).fillna(0) - < 0.1 - ).all(), error - - -def sanity_checks_on_output_data(combined_df: pd.DataFrame) -> None: - """Run sanity checks on output data. - - These checks should be run on the very final output dataframe (with an index) prior to storing it as a table. - - Parameters - ---------- - combined_df : pd.DataFrame - Combination of all input dataframes, after processing, harmonization, and addition of variables. - - """ - combined_df = combined_df.reset_index() - error = "All variables (except traded emissions, growth, and land-use change) should be >= 0 or nan." - positive_variables = [ - col - for col in combined_df.columns - if col != "country" - if "traded" not in col - if "growth" not in col - if "land_use" not in col - ] - assert (combined_df[positive_variables].fillna(0) >= 0).all().all(), error - - error = "Production emissions as a share of global emissions should be 100% for 'World' (within 2% error)." - assert combined_df[ - (combined_df["country"] == "World") & (abs(combined_df["emissions_total_as_share_of_global"] - 100) > 2) - ].empty, error - - error = "Consumption emissions as a share of global emissions should be 100% for 'World' (within 2% error)." - assert combined_df[ - (combined_df["country"] == "World") & (abs(combined_df["consumption_emissions_as_share_of_global"] - 100) > 2) - ].empty, error - - error = "Population as a share of global population should be 100% for 'World'." - assert combined_df[ - (combined_df["country"] == "World") & (combined_df["population_as_share_of_global"].fillna(100) != 100) - ].empty, error - - error = "All share of global emissions should be smaller than 100% (within 2% error)." - share_variables = [col for col in combined_df.columns if "share" in col] - assert (combined_df[share_variables].fillna(0) <= 102).all().all(), error - - # Check that cumulative variables are monotonically increasing. 
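The monotonicity check announced above follows next. Its trick is to sort backwards in time and bound each percentage step, rather than rely on `.is_monotonic_increasing`, which can fail on tiny float differences. A minimal sketch on toy data:

```python
import pandas as pd

df = pd.DataFrame({
    "country": ["A"] * 4,
    "year": [2000, 2001, 2002, 2003],
    "cumulative_emissions": [1.0, 2.0, 3.0, 4.0],
})

# Sorted backwards in time, every pct_change step of a cumulative series should be <= 0
# (plus a small tolerance for float noise), i.e. the series never decreases forward in time.
ok = (
    df.sort_values("year", ascending=False)
    .groupby("country")["cumulative_emissions"]
    .apply(lambda x: ((x.pct_change().dropna() * 100) <= 0.1).all())
    .all()
)
assert ok, "Cumulative variables should be monotonically increasing."
```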
- # Firstly, list columns of cumulative variables, but ignoring cumulative columns as a share of global - # (since they are not necessarily monotonic) and land-use change (which can be negative). - cumulative_cols = [ - col for col in combined_df.columns if "cumulative" in col if "share" not in col if "land_use" not in col - ] - # Using ".is_monotonic_increasing" can fail when differences between consecutive numbers are very small. - # Instead, sort data backwards in time, and check that consecutive values of cumulative variables always have - # a percentage change that is smaller than, say, 0.1%. - error = ( - "Cumulative variables (not given as a share of global) should be monotonically increasing (except when " - "including land-use change emissions, which can be negative)." - ) - assert ( - combined_df.sort_values("year", ascending=False) - .groupby("country") - .agg({col: lambda x: ((x.pct_change().dropna() * 100) <= 0.1).all() for col in cumulative_cols}) - .all() - .all() - ), error - - error = ( - "Production emissions as a share of global production emissions for the World should always be 100% " - "(or larger than 98%, given small discrepancies)." - ) - # Consumption emissions as a share of global production emissions is allowed to be smaller than 100%. - share_variables = [col for col in combined_df.columns if "share" in col if "consumption" not in col] - assert (combined_df[combined_df["country"] == "World"][share_variables].fillna(100) > 98).all().all(), error - - error = "Traded emissions for the World should be close to zero (within 2% error)." - world_mask = combined_df["country"] == "World" - assert ( - abs( - 100 - * combined_df[world_mask]["traded_emissions"].fillna(0) - / combined_df[world_mask]["emissions_total"].fillna(1) - ) - < 2 - ).all(), error - - -def prepare_fossil_co2_emissions(co2_df: pd.DataFrame) -> pd.DataFrame: - """Prepare Fossil CO2 emissions data (basic processing). - - Select and rename columns to be used, adapt units, and fix known issues. - - Parameters - ---------- - co2_df : pd.DataFrame - Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). - - Returns - ------- - co2_df : pd.DataFrame - Fossil CO2 emissions data after basic processing. - - """ - # Select and rename columns from fossil CO2 data. - co2_df = co2_df[list(CO2_COLUMNS)].rename(columns=CO2_COLUMNS, errors="raise") - - # Ensure all emissions are given in tonnes of CO2. - co2_df[EMISSION_SOURCES] *= MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 - - #################################################################################################################### - # NOTE: For certain years, column "emissions_from_other_industry" is not informed for "World" but it is informed - # for some countries (namely China and US). - # This causes the cumulative emissions from other industry as share of global for those countries to become larger - # than 100%. - # This temporary solution fixes the issue: We aggregate the data for China and US on those years when the world's - # data is missing (without touching other years or other columns). - # Firstly, list of years for which the world has no data for emissions_from_other_industry. - world_missing_years = ( - co2_df[(co2_df["country"] == "Global") & (co2_df["emissions_from_other_industry"].isnull())]["year"] - .unique() - .tolist() # type: ignore - ) - # Data that needs to be aggregated. 
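The fix that follows sums the informed countries on the years where the World row is empty and patches the result back in. A minimal sketch of the same idea with pandas only, using `combine_first` in place of datautils' `combine_two_overlapping_dataframes` (toy values):

```python
import pandas as pd

df = pd.DataFrame({
    "country": ["Global", "Global", "China", "US", "China"],
    "year":    [2000,     2001,     2000,    2000, 2001],
    "other":   [None,     5.0,      1.0,     2.0,  5.0],
})

# Years where the Global row is missing the column.
missing_years = df.loc[(df["country"] == "Global") & df["other"].isnull(), "year"].unique()

# Aggregate the informed countries on those years and label the result as Global.
patch = (
    df[df["year"].isin(missing_years) & df["other"].notnull()]
    .groupby("year", as_index=False)["other"].sum()
    .assign(country="Global")
)

# Patch the aggregate into the main frame, preferring existing Global values.
fixed = (
    df.set_index(["country", "year"])
    .combine_first(patch.set_index(["country", "year"]))
    .reset_index()
)
print(fixed[fixed["country"] == "Global"].sort_values("year"))
# Global/2000 is now 3.0 (1.0 + 2.0); Global/2001 keeps its original 5.0.
```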
- data_missing_in_world = co2_df[ - co2_df["year"].isin(world_missing_years) & (co2_df["emissions_from_other_industry"].notnull()) - ] - # Check that there is indeed data to be aggregated (that is missing for the World). - error = ( - "Expected emissions_from_other_industry to be null for the world but not null for certain countries " - "(which was an issue in the original fossil CO2 data). The issue may be fixed and the code can be simplified." - ) - assert len(data_missing_in_world) > 0, error - # Create a dataframe of aggregate data for the World, on those years when it's missing. - aggregated_missing_data = ( - data_missing_in_world.groupby("year") - .agg({"emissions_from_other_industry": "sum"}) - .reset_index() - .assign(**{"country": "Global"}) - ) - # Combine the new dataframe of aggregate data with the main dataframe. - co2_df = dataframes.combine_two_overlapping_dataframes( - df1=co2_df, df2=aggregated_missing_data, index_columns=["country", "year"], keep_column_order=True - ) - #################################################################################################################### - - # We add the emissions from "Kuwaiti Oil Fires" (which is also included as a separate country) as part of the - # emissions of Kuwait. This ensures that they will be included in region aggregates. - error = "'Kuwaiti Oil Fires' was expected to only have not-null data for 1991." - assert co2_df[ - (co2_df["country"] == "Kuwaiti Oil Fires") - & (co2_df["emissions_total"].notnull()) - & (co2_df["emissions_total"] != 0) - ]["year"].tolist() == [1991], error - - co2_df.loc[(co2_df["country"] == "Kuwait") & (co2_df["year"] == 1991), EMISSION_SOURCES] = ( - co2_df[(co2_df["country"] == "Kuwaiti Oil Fires") & (co2_df["year"] == 1991)][EMISSION_SOURCES].values - + co2_df[(co2_df["country"] == "Kuwait") & (co2_df["year"] == 1991)][EMISSION_SOURCES].values - ) - - # Check that "emissions_total" agrees with the sum of emissions from individual sources. - error = "The sum of all emissions should add up to total emissions (within 1%)." - assert ( - abs( - co2_df.drop(columns=["country", "year", "emissions_total"]).sum(axis=1) - - co2_df["emissions_total"].fillna(0) - ) - / (co2_df["emissions_total"].fillna(0) + 1e-7) - < 1e-2 - ).all(), error - - # Many rows have zero total emissions, but actually the individual sources are nan. - # Total emissions in those cases should be nan, instead of zero. - no_individual_emissions = co2_df.drop(columns=["country", "year", "emissions_total"]).isnull().all(axis=1) - co2_df.loc[no_individual_emissions, "emissions_total"] = np.nan - - return co2_df - - -def prepare_consumption_emissions(consumption_df: pd.DataFrame) -> pd.DataFrame: - """Prepare consumption-based emissions data (basic processing). - - Select and rename columns to be used, adapt units, and fix known issues. - - Parameters - ---------- - consumption_df : pd.DataFrame - Consumption-based emissions from GCP's official national emissions dataset (excel file). - - Returns - ------- - consumption_df : pd.DataFrame - Consumption-based emissions after basic processing. - - """ - # Select and rename columns. - consumption_df = consumption_df[list(CONSUMPTION_EMISSIONS_COLUMNS)].rename( - columns=CONSUMPTION_EMISSIONS_COLUMNS, errors="raise" - ) - - # List indexes of rows in consumption_df corresponding to outliers (defined above in OUTLIERS_IN_CONSUMPTION_DF). 
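The outlier-removal step announced above locates each hard-coded (country, year) pair, asserts it is still negative, and only then drops it. A compact sketch on toy data:

```python
import pandas as pd

OUTLIERS = [("Panama", 2003), ("Venezuela", 2018)]

df = pd.DataFrame({
    "country": ["Panama", "Panama", "Venezuela"],
    "year": [2003, 2004, 2018],
    "consumption_emissions": [-1.0, 2.0, -3.0],
})

# Locate each known outlier row; .item() raises if the pair is missing or duplicated.
idx = [
    df[(df["country"] == c) & (df["year"] == y)].index.item()
    for c, y in OUTLIERS
]

# Guard: if the upstream data gets fixed, this assert flags that the hard-coded list is stale.
assert (df.loc[idx, "consumption_emissions"] < 0).all(), "Outliers no longer negative; update the list."

df = df.drop(idx).reset_index(drop=True)
print(df)  # only the Panama 2004 row remains
```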
- outlier_indexes = [ - consumption_df[(consumption_df["country"] == outlier[0]) & (consumption_df["year"] == outlier[1])].index.item() - for outlier in OUTLIERS_IN_CONSUMPTION_DF - ] - - error = ( - "Outliers were expected to have negative consumption emissions. " - "Maybe outliers have been fixed (and should be removed from the code)." - ) - assert (consumption_df.loc[outlier_indexes]["consumption_emissions"] < 0).all(), error - - # Remove outliers. - consumption_df = consumption_df.drop(outlier_indexes).reset_index(drop=True) - - return consumption_df - - -def extract_global_emissions(co2_df: pd.DataFrame, historical_df: pd.DataFrame) -> pd.DataFrame: - """Extract World emissions by combining data from the Fossil CO2 emissions and the global emissions dataset. - - The resulting global emissions data includes bunker and land-use change emissions. - - NOTE: This function has to be used after selecting and renaming columns in co2_df, but before harmonizing country - names in co2_df (so that "International Transport" is still listed as a country). - - Parameters - ---------- - co2_df : pd.DataFrame - Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). - historical_df : pd.DataFrame - Historical emissions from GCP's official global emissions dataset (excel file). - - Returns - ------- - global_emissions : pd.DataFrame - World emissions. - - """ - # For some reason, "International Transport" is included as another country, that only has emissions from oil. - # We separate it as another variable (only given at the global level). - global_transport = co2_df[co2_df["country"] == INTERNATIONAL_TRANSPORT_LABEL].reset_index(drop=True) - - # Check that total emissions for international transport coincide with oil emissions. - error = "Total emissions from international transport do not coincide with oil emissions." - assert all((global_transport["emissions_from_oil"] - global_transport["emissions_total"]).dropna() == 0), error - - # Therefore, we can keep only one column for international transport emissions. - global_transport = ( - global_transport[["year", "emissions_from_oil"]] - .dropna() - .rename(columns={"emissions_from_oil": "global_emissions_from_international_transport"}, errors="raise") - ) - - # Create a new dataframe of global emissions. - global_emissions = ( - co2_df[co2_df["country"].isin(["Global", "World"])][["year"] + EMISSION_SOURCES] - .rename(columns={column: f"global_{column}" for column in EMISSION_SOURCES}, errors="raise") - .sort_values("year") - .reset_index(drop=True) - ) - - # Add bunker fuels to global emissions. - global_emissions = pd.merge(global_emissions, global_transport, on=["year"], how="outer") - - # Add historical land-use change emissions to dataframe of global emissions. - global_emissions = pd.merge( - global_emissions, historical_df[["year", "global_emissions_from_land_use_change"]], how="left", on="year" - ) - - # Add variable of total emissions including fossil fuels and land use change. - global_emissions["global_emissions_total_including_land_use_change"] = ( - global_emissions["global_emissions_total"] + global_emissions["global_emissions_from_land_use_change"] - ) - - # Calculate global cumulative emissions. - for column in EMISSION_SOURCES + ["emissions_from_land_use_change", "emissions_total_including_land_use_change"]: - global_emissions[f"global_cumulative_{column}"] = global_emissions[f"global_{column}"].cumsum() - - # Add a country column and add global population. 
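Before the World label is attached below, extract_global_emissions has stitched the fossil, bunker, and land-use series together with outer merges and running totals. A minimal sketch of that assembly (toy values, column names abbreviated from the real ones):

```python
import pandas as pd

fossil = pd.DataFrame({"year": [2000, 2001], "global_emissions_total": [30.0, 31.0]})
transport = pd.DataFrame({"year": [2001], "global_emissions_from_international_transport": [1.0]})
land_use = pd.DataFrame({"year": [2000, 2001], "global_emissions_from_land_use_change": [4.0, 5.0]})

# Outer/left merges keep every year that any source knows about.
world = fossil.merge(transport, on="year", how="outer").merge(land_use, on="year", how="left")

# Total including land use, plus a cumulative series.
world["global_emissions_total_including_land_use_change"] = (
    world["global_emissions_total"] + world["global_emissions_from_land_use_change"]
)
world["global_cumulative_emissions_total"] = world["global_emissions_total"].cumsum()
print(world)
```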
- global_emissions["country"] = "World" - - # Add global population. - global_emissions = geo.add_population_to_dataframe(df=global_emissions, population_col="global_population") - - return global_emissions - - -def harmonize_co2_data(co2_df: pd.DataFrame) -> pd.DataFrame: - """Harmonize country names in Fossil CO2 data, and fix known issues with certain regions. - - Parameters - ---------- - co2_df : pd.DataFrame - Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). - - Returns - ------- - co2_df : pd.DataFrame - Fossil CO2 emissions data after harmonizing country names. - - """ - # Harmonize country names in fossil CO2 data. - co2_df = geo.harmonize_countries( - df=co2_df, - countries_file=FOSSIL_CO2_EMISSIONS_COUNTRIES_FILE, - warn_on_missing_countries=True, - warn_on_unused_countries=True, - ) - - # Check that there is only one data point for each country-year. - # After harmonization, "Pacific Islands (Palau)" is mapped to "Palau", and therefore there are rows with different - # data for the same country-year. - # However, "Pacific Islands (Palau)" have data until 1991, and "Palau" has data from 1992 onwards. - # After removing empty rows, there should be no overlap. - columns_that_must_have_data = co2_df.drop(columns=["country", "year"]).columns - check = co2_df.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) - error = "After harmonizing country names, there is more than one data point for the same country-year." - assert check[check.duplicated(subset=["country", "year"])].empty, error - - return co2_df - - -def combine_data_and_add_variables( - co2_df: pd.DataFrame, - production_df: pd.DataFrame, - consumption_df: pd.DataFrame, - global_emissions_df: pd.DataFrame, - land_use_df: pd.DataFrame, - gdp_df: pd.DataFrame, - primary_energy_df: pd.DataFrame, -) -> pd.DataFrame: - """Combine all relevant data into one dataframe, add region aggregates, and add custom variables (e.g. emissions per - capita). - - Parameters - ---------- - co2_df : pd.DataFrame - Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file), after harmonization. - production_df : pd.DataFrame - Production-based emissions from GCP's official national emissions dataset (excel file), after harmonization. - consumption_df : pd.DataFrame - Consumption-based emissions from GCP's official national emissions dataset (excel file), after harmonization. - global_emissions_df : pd.DataFrame - World emissions (including bunker and land-use change emissions). - land_use_df : pd.DataFrame - National land-use change emissions from GCP's official dataset (excel file), after harmonization. - gdp_df : pd.DataFrame - GDP data. - primary_energy_df : pd.DataFrame - Primary energy data. - - Returns - ------- - combined_df : pd.DataFrame - Combined data, with all additional variables and with region aggregates. - - """ - # Add region aggregates that were included in the national emissions file, but not in the Fossil CO2 emissions dataset. - gcp_aggregates = sorted(set(production_df["country"]) - set(co2_df["country"])) - co2_df = pd.concat( - [ - co2_df, - production_df[production_df["country"].isin(gcp_aggregates)] - .rename(columns={"production_emissions": "emissions_total"}) - .astype({"year": int}), - ], - ignore_index=True, - ).reset_index(drop=True) - - # Add consumption emissions to main dataframe (keep only the countries of the main dataframe). - # Given that additional GCP regions (e.g. 
"Africa (GCP)") have already been added to co2_df - # (when merging with production_df), all countries from consumption_df should be included in co2_df. - error = "Some countries in consumption_df are not included in co2_df." - assert set(consumption_df["country"]) < set(co2_df["country"]), error - co2_df = pd.merge(co2_df, consumption_df, on=["country", "year"], how="outer") - - # Add population to dataframe. - co2_df = geo.add_population_to_dataframe(df=co2_df, warn_on_missing_countries=False) - - # Add GDP to main dataframe. - co2_df = pd.merge(co2_df, gdp_df, on=["country", "year"], how="left") - - # Add primary energy to main dataframe. - co2_df = pd.merge(co2_df, primary_energy_df, on=["country", "year"], how="left") - - # For convenience, rename columns in land-use change emissions data. - land_use_df = land_use_df.rename( - columns={"emissions": "emissions_from_land_use_change", "quality_flag": "land_use_change_quality_flag"} - ) - - # Land-use change data does not include data for the World. Include it by merging with the global dataset. - land_use_df = pd.concat( - [ - land_use_df, - global_emissions_df.rename( - columns={"global_emissions_from_land_use_change": "emissions_from_land_use_change"} - )[["year", "emissions_from_land_use_change"]] - .dropna() - .assign(**{"country": "World"}), - ], - ignore_index=True, - ).astype({"year": int}) - - # Add land-use change emissions to main dataframe. - co2_df = pd.merge(co2_df, land_use_df, on=["country", "year"], how="outer") - - # Add total emissions (including land-use change) for each country. - co2_df["emissions_total_including_land_use_change"] = ( - co2_df["emissions_total"] + co2_df["emissions_from_land_use_change"] - ) - - # Add region aggregates. - # Aggregate not only emissions data, but also population, gdp and primary energy. - # This way we ensure that custom regions (e.g. "North America (excl. USA)") will have all required data. - aggregations = { - column: "sum" for column in co2_df.columns if column not in ["country", "year", "land_use_change_quality_flag"] - } - for region in REGIONS: - countries_in_region = get_countries_in_region(region=region, region_modifications=REGIONS[region]) - co2_df = geo.add_region_aggregates( - df=co2_df, - region=region, - countries_in_region=countries_in_region, - countries_that_must_have_data=[], - frac_allowed_nans_per_year=0.999, - aggregations=aggregations, - ) - - # Add global emissions and global cumulative emissions columns to main dataframe. - co2_df = pd.merge(co2_df, global_emissions_df.drop(columns="country"), on=["year"], how="left") - - # Ensure main dataframe is sorted (so that cumulative emissions are properly calculated). - co2_df = co2_df.sort_values(["country", "year"]).reset_index(drop=True) - - # Temporarily add certain global emissions variables. - # This is done simply to be able to consider "consumption_emissions" as just another type of emission - # when creating additional variables. - co2_df["global_consumption_emissions"] = co2_df["global_emissions_total"] - co2_df["global_cumulative_consumption_emissions"] = co2_df["global_cumulative_emissions_total"] - - # Add new variables for each source of emissions. - for column in EMISSION_SOURCES + [ - "consumption_emissions", - "emissions_from_land_use_change", - "emissions_total_including_land_use_change", - ]: - # Add per-capita variables. - co2_df[f"{column}_per_capita"] = co2_df[column] / co2_df["population"] - - # Add columns for cumulative emissions. 
- # Rows that had nan emissions will have nan cumulative emissions. - # But nans will not be propagated in the sum. - # This means that countries with some (not all) nans will have the cumulative sum of the informed emissions - # (treating nans as zeros), but will have nan on those rows that were not informed. - co2_df[f"cumulative_{column}"] = co2_df.groupby(["country"])[column].cumsum() - - # Add share of global emissions. - co2_df[f"{column}_as_share_of_global"] = 100 * co2_df[column] / co2_df[f"global_{column}"] - - # Add share of global cumulative emissions. - co2_df[f"cumulative_{column}_as_share_of_global"] = ( - 100 * co2_df[f"cumulative_{column}"] / co2_df[f"global_cumulative_{column}"] - ) - - # Add total emissions per unit energy (in kg of emissions per kWh). - co2_df["emissions_total_per_unit_energy"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 * co2_df["emissions_total"] / (co2_df["primary_energy_consumption"] * TWH_TO_KWH) - ) - - # Add total emissions (including land-use change) per unit energy (in kg of emissions per kWh). - co2_df["emissions_total_including_land_use_change_per_unit_energy"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 - * co2_df["emissions_total_including_land_use_change"] - / (co2_df["primary_energy_consumption"] * TWH_TO_KWH) - ) - - # Add total emissions per unit GDP. - co2_df["emissions_total_per_gdp"] = TONNES_OF_CO2_TO_KG_OF_CO2 * co2_df["emissions_total"] / co2_df["gdp"] - - # Add total emissions (including land-use change) per unit GDP. - co2_df["emissions_total_including_land_use_change_per_gdp"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 * co2_df["emissions_total_including_land_use_change"] / co2_df["gdp"] - ) - - # Add total consumption emissions per unit GDP. - co2_df["consumption_emissions_per_gdp"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 * co2_df["consumption_emissions"] / co2_df["gdp"] - ) - - # Add variable of emissions embedded in trade. - co2_df["traded_emissions"] = co2_df["consumption_emissions"] - co2_df["emissions_total"] - co2_df["pct_traded_emissions"] = 100 * co2_df["traded_emissions"] / co2_df["emissions_total"] - co2_df["traded_emissions_per_capita"] = co2_df["traded_emissions"] / co2_df["population"] - - # Add variable of emissions embedded in trade, including land-use change emissions. - co2_df["traded_emissions_including_land_use_change"] = ( - co2_df["consumption_emissions"] - co2_df["emissions_total_including_land_use_change"] - ) - co2_df["pct_traded_emissions_including_land_use_change"] = ( - 100 * co2_df["traded_emissions_including_land_use_change"] / co2_df["emissions_total_including_land_use_change"] - ) - co2_df["traded_emissions_including_land_use_change_per_capita"] = ( - co2_df["traded_emissions_including_land_use_change"] / co2_df["population"] - ) - - # Remove temporary columns. - co2_df = co2_df.drop(columns=["global_consumption_emissions", "global_cumulative_consumption_emissions"]) - - # Add annual percentage growth of total emissions. - co2_df["pct_growth_emissions_total"] = co2_df.groupby("country")["emissions_total"].pct_change() * 100 - - # Add annual percentage growth of total emissions (including land-use change). - co2_df["pct_growth_emissions_total_including_land_use_change"] = ( - co2_df.groupby("country")["emissions_total_including_land_use_change"].pct_change() * 100 - ) - - # Add annual absolute growth of total emissions. - co2_df["growth_emissions_total"] = co2_df.groupby("country")["emissions_total"].diff() - - # Add annual absolute growth of total emissions (including land-use change). 
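The loop above, and the growth columns that follow below, derive every per-country variable from the same few pandas primitives: per-capita division, grouped cumulative sums, shares of the global series, grouped pct_change/diff, and a final scrub of infinities. A condensed, self-contained sketch of the whole family on toy data:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "country": ["A", "A", "B", "B"],
    "year": [2000, 2001, 2000, 2001],
    "emissions_total": [0.0, 2.0, 6.0, 7.0],
    "population": [1.0, 1.0, 2.0, 2.0],
    "global_emissions_total": [6.0, 9.0, 6.0, 9.0],
}).sort_values(["country", "year"]).reset_index(drop=True)

# Per-capita, cumulative, and share-of-global variables.
df["emissions_total_per_capita"] = df["emissions_total"] / df["population"]
df["cumulative_emissions_total"] = df.groupby("country")["emissions_total"].cumsum()
df["emissions_total_as_share_of_global"] = 100 * df["emissions_total"] / df["global_emissions_total"]

# Relative and absolute growth within each country.
df["pct_growth_emissions_total"] = df.groupby("country")["emissions_total"].pct_change() * 100
df["growth_emissions_total"] = df.groupby("country")["emissions_total"].diff()

# Growth from a zero base year yields inf; scrub infinities to nan as the deleted step does.
value_columns = df.columns.drop(["country", "year"])
df[value_columns] = df[value_columns].replace([np.inf, -np.inf], np.nan)
print(df)
```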
- co2_df["growth_emissions_total_including_land_use_change"] = co2_df.groupby("country")[ - "emissions_total_including_land_use_change" - ].diff() - - # Create variable of population as a share of global population. - co2_df["population_as_share_of_global"] = co2_df["population"] / co2_df["global_population"] * 100 - - # Replace infinity values (for example when calculating growth from zero to non-zero) in the data by nan. - for column in co2_df.drop(columns=["country", "year"]).columns: - co2_df.loc[np.isinf(co2_df[column]), column] = np.nan - - # For special GCP countries/regions (e.g. "Africa (GCP)") we should keep only the original data. - # Therefore, make nan all additional variables for those countries/regions, and keep only GCP's original data. - added_variables = co2_df.drop(columns=["country", "year"] + COLUMNS_THAT_MUST_HAVE_DATA).columns.tolist() - co2_df.loc[(co2_df["country"].str.contains(" (GCP)", regex=False)), added_variables] = np.nan - - # Remove uninformative rows (those that have only data for, say, gdp, but not for variables related to emissions). - co2_df = co2_df.dropna(subset=COLUMNS_THAT_MUST_HAVE_DATA, how="all").reset_index(drop=True) - - return co2_df - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load fossil CO2 emissions data from Meadow. - co2_ds = catalog.Dataset(MEADOW_CO2_DATASET_PATH) - # Load main table from CO2 dataset. - co2_tb = co2_ds[co2_ds.table_names[0]] - # Create a dataframe out of the CO2 table. - co2_df = pd.DataFrame(co2_tb).reset_index() - - # Load global (historical) emissions data from Meadow. - historical_ds = catalog.Dataset(MEADOW_GLOBAL_EMISSIONS_DATASET_PATH) - historical_tb = historical_ds[historical_ds.table_names[0]] - historical_df = pd.DataFrame(historical_tb).reset_index() - - # Load national emissions data from Meadow. - national_ds = catalog.Dataset(MEADOW_NATIONAL_EMISSIONS_DATASET_PATH) - # Load tables for national production-based emissions and consumption-based emissions. - production_tb = national_ds["production_emissions"] - production_df = pd.DataFrame(production_tb).reset_index() - consumption_tb = national_ds["consumption_emissions"] - consumption_df = pd.DataFrame(consumption_tb).reset_index() - - # Load national land-use change emissions from Meadow. - land_use_ds = catalog.Dataset(MEADOW_LAND_USE_EMISSIONS_DATASET_PATH) - land_use_tb = land_use_ds[land_use_ds.table_names[0]] - land_use_df = pd.DataFrame(land_use_tb).reset_index() - - # Load primary energy consumption from garden. - primary_energy_ds = catalog.Dataset(GARDEN_PRIMARY_ENERGY_DATASET_PATH) - # Create a dataframe out of the main table of primary energy. - primary_energy_df = pd.DataFrame(primary_energy_ds[primary_energy_ds.table_names[0]]).reset_index() - - # Load GDP dataset from garden. - gdp_ds = catalog.Dataset(GARDEN_GDP_DATASET_PATH) - # Create a dataframe out of the main table of GDP. - gdp_df = pd.DataFrame(gdp_ds[gdp_ds.table_names[0]]).reset_index() - - # - # Process data. - # - # Prepare fossil CO2 emissions data. - co2_df = prepare_fossil_co2_emissions(co2_df=co2_df) - - # Prepare consumption-based emission data. - consumption_df = prepare_consumption_emissions(consumption_df=consumption_df) - - # Select and rename columns from primary energy data. - primary_energy_df = primary_energy_df[list(PRIMARY_ENERGY_COLUMNS)].rename( - columns=PRIMARY_ENERGY_COLUMNS, errors="raise" - ) - - # Select and rename columns from primary energy data. 
- gdp_df = gdp_df[list(GDP_COLUMNS)].rename(columns=GDP_COLUMNS, errors="raise") - - # Select and rename columns from historical emissions data. - historical_df = historical_df[list(HISTORICAL_EMISSIONS_COLUMNS)].rename( - columns=HISTORICAL_EMISSIONS_COLUMNS, errors="raise" - ) - - # Run sanity checks on input data. - sanity_checks_on_input_data( - production_df=production_df, consumption_df=consumption_df, historical_df=historical_df, co2_df=co2_df - ) - - # For some reason, "International Transport" is included as another country, that only has emissions from oil. - # Extract that data and remove it from the rest of national emissions. - global_emissions_df = extract_global_emissions(co2_df=co2_df, historical_df=historical_df) - - # Harmonize country names in consumption-based emissions data. - consumption_df = ( - geo.harmonize_countries( - df=consumption_df, - countries_file=NATIONAL_EMISSIONS_COUNTRIES_FILE, - warn_on_missing_countries=False, - make_missing_countries_nan=True, - ) - .dropna(subset="country") - .reset_index(drop=True) - ) - - # Harmonize country names in production-based emissions data. - production_df = ( - geo.harmonize_countries( - df=production_df, - countries_file=NATIONAL_EMISSIONS_COUNTRIES_FILE, - warn_on_missing_countries=False, - make_missing_countries_nan=True, - ) - .dropna(subset="country") - .reset_index(drop=True) - ) - - # Harmonize national land-use change emissions data. - land_use_df = ( - geo.harmonize_countries( - df=land_use_df, - countries_file=LAND_USE_EMISSIONS_COUNTRIES_FILE, - warn_on_missing_countries=True, - make_missing_countries_nan=True, - ) - .dropna(subset="country") - .reset_index(drop=True) - ) - - # Harmonize fossil CO2 data. - co2_df = harmonize_co2_data(co2_df=co2_df) - - # Add new variables to main dataframe (consumption-based emissions, emission intensity, per-capita emissions, etc.). - combined_df = combine_data_and_add_variables( - co2_df=co2_df, - production_df=production_df, - consumption_df=consumption_df, - global_emissions_df=global_emissions_df, - land_use_df=land_use_df, - gdp_df=gdp_df, - primary_energy_df=primary_energy_df, - ) - - # Set an appropriate index, ensure there are no rows that only have nan, and sort conveniently. - combined_df = combined_df.set_index(["country", "year"], verify_integrity=True) - combined_df = combined_df.dropna(subset=combined_df.columns, how="all").sort_index().sort_index(axis=1) - - # Run sanity checks on output data. - sanity_checks_on_output_data(combined_df) - - # - # Save outputs. - # - # Create a new garden dataset and use metadata from meadow dataset. - ds_garden = catalog.Dataset.create_empty(dest_dir) - ds_garden.metadata = co2_ds.metadata - # Update metadata using the information in the yaml file. - ds_garden.metadata.update_from_yaml(METADATA_PATH, if_source_exists="replace") - - # Create a table with the combined data. - tb_garden = catalog.Table(combined_df) - # Use metadata from yaml file. - tb_garden.update_metadata_from_yaml(METADATA_PATH, DATASET_NAME) - - # Add combined table to garden dataset and save dataset. 
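The *.countries.json files deleted below are flat raw-name to harmonized-name maps consumed by geo.harmonize_countries. A minimal sketch of that lookup (mapping entries taken from the files below; the real helper can additionally warn on missing or unused names, or nan them out):

```python
import pandas as pd

# A few entries in the shape of the deleted countries.json files.
MAPPING = {"USA": "United States", "Viet Nam": "Vietnam", "Global": "World"}

df = pd.DataFrame({"country": ["USA", "Viet Nam", "Atlantis"], "year": [2000] * 3})

harmonized = df["country"].map(MAPPING)
# Names missing from the mapping surface as NaN; here we report and keep them unchanged.
missing = df.loc[harmonized.isna(), "country"].tolist()
if missing:
    print(f"warning: no harmonized name for {missing}")
df["country"] = harmonized.fillna(df["country"])
print(df)
```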
- ds_garden.add(tb_garden) - ds_garden.save() diff --git a/etl/steps/archive/garden/gcp/2022-11-11/global_carbon_budget_fossil_co2_emissions.countries.json b/etl/steps/archive/garden/gcp/2022-11-11/global_carbon_budget_fossil_co2_emissions.countries.json deleted file mode 100644 index 41128d4002f..00000000000 --- a/etl/steps/archive/garden/gcp/2022-11-11/global_carbon_budget_fossil_co2_emissions.countries.json +++ /dev/null @@ -1,234 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "Andorra": "Andorra", - "Angola": "Angola", - "Anguilla": "Anguilla", - "Antarctica": "Antarctica", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Bonaire, Saint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "British Virgin Islands": "British Virgin Islands", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cape Verde": "Cape Verde", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Christmas Island": "Christmas Island", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo": "Congo", - "Cook Islands": "Cook Islands", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cura\u00e7ao": "Curacao", - "Cyprus": "Cyprus", - "Czech Republic": "Czechia", - "C\u00f4te d'Ivoire": "Cote d'Ivoire", - "Democratic Republic of the Congo": "Democratic Republic of Congo", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Ethiopia": "Ethiopia", - "Faeroe Islands": "Faroe Islands", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Guiana": "French Guiana", - "French Polynesia": "French Polynesia", - "Gabon": "Gabon", - "Gambia": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Global": "World", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hong Kong": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "International Transport": "International transport", - "Iran": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Kosovo": "Kosovo", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Laos": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - 
"Liberia": "Liberia", - "Libya": "Libya", - "Liechtenstein": "Liechtenstein", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macao": "Macao", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mayotte": "Mayotte", - "Mexico": "Mexico", - "Micronesia (Federated States of)": "Micronesia (country)", - "Moldova": "Moldova", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "North Korea": "North Korea", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "Occupied Palestinian Territory": "Palestine", - "Oman": "Oman", - "Pacific Islands (Palau)": "Palau", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Romania": "Romania", - "Russia": "Russia", - "Rwanda": "Rwanda", - "R\u00e9union": "Reunion", - "Saint Helena": "Saint Helena", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Korea": "South Korea", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Swaziland": "Eswatini", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syria": "Syria", - "Taiwan": "Taiwan", - "Tajikistan": "Tajikistan", - "Tanzania": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Turks and Caicos Islands": "Turks and Caicos Islands", - "Tuvalu": "Tuvalu", - "USA": "United States", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela": "Venezuela", - "Viet Nam": "Vietnam", - "Wallis and Futuna Islands": "Wallis and Futuna", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "French Equatorial Africa": "French Equatorial Africa (GCP)", - "French West Africa": "French West Africa (GCP)", - "Kuwaiti Oil Fires": "Kuwaiti Oil Fires (GCP)", - "Leeward Islands": "Leeward Islands (GCP)", - "Panama Canal Zone": 
"Panama Canal Zone (GCP)", - "Ryukyu Islands": "Ryukyu Islands (GCP)", - "St. Kitts-Nevis-Anguilla": "St. Kitts-Nevis-Anguilla (GCP)" -} diff --git a/etl/steps/archive/garden/gcp/2022-11-11/global_carbon_budget_land_use_change_emissions.countries.json b/etl/steps/archive/garden/gcp/2022-11-11/global_carbon_budget_land_use_change_emissions.countries.json deleted file mode 100644 index 7b96d7e11f3..00000000000 --- a/etl/steps/archive/garden/gcp/2022-11-11/global_carbon_budget_land_use_change_emissions.countries.json +++ /dev/null @@ -1,214 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "American Samoa": "American Samoa", - "Andorra": "Andorra", - "Angola": "Angola", - "Anguilla": "Anguilla", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bhutan": "Bhutan", - "Bolivia (Plurinational State of)": "Bolivia", - "Bonaire, Sint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo": "Congo", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cura\u00e7ao": "Curacao", - "Cyprus": "Cyprus", - "Czechia": "Czechia", - "C\u00f4te d'Ivoire": "Cote d'Ivoire", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Falkland Islands (Malvinas)": "Falkland Islands", - "Faroe Islands": "Faroe Islands", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Guiana": "French Guiana", - "Gabon": "Gabon", - "Gambia": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guatemala": "Guatemala", - "Guernsey": "Guernsey", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hong Kong": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran (Islamic Republic of)": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Isle of Man": "Isle of Man", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jersey": "Jersey", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Korea, Republic of": "South Korea", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Lao People's Democratic Republic": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": 
"Libya", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Mali": "Mali", - "Malta": "Malta", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mayotte": "Mayotte", - "Mexico": "Mexico", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "Netherlands Antilles": "Netherlands Antilles", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palestine, State of": "Palestine", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Romania": "Romania", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "R\u00e9union": "Reunion", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Martin (French part)": "Saint Martin (French part)", - "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Svalbard and Jan Mayen": "Svalbard and Jan Mayen", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syrian Arab Republic": "Syria", - "Taiwan, Province of China": "Taiwan", - "Tajikistan": "Tajikistan", - "Tanzania, United Republic of": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Turks and Caicos Islands": "Turks and Caicos Islands", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", - "United States of America": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela (Bolivarian Republic of)": "Venezuela", - "Viet Nam": "Vietnam", - "Virgin Islands (U.S.)": "United States Virgin Islands", - "Western Sahara": "Western Sahara", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "\u00c5land Islands": "Aland Islands", - "Congo, Democratic Republic of the": "Democratic Republic of Congo", - "Korea (Democratic People's Republic of)": "North Korea", - "Moldova, Republic of": "Moldova" -} diff --git a/etl/steps/archive/garden/gcp/2022-11-11/global_carbon_budget_national_emissions.countries.json 
b/etl/steps/archive/garden/gcp/2022-11-11/global_carbon_budget_national_emissions.countries.json deleted file mode 100644 index 38a5c63f3f9..00000000000 --- a/etl/steps/archive/garden/gcp/2022-11-11/global_carbon_budget_national_emissions.countries.json +++ /dev/null @@ -1,234 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Africa": "Africa (GCP)", - "Albania": "Albania", - "Algeria": "Algeria", - "Andorra": "Andorra", - "Angola": "Angola", - "Anguilla": "Anguilla", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Asia": "Asia (GCP)", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Bonaire, Saint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "British Virgin Islands": "British Virgin Islands", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Bunkers": "International transport", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cape Verde": "Cape Verde", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo": "Congo", - "Cook Islands": "Cook Islands", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cura\u00e7ao": "Curacao", - "Cyprus": "Cyprus", - "Czech Republic": "Czechia", - "C\u00f4te d'Ivoire": "Cote d'Ivoire", - "Democratic Republic of the Congo": "Democratic Republic of Congo", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Ethiopia": "Ethiopia", - "EU27": "European Union (27) (GCP)", - "Europe": "Europe (GCP)", - "Faeroe Islands": "Faroe Islands", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Guiana": "French Guiana", - "French Polynesia": "French Polynesia", - "Gabon": "Gabon", - "Gambia": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hong Kong": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Kosovo": "Kosovo", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Laos": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Liechtenstein": "Liechtenstein", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macao": "Macao", - 
"Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mayotte": "Mayotte", - "Mexico": "Mexico", - "Micronesia (Federated States of)": "Micronesia (country)", - "Moldova": "Moldova", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "North America": "North America (GCP)", - "North Korea": "North Korea", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "Occupied Palestinian Territory": "Palestine", - "Oceania": "Oceania (GCP)", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Qatar": "Qatar", - "Romania": "Romania", - "Russia": "Russia", - "Rwanda": "Rwanda", - "R\u00e9union": "Reunion", - "Saint Helena": "Saint Helena", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South America": "South America (GCP)", - "South Korea": "South Korea", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Swaziland": "Eswatini", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syria": "Syria", - "Taiwan": "Taiwan", - "Tajikistan": "Tajikistan", - "Tanzania": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Turks and Caicos Islands": "Turks and Caicos Islands", - "Tuvalu": "Tuvalu", - "USA": "United States", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela": "Venezuela", - "Viet Nam": "Vietnam", - "Wallis and Futuna Islands": "Wallis and Futuna", - "World": "World", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "Central America": "Central America (GCP)", - "Middle East": "Middle East (GCP)", - "Non-OECD": "Non-OECD (GCP)", - "OECD": "OECD (GCP)" -} diff --git a/etl/steps/archive/garden/gcp/2023-04-28/global_carbon_budget.countries.json b/etl/steps/archive/garden/gcp/2023-04-28/global_carbon_budget.countries.json deleted file mode 
100644 index abaab52fe1b..00000000000 --- a/etl/steps/archive/garden/gcp/2023-04-28/global_carbon_budget.countries.json +++ /dev/null @@ -1,278 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Africa": "Africa (GCP)", - "Albania": "Albania", - "Algeria": "Algeria", - "American Samoa": "American Samoa", - "Andorra": "Andorra", - "Angola": "Angola", - "Anguilla": "Anguilla", - "Antarctica": "Antarctica", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Asia": "Asia (GCP)", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Bolivia (Plurinational State of)": "Bolivia", - "Bonaire, Saint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", - "Bonaire, Sint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "British Virgin Islands": "British Virgin Islands", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Bunkers": "International transport", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cape Verde": "Cape Verde", - "Central African Republic": "Central African Republic", - "Central America": "Central America (GCP)", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Christmas Island": "Christmas Island", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo": "Congo", - "Congo, Democratic Republic of the": "Democratic Republic of Congo", - "Cook Islands": "Cook Islands", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cura\u00e7ao": "Curacao", - "Cyprus": "Cyprus", - "Czech Republic": "Czechia", - "Czechia": "Czechia", - "C\u00f4te d'Ivoire": "Cote d'Ivoire", - "Democratic Republic of the Congo": "Democratic Republic of Congo", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "EU27": "European Union (27) (GCP)", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Europe": "Europe (GCP)", - "Faeroe Islands": "Faroe Islands", - "Falkland Islands (Malvinas)": "Falkland Islands", - "Faroe Islands": "Faroe Islands", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Equatorial Africa": "French Equatorial Africa (GCP)", - "French Guiana": "French Guiana", - "French Polynesia": "French Polynesia", - "French West Africa": "French West Africa (GCP)", - "Gabon": "Gabon", - "Gambia": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Global": "World", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guatemala": "Guatemala", - "Guernsey": "Guernsey", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hong Kong": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "International 
Transport": "International transport", - "Iran": "Iran", - "Iran (Islamic Republic of)": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Isle of Man": "Isle of Man", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jersey": "Jersey", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Korea (Democratic People's Republic of)": "North Korea", - "Korea, Republic of": "South Korea", - "Kosovo": "Kosovo", - "Kuwait": "Kuwait", - "Kuwaiti Oil Fires": "Kuwaiti Oil Fires (GCP)", - "Kyrgyzstan": "Kyrgyzstan", - "Lao People's Democratic Republic": "Laos", - "Laos": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Leeward Islands": "Leeward Islands (GCP)", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Liechtenstein": "Liechtenstein", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macao": "Macao", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mayotte": "Mayotte", - "Mexico": "Mexico", - "Micronesia (Federated States of)": "Micronesia (country)", - "Middle East": "Middle East (GCP)", - "Moldova": "Moldova", - "Moldova, Republic of": "Moldova", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "Netherlands Antilles": "Netherlands Antilles", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "Non-OECD": "Non-OECD (GCP)", - "North America": "North America (GCP)", - "North Korea": "North Korea", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "OECD": "OECD (GCP)", - "Occupied Palestinian Territory": "Palestine", - "Oceania": "Oceania (GCP)", - "Oman": "Oman", - "Pacific Islands (Palau)": "Palau", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Palestine, State of": "Palestine", - "Panama": "Panama", - "Panama Canal Zone": "Panama Canal Zone (GCP)", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Romania": "Romania", - "Russia": "Russia", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "Ryukyu Islands": "Ryukyu Islands (GCP)", - "R\u00e9union": "Reunion", - "Saint Helena": "Saint Helena", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Martin (French part)": "Saint Martin (French part)", - "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South America": "South 
America (GCP)", - "South Korea": "South Korea", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "St. Kitts-Nevis-Anguilla": "St. Kitts-Nevis-Anguilla (GCP)", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Svalbard and Jan Mayen": "Svalbard and Jan Mayen", - "Swaziland": "Eswatini", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syria": "Syria", - "Syrian Arab Republic": "Syria", - "Taiwan": "Taiwan", - "Taiwan, Province of China": "Taiwan", - "Tajikistan": "Tajikistan", - "Tanzania": "Tanzania", - "Tanzania, United Republic of": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Turks and Caicos Islands": "Turks and Caicos Islands", - "Tuvalu": "Tuvalu", - "USA": "United States", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", - "United States of America": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela": "Venezuela", - "Venezuela (Bolivarian Republic of)": "Venezuela", - "Viet Nam": "Vietnam", - "Virgin Islands (U.S.)": "United States Virgin Islands", - "Wallis and Futuna Islands": "Wallis and Futuna", - "Western Sahara": "Western Sahara", - "World": "World", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "\u00c5land Islands": "Aland Islands" -} diff --git a/etl/steps/archive/garden/gcp/2023-04-28/global_carbon_budget.excluded_countries.json b/etl/steps/archive/garden/gcp/2023-04-28/global_carbon_budget.excluded_countries.json deleted file mode 100644 index e7a16636a61..00000000000 --- a/etl/steps/archive/garden/gcp/2023-04-28/global_carbon_budget.excluded_countries.json +++ /dev/null @@ -1,4 +0,0 @@ -[ - "KP Annex B", - "Non KP Annex B" -] \ No newline at end of file diff --git a/etl/steps/archive/garden/gcp/2023-04-28/global_carbon_budget.meta.yml b/etl/steps/archive/garden/gcp/2023-04-28/global_carbon_budget.meta.yml deleted file mode 100644 index 42f40dd8a28..00000000000 --- a/etl/steps/archive/garden/gcp/2023-04-28/global_carbon_budget.meta.yml +++ /dev/null @@ -1,485 +0,0 @@ -dataset: - title: Global Carbon Budget (Global Carbon Project, 2023) - description: | - The Global Carbon Budget dataset is available [here](https://globalcarbonbudget.org/archive/). - - Full reference: - Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Gregor, L., Hauck, J., Le Quéré, C., Luijkx, I. T., Olsen, A., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Alkama, R., Arneth, A., Arora, V. K., Bates, N. R., Becker, M., Bellouin, N., Bittig, H. C., Bopp, L., Chevallier, F., Chini, L. P., Cronin, M., Evans, W., Falk, S., Feely, R. A., Gasser, T., Gehlen, M., Gkritzalis, T., Gloege, L., Grassi, G., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jain, A. K., Jersild, A., Kadono, K., Kato, E., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Landschützer, P., Lefèvre, N., Lindsay, K., Liu, J., Liu, Z., Marland, G., Mayot, N., McGrath, M. J., Metzl, N., Monacci, N. M., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K., Ono, T., Palmer, P. 
I., Pan, N., Pierrot, D., Pocock, K., Poulter, B., Resplandy, L., Robertson, E., Rödenbeck, C., Rodriguez, C., Rosan, T. M., Schwinger, J., Séférian, R., Shutler, J. D., Skjelvan, I., Steinhoff, T., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tanhua, T., Tans, P. P., Tian, X., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., Walker, A. P., Wanninkhof, R., Whitehead, C., Willstrand Wranne, A., Wright, R., Yuan, W., Yue, C., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2022, Earth Syst. Sci. Data, 14, 4811-4900, https://doi.org/10.5194/essd-14-4811-2022, 2022. - - Variables include each country, region and World Bank income group's share of the global population; production-based (territorial); and consumption-based (trade-adjusted) carbon dioxide emissions. - - Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. - - Note that consumption-based emissions are not available for all countries, although those without complete data are a small fraction (3%) of the global total. - - Each country's share of the global population is calculated using our population dataset, based on [different sources](https://ourworldindata.org/population-sources). - - Data on global emissions has been converted by Our World in Data from tonnes of carbon to tonnes of carbon dioxide (CO₂) using a conversion factor of 3.664. - - Our World in Data has renamed bunker fuels as "International transport" for improved clarity; this category includes emissions from international aviation and shipping. - Emissions from the Kuwaiti oil fires in 1991 have been included as part of Kuwait's emissions for that year. - - licenses: - - name: Creative Commons Attribution 4.0 International - url: https://zenodo.org/record/7215364 - sources: - - name: Our World in Data based on the Global Carbon Project (2023) - published_by: "Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Gregor, L., Hauck, J., Le Quéré, C., Luijkx, I. T., Olsen, A., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Alkama, R., Arneth, A., Arora, V. K., Bates, N. R., Becker, M., Bellouin, N., Bittig, H. C., Bopp, L., Chevallier, F., Chini, L. P., Cronin, M., Evans, W., Falk, S., Feely, R. A., Gasser, T., Gehlen, M., Gkritzalis, T., Gloege, L., Grassi, G., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jain, A. K., Jersild, A., Kadono, K., Kato, E., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Landschützer, P., Lefèvre, N., Lindsay, K., Liu, J., Liu, Z., Marland, G., Mayot, N., McGrath, M. J., Metzl, N., Monacci, N. M., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K., Ono, T., Palmer, P. I., Pan, N., Pierrot, D., Pocock, K., Poulter, B., Resplandy, L., Robertson, E., Rödenbeck, C., Rodriguez, C., Rosan, T. M., Schwinger, J., Séférian, R., Shutler, J. D., Skjelvan, I., Steinhoff, T., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tanhua, T., Tans, P. P., Tian, X., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., Walker, A.
P., Wanninkhof, R., Whitehead, C., Willstrand Wranne, A., Wright, R., Yuan, W., Yue, C., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2022, Earth Syst. Sci. Data, 14, 4811-4900, https://doi.org/10.5194/essd-14-4811-2022, 2022." - url: https://www.globalcarbonproject.org/ - date_accessed: 2023-04-28 - -tables: - global_carbon_budget: - variables: - consumption_emissions: - title: "Annual consumption-based CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. Data has been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664." - consumption_emissions_as_share_of_global: - title: "Share of global annual CO₂ consumption-based emissions" - unit: "%" - short_unit: "%" - description: "Annual consumption-based emissions of carbon dioxide (CO₂), measured as a percentage of global consumption-based emissions of CO₂ in the same year. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide." - consumption_emissions_per_capita: - title: "Annual consumption-based CO₂ emissions (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes per person. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. Consumption-based CO₂ emissions have been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664." - consumption_emissions_per_gdp: - title: "Annual consumption-based CO₂ emissions per GDP (kg per international-$)" - unit: "kilograms per international-$" - short_unit: "kg/$" - description: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in kilograms per dollar of GDP (2011 international-$). Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. Consumption-based CO₂ emissions have been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664." - cumulative_consumption_emissions: - title: "Cumulative CO₂ consumption-based emissions" - unit: "tonnes" - short_unit: "t" - description: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of data availability, measured in tonnes. 
Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. Consumption-based CO₂ emissions have been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664." - cumulative_consumption_emissions_as_share_of_global: - title: "Share of global cumulative CO₂ consumption-based emissions" - unit: "%" - short_unit: "%" - description: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of data availability, measured as a percentage of global cumulative consumption-based emissions of CO₂ since the first year of data availability. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide." - cumulative_emissions_from_cement: - title: "Cumulative CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from cement since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_cement_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from cement" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from cement since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from cement since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from cement has been calculated by Our World in Data using global CO₂ emissions from cement provided in the Global Carbon Budget dataset." - cumulative_emissions_from_coal: - title: "Cumulative CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from coal since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_coal_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from coal" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from coal since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from coal since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from coal has been calculated by Our World in Data using global CO₂ emissions from coal provided in the Global Carbon Budget dataset." 
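The conversion factor of 3.664 that recurs in these descriptions is the ratio of the molecular mass of CO₂ to the atomic mass of carbon (44.009 / 12.011 ≈ 3.664). A minimal sketch of the conversion, reusing the same factor the deleted garden step defines among its constants below; the helper function and example figure are illustrative:

# Same factor the deleted step defines for converting million tonnes of carbon to tonnes of CO2.
MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e6


def carbon_to_co2(emissions_mtc: float) -> float:
    """Convert emissions in million tonnes of carbon (MtC) into tonnes of CO2."""
    return emissions_mtc * MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2


# For example, 10 MtC of emissions correspond to roughly 3.664e7 tonnes of CO2.
assert round(carbon_to_co2(10)) == 36_640_000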
- cumulative_emissions_from_flaring: - title: "Cumulative CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from flaring since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_flaring_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from flaring" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from flaring since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from flaring since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from flaring has been calculated by Our World in Data using global CO₂ emissions from flaring provided in the Global Carbon Budget dataset." - cumulative_emissions_from_gas: - title: "Cumulative CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from gas since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_gas_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from gas" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from gas since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from gas since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from gas has been calculated by Our World in Data using global CO₂ emissions from gas provided in the Global Carbon Budget dataset." - cumulative_emissions_from_land_use_change: - title: "Cumulative CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from land-use change since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_land_use_change_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from land-use change" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from land-use change since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from land-use change since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset." - cumulative_emissions_from_oil: - title: "Cumulative CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from oil since the first year of data availability, measured in tonnes.
This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_oil_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from oil" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from oil since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from oil since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from oil has been calculated by Our World in Data using global CO₂ emissions from oil provided in the Global Carbon Budget dataset. Global oil emissions include all country emissions as well as emissions from international aviation and shipping." - cumulative_emissions_from_other_industry: - title: "Cumulative CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from other industry sources since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_other_industry_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from other industry" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from other industry sources since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from other industry sources since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from other industry sources has been calculated by Our World in Data using global CO₂ emissions from other industry sources provided in the Global Carbon Budget dataset. Global emissions from other industry sources include all country emissions." - cumulative_emissions_total: - title: "Cumulative CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description: "Total cumulative production-based emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_total_as_share_of_global: - title: "Share of global cumulative CO₂ emissions" - unit: "%" - short_unit: "%" - description: "Total cumulative production-based emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of data availability, measured as a percentage of global total cumulative production-based emissions of CO₂ since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset. Global emissions include all country emissions as well as emissions from international aviation and shipping." 
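Many of the fields described here are derived rather than taken directly from the source: the cumulative series are running sums per country, and the "share of global" series divide by the global total. A hedged pandas sketch of both derivations (column names and figures are illustrative, not the step's actual code):

import pandas as pd

# Illustrative annual emissions in tonnes; the "World" rows carry the global total.
df = pd.DataFrame({
    "country": ["France", "France", "World", "World"],
    "year": [2000, 2001, 2000, 2001],
    "emissions_total": [4.0e8, 3.9e8, 2.5e10, 2.6e10],
})

# Cumulative emissions since the first year of data availability, per country.
df["cumulative_emissions_total"] = df.groupby("country")["emissions_total"].cumsum()

# Share of global cumulative emissions, in percent.
world = df[df["country"] == "World"].set_index("year")["cumulative_emissions_total"]
df["cumulative_emissions_total_as_share_of_global"] = (
    100 * df["cumulative_emissions_total"] / df["year"].map(world)
)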
- cumulative_emissions_total_including_land_use_change: - title: "Cumulative CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "Total cumulative production-based emissions of carbon dioxide (CO₂), including land-use change, since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_total_including_land_use_change_as_share_of_global: - title: "Share of global cumulative CO₂ emissions including land-use change" - unit: "%" - short_unit: "%" - description: "Total cumulative production-based emissions of carbon dioxide (CO₂), including land-use change, since the first year of data availability, measured as a percentage of global total cumulative production-based emissions of CO₂ (including land-use change) since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset. Global emissions include all country emissions as well as emissions from international aviation and shipping." - emissions_from_cement: - title: "Annual CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from cement, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_cement_as_share_of_global: - title: "Share of global annual CO₂ emissions from cement" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from cement, measured as a percentage of global production-based emissions of CO₂ from cement in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from cement has been calculated by Our World in Data using global CO₂ emissions from cement provided in the Global Carbon Budget dataset." - emissions_from_cement_per_capita: - title: "Annual CO₂ emissions from cement (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from cement, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_coal: - title: "Annual CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from coal, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_coal_as_share_of_global: - title: "Share of global annual CO₂ emissions from coal" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from coal, measured as a percentage of global production-based emissions of CO₂ from coal in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from coal has been calculated by Our World in Data using global CO₂ emissions from coal provided in the Global Carbon Budget dataset." 
- emissions_from_coal_per_capita: - title: "Annual CO₂ emissions from coal (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from coal, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_flaring: - title: "Annual CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from flaring, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_flaring_as_share_of_global: - title: "Share of global annual CO₂ emissions from flaring" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from flaring, measured as a percentage of global production-based emissions of CO₂ from flaring in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from flaring has been calculated by Our World in Data using global CO₂ emissions from flaring provided in the Global Carbon Budget dataset." - emissions_from_flaring_per_capita: - title: "Annual CO₂ emissions from flaring (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from flaring, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_gas: - title: "Annual CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from gas, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_gas_as_share_of_global: - title: "Share of global annual CO₂ emissions from gas" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from gas, measured as a percentage of global production-based emissions of CO₂ from gas in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from gas has been calculated by Our World in Data using global CO₂ emissions from gas provided in the Global Carbon Budget dataset. Global gas emissions include all country emissions as well as emissions from international aviation and shipping." - emissions_from_gas_per_capita: - title: "Annual CO₂ emissions from gas (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from gas, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_land_use_change: - title: "Annual CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." 
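The per-capita variants simply divide national emissions by population, which the step takes from an auxiliary population dataset. A minimal sketch under that assumption (the figures below are illustrative):

import pandas as pd

# Illustrative merged table: emissions in tonnes plus a population column.
df = pd.DataFrame({
    "country": ["Spain", "Japan"],
    "year": [2021, 2021],
    "emissions_from_coal": [3.1e7, 4.4e8],
    "population": [47.4e6, 125.7e6],
})

# Tonnes of CO2 per person.
df["emissions_from_coal_per_capita"] = df["emissions_from_coal"] / df["population"]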
- emissions_from_land_use_change_as_share_of_global: - title: "Share of global annual CO₂ emissions from land-use change" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from land-use change, measured as a percentage of global production-based emissions of CO₂ from land-use change in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset." - emissions_from_land_use_change_per_capita: - title: "Annual CO₂ emissions from land-use change per capita" - unit: "tonnes of CO₂ per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_oil: - title: "Annual CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from oil, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_oil_as_share_of_global: - title: "Share of global annual CO₂ emissions from oil" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from oil, measured as a percentage of global production-based emissions of CO₂ from oil in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from oil has been calculated by Our World in Data using global CO₂ emissions from oil provided in the Global Carbon Budget dataset. Global oil emissions include all country emissions as well as emissions from international aviation and shipping." - emissions_from_oil_per_capita: - title: "Annual CO₂ emissions from oil (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from oil, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_other_industry: - title: "Annual CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_other_industry_as_share_of_global: - title: "Share of global annual CO₂ emissions from other industry" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from other industry sources, measured as a percentage of global production-based emissions of CO₂ from other industry sources in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from other industry sources has been calculated by Our World in Data using global CO₂ emissions from other industry sources provided in the Global Carbon Budget dataset. Global emissions from other industry sources include all country emissions."
- emissions_from_other_industry_per_capita: - title: "Annual CO₂ emissions from other industry (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total: - title: "Annual CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_as_share_of_global: - title: "Share of global annual CO₂ emissions" - unit: "%" - short_unit: "%" - description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured as a percentage of global production-based emissions of CO₂ in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset. Global emissions include all country emissions as well as emissions from international aviation and shipping." - emissions_total_including_land_use_change: - title: "Annual CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "Annual total production-based emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_including_land_use_change_as_share_of_global: - title: "Share of global annual CO₂ emissions including land-use change" - unit: "%" - short_unit: "%" - description: "Annual total production-based emissions of carbon dioxide (CO₂), including land-use change, measured as a percentage of global total production-based emissions of CO₂ in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset. Global emissions include all country emissions as well as emissions from international aviation and shipping." - emissions_total_including_land_use_change_per_capita: - title: "Annual CO₂ emissions including land-use change per capita" - unit: "tonnes of CO₂ per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_including_land_use_change_per_gdp: - title: "Annual CO₂ emissions including land-use change per GDP" - unit: "kilograms per international-$" - short_unit: "kg/$" - description: "Annual total production-based emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per dollar of GDP (2011 international-$). Production-based emissions are based on territorial emissions, which do not account for emissions embedded in traded goods." 
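The per-GDP intensities are expressed in kilograms of CO₂ per international-$, so emissions in tonnes have to be scaled to kilograms first; the deleted step keeps a tonnes-to-kg conversion constant for exactly this purpose. A sketch of the arithmetic (the constant name here is assumed, not copied from the step):

TONNES_OF_CO2_TO_KG_OF_CO2 = 1000  # assumed name for the step's tonnes-to-kg constant


def emissions_per_gdp(emissions_tonnes: float, gdp_int_dollars: float) -> float:
    """Emission intensity in kg of CO2 per 2011 international-$."""
    return emissions_tonnes * TONNES_OF_CO2_TO_KG_OF_CO2 / gdp_int_dollars


# For example, 4e8 tonnes of CO2 against a GDP of 2e12 international-$ gives 0.2 kg/$.
assert emissions_per_gdp(4e8, 2e12) == 0.2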
- emissions_total_including_land_use_change_per_unit_energy: - title: "Annual CO₂ emissions including land-use change per unit energy" - unit: "kilograms per kilowatt-hour" - short_unit: "kg/kWh" - description: "Annual total production-based emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per kilowatt-hour of primary energy consumption. Production-based emissions are based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_per_capita: - title: "Annual CO₂ emissions (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_per_gdp: - title: "Annual CO₂ emissions per GDP (kg per international-$)" - unit: "kilograms per international-$" - short_unit: "kg/$" - description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per dollar of GDP (2011 international-$). Production-based emissions are based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_per_unit_energy: - title: "Annual CO₂ emissions per unit energy (kg per kilowatt-hour)" - unit: "kilograms per kilowatt-hour" - short_unit: "kg/kWh" - description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per kilowatt-hour of primary energy consumption. Production-based emissions are based on territorial emissions, which do not account for emissions embedded in traded goods." - gdp: - title: "GDP" - unit: "2011 international-$" - short_unit: "$" - description: >- - Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over time - (inflation) and price differences between countries. 
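The per-unit-energy intensities combine emissions with primary energy consumption in terawatt-hours, converting TWh to kWh with a factor of 1e9 (the deleted step defines this as TWH_TO_KWH). A sketch with illustrative figures:

TWH_TO_KWH = 1e9  # same conversion the deleted step defines


def emissions_per_unit_energy(emissions_tonnes: float, energy_twh: float) -> float:
    """Emission intensity in kg of CO2 per kWh of primary energy."""
    return emissions_tonnes * 1000 / (energy_twh * TWH_TO_KWH)


# For example, 3.5e9 tonnes of CO2 against 25,000 TWh of primary energy gives 0.14 kg/kWh.
assert emissions_per_unit_energy(3.5e9, 25_000) == 0.14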
- global_cumulative_emissions_from_cement: - title: "Global cumulative CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_coal: - title: "Global cumulative CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_flaring: - title: "Global cumulative CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_gas: - title: "Global cumulative CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_land_use_change: - title: "Global cumulative CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_oil: - title: "Global cumulative CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_other_industry: - title: "Global cumulative CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_total: - title: "Global cumulative CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_total_including_land_use_change: - title: "Global cumulative CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_cement: - title: "Global annual CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_coal: - title: "Global annual CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_flaring: - title: "Global annual CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_gas: - title: "Global annual CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_international_transport: - title: "Global annual CO₂ emissions from international transport" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_land_use_change: - title: "Global annual CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_oil: - title: "Global annual CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_other_industry: - title: "Global annual CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_total: - title: "Global annual CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_total_including_land_use_change: - title: "Global annual CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "" - global_population: - title: "Global population" - unit: "persons" - short_unit: "persons" - description: "World population." - growth_emissions_total: - title: "Annual CO₂ emissions growth (abs)" - unit: "tonnes" - short_unit: "t" - description: "Annual growth in total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." 
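These growth variables are year-on-year differences per country, in absolute tonnes and in percent. One way they could be derived with pandas (column names and data are illustrative, not the step's actual code):

import pandas as pd

df = pd.DataFrame({
    "country": ["India"] * 3,
    "year": [2019, 2020, 2021],
    "emissions_total": [2.6e9, 2.4e9, 2.7e9],
})

# Absolute and percentage year-on-year growth, computed within each country.
df = df.sort_values(["country", "year"])
grouped = df.groupby("country")["emissions_total"]
df["growth_emissions_total"] = grouped.diff()
df["pct_growth_emissions_total"] = grouped.pct_change() * 100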
- growth_emissions_total_including_land_use_change: - title: "Growth rate of emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "Annual growth in total production-based emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - land_use_change_quality_flag: - title: "Land-use change quality flag" - unit: "" - short_unit: "" - description: "Carbon dioxide emissions from land-use change vary significantly in their degree of certainty. The quality flag is 1 if the different estimates of land-use change emissions considered by the Global Carbon Project are in reasonable agreement. Otherwise the quality flag is 0. The flag is also set to zero if not all estimates have data for a given country. For a more detailed definition, see the original paper." - pct_growth_emissions_total: - title: "Annual CO₂ emissions growth (%)" - unit: "%" - short_unit: "%" - description: "Annual percentage growth in total production-based emissions of carbon dioxide (CO₂), excluding land-use change. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - pct_growth_emissions_total_including_land_use_change: - title: "Growth rate of emissions including land-use change (%)" - unit: "%" - short_unit: "%" - description: "Annual percentage growth in total production-based emissions of carbon dioxide (CO₂), including land-use change. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - pct_traded_emissions: - title: "Share of annual CO₂ emissions embedded in trade" - unit: "%" - short_unit: "%" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured as a percentage of production-based emissions of CO₂. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." - pct_traded_emissions_including_land_use_change: - title: "Traded emissions including land-use change (%)" - unit: "%" - short_unit: "%" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, including land-use change, measured as a percentage of production-based emissions of CO₂, including land-use change. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." - population: - title: "Population" - unit: "persons" - short_unit: "persons" - description: "" - population_as_share_of_global: - title: "Share of population" - unit: "%" - short_unit: "%" - description: "Population, measured as a percentage of global total population in the same year." - primary_energy_consumption: - title: "Primary energy consumption" - unit: "terawatt-hours" - short_unit: "TWh" - description: "Primary energy consumption, measured in terawatt-hours per year." - traded_emissions: - title: "Annual CO₂ emissions embedded in trade" - unit: "tonnes" - short_unit: "t" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy.
A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." - traded_emissions_including_land_use_change: - title: "Traded emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, including land-use change, measured in tonnes. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." - traded_emissions_including_land_use_change_per_capita: - title: "Traded emissions including land-use change per capita" - unit: "tonnes of CO₂ per capita" - short_unit: "t" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, including land-use change, measured in tonnes per person. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." - traded_emissions_per_capita: - title: "Annual CO₂ emissions embedded in trade (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes per person. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." diff --git a/etl/steps/archive/garden/gcp/2023-04-28/global_carbon_budget.py b/etl/steps/archive/garden/gcp/2023-04-28/global_carbon_budget.py deleted file mode 100644 index f6fb09796e3..00000000000 --- a/etl/steps/archive/garden/gcp/2023-04-28/global_carbon_budget.py +++ /dev/null @@ -1,945 +0,0 @@ -"""This step creates the Global Carbon Budget (GCB) dataset, by the Global Carbon Project (GCP). - -It harmonizes and further processes meadow data, and uses the following auxiliary datasets: -- GGDC's Maddison dataset on GDP, used to calculate emissions per GDP. -- Primary Energy Consumption (mix of sources from the 'energy' namespace) to calculate emissions per unit energy. -- Population (mix of sources), to calculate emissions per capita. -- Regions (mix of sources), to generate aggregates for different continents. -- World Bank's income groups, to generate aggregates for different income groups. - -""" - -import numpy as np -import pandas as pd -from owid.catalog import Dataset, Table -from owid.datautils import dataframes -from structlog import get_logger - -from etl.data_helpers import geo -from etl.helpers import PathFinder, create_dataset - -log = get_logger() - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Expected outliers in consumption-based emissions (with negative emissions in the original data, which will be removed). -OUTLIERS_IN_CONSUMPTION_DF = [ - ("Panama", 2003), - ("Panama", 2004), - ("Panama", 2005), - ("Panama", 2006), - ("Panama", 2011), - ("Panama", 2012), - ("Panama", 2013), - ("Venezuela", 2018), -] - -# Label used for international transport (emissions from oil in bunker fuels), included as a country in the -# fossil CO2 emissions dataset.
-INTERNATIONAL_TRANSPORT_LABEL = "International Transport" - -# Regions and income groups to create by aggregating contributions from member countries. -# In the following dictionary, if nothing is stated, the region is supposed to be a default continent/income group. -# Otherwise, the dictionary can have "additional_regions", "excluded_regions", "additional_members", and -# "excluded_members". The aggregates will be calculated on the resulting countries. -REGIONS = { - # Default continents. - "Africa": {}, - "Asia": {}, - "Europe": {}, - "European Union (27)": {}, - "North America": {}, - "Oceania": {}, - "South America": {}, - # Income groups. - "Low-income countries": {}, - "Upper-middle-income countries": {}, - "Lower-middle-income countries": {}, - "High-income countries": {}, - # Additional composite regions. - "Asia (excl. China and India)": { - "additional_regions": ["Asia"], - "excluded_members": ["China", "India"], - }, - "Europe (excl. EU-27)": {"additional_regions": ["Europe"], "excluded_regions": ["European Union (27)"]}, - "Europe (excl. EU-28)": { - "additional_regions": ["Europe"], - "excluded_regions": ["European Union (27)"], - "excluded_members": ["United Kingdom"], - }, - "European Union (28)": { - "additional_regions": ["European Union (27)"], - "additional_members": ["United Kingdom"], - }, - "North America (excl. USA)": { - "additional_regions": ["North America"], - "excluded_members": ["United States"], - }, -} - -# Columns to use from GCB fossil CO2 emissions data and how to rename them. -CO2_COLUMNS = { - "country": "country", - "year": "year", - "cement": "emissions_from_cement", - "coal": "emissions_from_coal", - "flaring": "emissions_from_flaring", - "gas": "emissions_from_gas", - "oil": "emissions_from_oil", - "other": "emissions_from_other_industry", - "total": "emissions_total", -} - -# List all sources of emissions considered. -EMISSION_SOURCES = [column for column in CO2_COLUMNS.values() if column not in ["country", "year"]] - -# Columns to use from primary energy consumption data and how to rename them. -PRIMARY_ENERGY_COLUMNS = { - "country": "country", - "year": "year", - "primary_energy_consumption__twh": "primary_energy_consumption", -} - -# Columns to use from GDP data and how to rename them. -GDP_COLUMNS = { - "country": "country", - "year": "year", - "gdp": "gdp", -} - -# Columns to use from historical emissions data and how to rename them. -HISTORICAL_EMISSIONS_COLUMNS = { - "country": "country", - "year": "year", - # Global fossil emissions are used only for sanity checks. - "global_fossil_emissions": "global_fossil_emissions", - "global_land_use_change_emissions": "global_emissions_from_land_use_change", -} - -# Columns to use from consumption-based emissions data and how to rename them. -CONSUMPTION_EMISSIONS_COLUMNS = { - "country": "country", - "year": "year", - "consumption_emissions": "consumption_emissions", -} - -# Conversion from terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 - -# Conversion factor to change from billion tonnes of carbon to tonnes of CO2. -BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e9 - -# Conversion factor to change from million tonnes of carbon to tonnes of CO2. -MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e6 - -# Conversion from million tonnes of CO2 to tonnes of CO2. -MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 = 1e6 - -# Conversion from tonnes of CO2 to kg of CO2 (used for emissions per GDP and per unit energy).
-TONNES_OF_CO2_TO_KG_OF_CO2 = 1000 - -# In order to remove uninformative rows, keep only rows where at least one of the following columns has data. -# All other columns are either derived variables, or global variables, or auxiliary variables from other datasets. -COLUMNS_THAT_MUST_HAVE_DATA = [ - "emissions_from_cement", - "emissions_from_coal", - "emissions_from_flaring", - "emissions_from_gas", - "emissions_from_oil", - "emissions_from_other_industry", - "emissions_total", - "consumption_emissions", - "emissions_from_land_use_change", - # 'land_use_change_quality_flag', -] - - -def sanity_checks_on_input_data( - df_production: pd.DataFrame, df_consumption: pd.DataFrame, df_historical: pd.DataFrame, df_co2: pd.DataFrame -) -> None: - """Run sanity checks on input data files. - - These checks should be used prior to country harmonization, but after basic processing of the dataframes. - - Parameters - ---------- - df_production : pd.DataFrame - Production-based emissions from GCP's official national emissions dataset (excel file). - df_consumption : pd.DataFrame - Consumption-based emissions from GCP's official national emissions dataset (excel file). - df_historical : pd.DataFrame - Historical emissions from GCP's official global emissions dataset (excel file). - df_co2 : pd.DataFrame - Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). - - """ - df_production = df_production.copy() - df_consumption = df_consumption.copy() - df_historical = df_historical.copy() - df_co2 = df_co2.copy() - - # In the original data, Bunkers was included in the national data file as another country. - # But I suppose it should be considered as another kind of global emission. - # In fact, bunker emissions should coincide for production and consumption emissions. - global_bunkers_emissions = ( - df_production[df_production["country"] == "Bunkers"][["year", "production_emissions"]] - .reset_index(drop=True) - .rename(columns={"production_emissions": "global_bunker_emissions"}, errors="raise") - ) - - # Check that we get exactly the same array of bunker emissions from the consumption emissions dataframe - # (on years where there is data for bunker emissions in both datasets). - comparison = pd.merge( - global_bunkers_emissions, - df_consumption[df_consumption["country"] == "Bunkers"][["year", "consumption_emissions"]] - .reset_index(drop=True) - .rename(columns={"consumption_emissions": "global_bunker_emissions"}, errors="raise"), - how="inner", - on="year", - suffixes=("", "_check"), - ) - - error = "Bunker emissions were expected to coincide in production and consumption emissions dataframes." - assert (comparison["global_bunker_emissions"] == comparison["global_bunker_emissions_check"]).all(), error - - # Check that all production-based emissions are positive. - error = "There are negative emissions in df_production (from the additional variables dataset)." - assert (df_production.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error - - # Check that all production-based emissions from the fossil CO2 dataset are positive. - error = "There are negative emissions in df_co2 (from the fossil CO2 dataset)." - assert (df_co2.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error - - # Check that all consumption-based emissions are positive. - error = "There are negative emissions in df_consumption (from the national emissions dataset)."
- assert (df_consumption.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error - - # Check that, for the World, production emissions coincide with consumption emissions (on common years). - error = "Production and consumption emissions for the world were expected to be identical." - comparison = pd.merge( - df_production[df_production["country"] == "World"].reset_index(drop=True), - df_consumption[df_consumption["country"] == "World"].reset_index(drop=True), - how="inner", - on="year", - ) - assert (comparison["production_emissions"] == comparison["consumption_emissions"]).all(), error - - # Check that production emissions for the World coincide with global (historical) emissions (on common years). - comparison = pd.merge( - df_production[df_production["country"] == "World"][["year", "production_emissions"]].reset_index(drop=True), - df_historical[["year", "global_fossil_emissions"]], - how="inner", - on="year", - ) - error = "Production emissions for the world were expected to coincide with global fossil emissions." - assert ( - abs(comparison["production_emissions"] - comparison["global_fossil_emissions"]) - / (comparison["global_fossil_emissions"]) - < 0.001 - ).all(), error - - # Check that emissions in df_production (emissions from the national excel file) coincide with emissions in df_co2 - # (from the Fossil CO2 emissions csv file). - # Given that country names have not yet been harmonized, rename the only entities whose names differ between the two datasets. - comparison = pd.merge( - df_co2[["country", "year", "emissions_total"]], - df_production.replace({"Bunkers": "International Transport", "World": "Global"}), - on=["country", "year"], - how="inner", - ).dropna(subset=["emissions_total", "production_emissions"], how="any") - # Since we included the emissions from the Kuwaiti oil fires in Kuwait (and they are not included in df_production), - # omit that row in the comparison. - comparison = comparison.drop( - comparison[(comparison["country"] == "Kuwait") & (comparison["year"] == 1991)].index - ).reset_index(drop=True) - - error = "Production emissions from the national file were expected to coincide with the Fossil CO2 emissions dataset." - assert ( - ( - 100 - * abs(comparison["production_emissions"] - comparison["emissions_total"]) - / (comparison["emissions_total"]) - ).fillna(0) - < 0.1 - ).all(), error - - -def sanity_checks_on_output_data(combined_df: pd.DataFrame) -> None: - """Run sanity checks on output data. - - These checks should be run on the very final output dataframe (with an index) prior to storing it as a table. - - Parameters - ---------- - combined_df : pd.DataFrame - Combination of all input dataframes, after processing, harmonization, and addition of variables. - - """ - combined_df = combined_df.reset_index() - error = "All variables (except traded emissions, growth, and land-use change) should be >= 0 or nan." - positive_variables = [ - col - for col in combined_df.columns - if col != "country" - if "traded" not in col - if "growth" not in col - if "land_use" not in col - ] - assert (combined_df[positive_variables].fillna(0) >= 0).all().all(), error - - error = "Production emissions as a share of global emissions should be 100% for 'World' (within 2% error)." - assert combined_df[ - (combined_df["country"] == "World") & (abs(combined_df["emissions_total_as_share_of_global"] - 100) > 2) - ].empty, error - - error = "Consumption emissions as a share of global emissions should be 100% for 'World' (within 2% error)."
- assert combined_df[ - (combined_df["country"] == "World") & (abs(combined_df["consumption_emissions_as_share_of_global"] - 100) > 2) - ].empty, error - - error = "Population as a share of global population should be 100% for 'World'." - assert combined_df[ - (combined_df["country"] == "World") & (combined_df["population_as_share_of_global"].fillna(100) != 100) - ].empty, error - - error = "All shares of global emissions should be smaller than 100% (within 2% error)." - share_variables = [col for col in combined_df.columns if "share" in col] - assert (combined_df[share_variables].fillna(0) <= 102).all().all(), error - - # Check that cumulative variables are monotonically increasing. - # Firstly, list columns of cumulative variables, but ignoring cumulative columns as a share of global - # (since they are not necessarily monotonic) and land-use change (which can be negative). - cumulative_cols = [ - col for col in combined_df.columns if "cumulative" in col if "share" not in col if "land_use" not in col - ] - # Using ".is_monotonic_increasing" can fail when differences between consecutive numbers are very small. - # Instead, sort data backwards in time, and check that consecutive values of cumulative variables always have - # a percentage change that is smaller than, say, 0.1%. - error = ( - "Cumulative variables (not given as a share of global) should be monotonically increasing (except when " - "including land-use change emissions, which can be negative)." - ) - assert ( - combined_df.sort_values("year", ascending=False) - .groupby("country") - .agg({col: lambda x: ((x.pct_change().dropna() * 100) <= 0.1).all() for col in cumulative_cols}) - .all() - .all() - ), error - - error = ( - "Production emissions as a share of global production emissions for the World should always be 100% " - "(or larger than 98%, given small discrepancies)." - ) - # Consumption emissions as a share of global production emissions is allowed to be smaller than 100%. - share_variables = [col for col in combined_df.columns if "share" in col if "consumption" not in col] - assert (combined_df[combined_df["country"] == "World"][share_variables].fillna(100) > 98).all().all(), error - - error = "Traded emissions for the World should be close to zero (within 2% error)." - world_mask = combined_df["country"] == "World" - assert ( - abs( - 100 - * combined_df[world_mask]["traded_emissions"].fillna(0) - / combined_df[world_mask]["emissions_total"].fillna(1) - ) - < 2 - ).all(), error - - -def prepare_fossil_co2_emissions(df_co2: pd.DataFrame) -> pd.DataFrame: - """Prepare Fossil CO2 emissions data (basic processing).""" - # Select and rename columns from fossil CO2 data. - df_co2 = df_co2[list(CO2_COLUMNS)].rename(columns=CO2_COLUMNS, errors="raise") - - # Ensure all emissions are given in tonnes of CO2. - df_co2[EMISSION_SOURCES] *= MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 - - #################################################################################################################### - # NOTE: For certain years, column "emissions_from_other_industry" is not informed for "World" but it is informed - # for some countries (namely China and US). - # This causes the cumulative emissions from other industry as share of global for those countries to become larger - # than 100%. - # This temporary solution fixes the issue: We aggregate the data for China and US on those years when the world's - # data is missing (without touching other years or other columns).
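- # (Illustrative, with hypothetical numbers: if in some year the "Global" row has nan for
- # emissions_from_other_industry while China reports 5 and the US reports 7 million tonnes, the code below
- # writes their sum, 12 million tonnes, into the "Global" row for that year, leaving other rows and columns
- # untouched.)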
- # Firstly, list of years for which the world has no data for emissions_from_other_industry. - world_missing_years = ( - df_co2[(df_co2["country"] == "Global") & (df_co2["emissions_from_other_industry"].isnull())]["year"] - .unique() - .tolist() # type: ignore - ) - # Data that needs to be aggregated. - data_missing_in_world = df_co2[ - df_co2["year"].isin(world_missing_years) & (df_co2["emissions_from_other_industry"].notnull()) - ] - # Check that there is indeed data to be aggregated (that is missing for the World). - error = ( - "Expected emissions_from_other_industry to be null for the world but not null for certain countries " - "(which was an issue in the original fossil CO2 data). The issue may be fixed and the code can be simplified." - ) - assert len(data_missing_in_world) > 0, error - # Create a dataframe of aggregate data for the World, on those years when it's missing. - aggregated_missing_data = ( - data_missing_in_world.groupby("year") - .agg({"emissions_from_other_industry": "sum"}) - .reset_index() - .assign(**{"country": "Global"}) - ) - # Combine the new dataframe of aggregate data with the main dataframe. - df_co2 = dataframes.combine_two_overlapping_dataframes( - df1=df_co2, df2=aggregated_missing_data, index_columns=["country", "year"], keep_column_order=True - ) - #################################################################################################################### - - # We add the emissions from "Kuwaiti Oil Fires" (which is also included as a separate country) as part of the - # emissions of Kuwait. This ensures that they will be included in region aggregates. - error = "'Kuwaiti Oil Fires' was expected to only have not-null data for 1991." - assert df_co2[ - (df_co2["country"] == "Kuwaiti Oil Fires") - & (df_co2["emissions_total"].notnull()) - & (df_co2["emissions_total"] != 0) - ]["year"].tolist() == [1991], error - - df_co2.loc[(df_co2["country"] == "Kuwait") & (df_co2["year"] == 1991), EMISSION_SOURCES] = ( - df_co2[(df_co2["country"] == "Kuwaiti Oil Fires") & (df_co2["year"] == 1991)][EMISSION_SOURCES].values - + df_co2[(df_co2["country"] == "Kuwait") & (df_co2["year"] == 1991)][EMISSION_SOURCES].values - ) - - # Check that "emissions_total" agrees with the sum of emissions from individual sources. - error = "The sum of all emissions should add up to total emissions (within 1%)." - assert ( - abs( - df_co2.drop(columns=["country", "year", "emissions_total"]).sum(axis=1) - - df_co2["emissions_total"].fillna(0) - ) - / (df_co2["emissions_total"].fillna(0) + 1e-7) - < 1e-2 - ).all(), error - - # Many rows have zero total emissions, but actually the individual sources are nan. - # Total emissions in those cases should be nan, instead of zero. - no_individual_emissions = df_co2.drop(columns=["country", "year", "emissions_total"]).isnull().all(axis=1) - df_co2.loc[no_individual_emissions, "emissions_total"] = np.nan - - return df_co2 - - -def prepare_consumption_emissions(df_consumption: pd.DataFrame) -> pd.DataFrame: - """Prepare consumption-based emissions data (basic processing).""" - # Select and rename columns. - df_consumption = df_consumption[list(CONSUMPTION_EMISSIONS_COLUMNS)].rename( - columns=CONSUMPTION_EMISSIONS_COLUMNS, errors="raise" - ) - - # Convert units from megatonnes of carbon per year to tonnes of CO2 per year.
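- # (The 3.664 factor is the molar-mass ratio of CO2 to carbon, roughly 44.01 / 12.01, so e.g. 100 million
- # tonnes of carbon correspond to about 3.664e8 tonnes of CO2.)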
- for column in df_consumption.drop(columns=["country", "year"]).columns: - df_consumption[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - # List indexes of rows in df_consumption corresponding to outliers (defined above in OUTLIERS_IN_CONSUMPTION_DF). - outlier_indexes = [ - df_consumption[(df_consumption["country"] == outlier[0]) & (df_consumption["year"] == outlier[1])].index.item() - for outlier in OUTLIERS_IN_CONSUMPTION_DF - ] - - error = ( - "Outliers were expected to have negative consumption emissions. " - "Maybe outliers have been fixed (and should be removed from the code)." - ) - assert (df_consumption.loc[outlier_indexes]["consumption_emissions"] < 0).all(), error - - # Remove outliers. - df_consumption = df_consumption.drop(outlier_indexes).reset_index(drop=True) - - return df_consumption - - -def prepare_production_emissions(df_production: pd.DataFrame) -> pd.DataFrame: - """Prepare production-based emissions data (basic processing).""" - # Convert units from megatonnes of carbon per year to tonnes of CO2 per year. - for column in df_production.drop(columns=["country", "year"]).columns: - df_production[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - return df_production - - -def prepare_land_use_emissions(df_land_use: pd.DataFrame) -> pd.DataFrame: - """Prepare land-use change emissions data (basic processing).""" - # Convert units from megatonnes of carbon per year to tonnes of CO2 per year. - df_land_use["emissions"] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - return df_land_use - - -def prepare_historical_emissions(df_historical: pd.DataFrame) -> pd.DataFrame: - """Prepare historical emissions data.""" - # Select and rename columns from historical emissions data. - df_historical = df_historical[list(HISTORICAL_EMISSIONS_COLUMNS)].rename( - columns=HISTORICAL_EMISSIONS_COLUMNS, errors="raise" - ) - - # Convert units from gigatonnes of carbon per year to tonnes of CO2 per year. - for column in df_historical.drop(columns=["country", "year"]).columns: - df_historical[column] *= BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - return df_historical - - -def extract_global_emissions(df_co2: pd.DataFrame, df_historical: pd.DataFrame) -> pd.DataFrame: - """Extract World emissions by combining data from the Fossil CO2 emissions and the global emissions dataset. - - The resulting global emissions data includes bunker and land-use change emissions. - - NOTE: This function has to be used after selecting and renaming columns in df_co2, but before harmonizing country - names in df_co2 (so that "International Transport" is still listed as a country). - - Parameters - ---------- - df_co2 : pd.DataFrame - Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). - df_historical : pd.DataFrame - Historical emissions from GCP's official global emissions dataset (excel file). - - Returns - ------- - global_emissions : pd.DataFrame - World emissions. - - """ - # For some reason, "International Transport" is included as another country that only has emissions from oil. - # We separate it as another variable (only given at the global level). - global_transport = df_co2[df_co2["country"] == INTERNATIONAL_TRANSPORT_LABEL].reset_index(drop=True) - - # Check that total emissions for international transport coincide with oil emissions. - error = "Total emissions from international transport do not coincide with oil emissions."
- assert all((global_transport["emissions_from_oil"] - global_transport["emissions_total"]).dropna() == 0), error - - # Therefore, we can keep only one column for international transport emissions. - global_transport = ( - global_transport[["year", "emissions_from_oil"]] - .dropna() - .rename(columns={"emissions_from_oil": "global_emissions_from_international_transport"}, errors="raise") - ) - - # Create a new dataframe of global emissions. - global_emissions = ( - df_co2[df_co2["country"].isin(["Global", "World"])][["year"] + EMISSION_SOURCES] - .rename(columns={column: f"global_{column}" for column in EMISSION_SOURCES}, errors="raise") - .sort_values("year") - .reset_index(drop=True) - ) - - # Add bunker fuels to global emissions. - global_emissions = pd.merge(global_emissions, global_transport, on=["year"], how="outer") - - # Add historical land-use change emissions to dataframe of global emissions. - global_emissions = pd.merge( - global_emissions, df_historical[["year", "global_emissions_from_land_use_change"]], how="left", on="year" - ) - - # Add variable of total emissions including fossil fuels and land use change. - global_emissions["global_emissions_total_including_land_use_change"] = ( - global_emissions["global_emissions_total"] + global_emissions["global_emissions_from_land_use_change"] - ) - - # Calculate global cumulative emissions. - for column in EMISSION_SOURCES + ["emissions_from_land_use_change", "emissions_total_including_land_use_change"]: - global_emissions[f"global_cumulative_{column}"] = global_emissions[f"global_{column}"].cumsum() - - # Add a country column. - global_emissions["country"] = "World" - - # Add global population. - global_emissions = geo.add_population_to_dataframe(df=global_emissions, population_col="global_population") - - return global_emissions - - -def harmonize_country_names(df: pd.DataFrame) -> pd.DataFrame: - """Harmonize country names, and fix known issues with certain regions. - - Parameters - ---------- - df : pd.DataFrame - Emissions data (either from the fossil CO2, the production-based, consumption-based, or land-use emissions - datasets). - - Returns - ------- - df : pd.DataFrame - Emissions data after harmonizing country names. - - """ - # Harmonize country names. - df = geo.harmonize_countries( - df=df, - countries_file=paths.country_mapping_path, - excluded_countries_file=paths.excluded_countries_path, - warn_on_missing_countries=True, - warn_on_unused_countries=False, - make_missing_countries_nan=False, - warn_on_unknown_excluded_countries=False, - ) - - # Check that there is only one data point for each country-year. - # In the fossil CO2 emissions data, after harmonization, "Pacific Islands (Palau)" is mapped to "Palau", and - # therefore there are rows with different data for the same country-year. - # However, "Pacific Islands (Palau)" has data until 1991, and "Palau" has data from 1992 onwards. - # After removing empty rows, there should be no overlap. - columns_that_must_have_data = df.drop(columns=["country", "year"]).columns - check = df.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) - error = "After harmonizing country names, there is more than one data point for the same country-year."
- assert check[check.duplicated(subset=["country", "year"])].empty, error - - df = df.dropna(subset="country").reset_index(drop=True) - - return df - - -def combine_data_and_add_variables( - df_co2: pd.DataFrame, - df_production: pd.DataFrame, - df_consumption: pd.DataFrame, - df_global_emissions: pd.DataFrame, - df_land_use: pd.DataFrame, - df_gdp: pd.DataFrame, - df_energy: pd.DataFrame, - df_population: pd.DataFrame, - ds_regions: Dataset, - ds_income_groups: Dataset, -) -> Table: - """Combine all relevant data into one dataframe, add region aggregates, and add custom variables (e.g. emissions per - capita). - - Parameters - ---------- - df_co2 : pd.DataFrame - Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file), after harmonization. - df_production : pd.DataFrame - Production-based emissions from GCP's official national emissions dataset (excel file), after harmonization. - df_consumption : pd.DataFrame - Consumption-based emissions from GCP's official national emissions dataset (excel file), after harmonization. - df_global_emissions : pd.DataFrame - World emissions (including bunker and land-use change emissions). - df_land_use : pd.DataFrame - National land-use change emissions from GCP's official dataset (excel file), after harmonization. - df_gdp : pd.DataFrame - GDP data. - df_energy : pd.DataFrame - Primary energy data. - df_population : pd.DataFrame - Population data. - ds_regions : Dataset - Regions dataset. - ds_income_groups : Dataset - Income groups dataset. - - Returns - ------- - tb_combined : Table - Combined data, with all additional variables and with region aggregates. - - """ - # Add region aggregates that were included in the national emissions file, but not in the Fossil CO2 emissions dataset. - gcp_aggregates = sorted(set(df_production["country"]) - set(df_co2["country"])) - df_co2 = pd.concat( - [ - df_co2, - df_production[df_production["country"].isin(gcp_aggregates)] - .rename(columns={"production_emissions": "emissions_total"}) - .astype({"year": int}), - ], - ignore_index=True, - ).reset_index(drop=True) - - # Add consumption emissions to main dataframe (keep only the countries of the main dataframe). - # Given that additional GCP regions (e.g. "Africa (GCP)") have already been added to df_co2 - # (when merging with df_production), all countries from df_consumption should be included in df_co2. - error = "Some countries in df_consumption are not included in df_co2." - assert set(df_consumption["country"]) < set(df_co2["country"]), error - df_co2 = pd.merge(df_co2, df_consumption, on=["country", "year"], how="outer") - - # Add population to original dataframe. - df_co2 = pd.merge(df_co2, df_population[["country", "year", "population"]], on=["country", "year"], how="left") - - # Add GDP to main dataframe. - df_co2 = pd.merge(df_co2, df_gdp, on=["country", "year"], how="left") - - # Add primary energy to main dataframe. - df_co2 = pd.merge(df_co2, df_energy, on=["country", "year"], how="left") - - # For convenience, rename columns in land-use change emissions data. - df_land_use = df_land_use.rename( - columns={"emissions": "emissions_from_land_use_change", "quality_flag": "land_use_change_quality_flag"} - ) - - # Land-use change data does not include data for the World. Include it by merging with the global dataset. 
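- # (Without these "World" rows, "emissions_from_land_use_change" would be nan for the World, and the sum
- # "emissions_total_including_land_use_change" computed below would therefore also be nan for the World.)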
- df_land_use = pd.concat( - [ - df_land_use, - df_global_emissions.rename( - columns={"global_emissions_from_land_use_change": "emissions_from_land_use_change"} - )[["year", "emissions_from_land_use_change"]] - .dropna() - .assign(**{"country": "World"}), - ], - ignore_index=True, - ).astype({"year": int}) - - # Add land-use change emissions to main dataframe. - df_co2 = pd.merge(df_co2, df_land_use, on=["country", "year"], how="outer") - - # Add total emissions (including land-use change) for each country. - df_co2["emissions_total_including_land_use_change"] = ( - df_co2["emissions_total"] + df_co2["emissions_from_land_use_change"] - ) - - # Add region aggregates. - # Aggregate not only emissions data, but also population, gdp and primary energy. - # This way we ensure that custom regions (e.g. "North America (excl. USA)") will have all required data. - aggregations = { - column: "sum" for column in df_co2.columns if column not in ["country", "year", "land_use_change_quality_flag"] - } - for region in REGIONS: - countries_in_region = geo.list_members_of_region( - region=region, - ds_regions=ds_regions, - ds_income_groups=ds_income_groups, - additional_regions=REGIONS[region].get("additional_regions", None), - excluded_regions=REGIONS[region].get("excluded_regions", None), - additional_members=REGIONS[region].get("additional_members", None), - excluded_members=REGIONS[region].get("excluded_members", None), - ) - df_co2 = geo.add_region_aggregates( - df=df_co2, - region=region, - countries_in_region=countries_in_region, - countries_that_must_have_data=[], - frac_allowed_nans_per_year=0.999, - aggregations=aggregations, - ) - - # Add global emissions and global cumulative emissions columns to main dataframe. - df_co2 = pd.merge(df_co2, df_global_emissions.drop(columns="country"), on=["year"], how="left") - - # Ensure main dataframe is sorted (so that cumulative emissions are properly calculated). - df_co2 = df_co2.sort_values(["country", "year"]).reset_index(drop=True) - - # Temporarily add certain global emissions variables. - # This is done simply to be able to consider "consumption_emissions" as just another type of emission - # when creating additional variables. - df_co2["global_consumption_emissions"] = df_co2["global_emissions_total"] - df_co2["global_cumulative_consumption_emissions"] = df_co2["global_cumulative_emissions_total"] - - # Add new variables for each source of emissions. - for column in EMISSION_SOURCES + [ - "consumption_emissions", - "emissions_from_land_use_change", - "emissions_total_including_land_use_change", - ]: - # Add per-capita variables. - df_co2[f"{column}_per_capita"] = df_co2[column] / df_co2["population"] - - # Add columns for cumulative emissions. - # Rows that had nan emissions will have nan cumulative emissions. - # But nans will not be propagated in the sum. - # This means that countries with some (not all) nans will have the cumulative sum of the informed emissions - # (treating nans as zeros), but will have nan on those rows that were not informed. - df_co2[f"cumulative_{column}"] = df_co2.groupby(["country"])[column].cumsum() - - # Add share of global emissions. - df_co2[f"{column}_as_share_of_global"] = 100 * df_co2[column] / df_co2[f"global_{column}"] - - # Add share of global cumulative emissions. - df_co2[f"cumulative_{column}_as_share_of_global"] = ( - 100 * df_co2[f"cumulative_{column}"] / df_co2[f"global_cumulative_{column}"] - ) - - # Add total emissions per unit energy (in kg of emissions per kWh). 
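- # (Unit check: tonnes of CO2 * 1000 kg/tonne / (TWh * 1e9 kWh/TWh) yields kg of CO2 per kWh; with
- # illustrative numbers, 1 million tonnes of CO2 over 10 TWh is 1e9 kg / 1e10 kWh = 0.1 kg/kWh.)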
- df_co2["emissions_total_per_unit_energy"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 * df_co2["emissions_total"] / (df_co2["primary_energy_consumption"] * TWH_TO_KWH) - ) - - # Add total emissions (including land-use change) per unit energy (in kg of emissions per kWh). - df_co2["emissions_total_including_land_use_change_per_unit_energy"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 - * df_co2["emissions_total_including_land_use_change"] - / (df_co2["primary_energy_consumption"] * TWH_TO_KWH) - ) - - # Add total emissions per unit GDP. - df_co2["emissions_total_per_gdp"] = TONNES_OF_CO2_TO_KG_OF_CO2 * df_co2["emissions_total"] / df_co2["gdp"] - - # Add total emissions (including land-use change) per unit GDP. - df_co2["emissions_total_including_land_use_change_per_gdp"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 * df_co2["emissions_total_including_land_use_change"] / df_co2["gdp"] - ) - - # Add total consumption emissions per unit GDP. - df_co2["consumption_emissions_per_gdp"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 * df_co2["consumption_emissions"] / df_co2["gdp"] - ) - - # Add variable of emissions embedded in trade. - df_co2["traded_emissions"] = df_co2["consumption_emissions"] - df_co2["emissions_total"] - df_co2["pct_traded_emissions"] = 100 * df_co2["traded_emissions"] / df_co2["emissions_total"] - df_co2["traded_emissions_per_capita"] = df_co2["traded_emissions"] / df_co2["population"] - - # Add variable of emissions embedded in trade, including land-use change emissions. - df_co2["traded_emissions_including_land_use_change"] = ( - df_co2["consumption_emissions"] - df_co2["emissions_total_including_land_use_change"] - ) - df_co2["pct_traded_emissions_including_land_use_change"] = ( - 100 * df_co2["traded_emissions_including_land_use_change"] / df_co2["emissions_total_including_land_use_change"] - ) - df_co2["traded_emissions_including_land_use_change_per_capita"] = ( - df_co2["traded_emissions_including_land_use_change"] / df_co2["population"] - ) - - # Remove temporary columns. - df_co2 = df_co2.drop(columns=["global_consumption_emissions", "global_cumulative_consumption_emissions"]) - - # Add annual percentage growth of total emissions. - df_co2["pct_growth_emissions_total"] = df_co2.groupby("country")["emissions_total"].pct_change() * 100 - - # Add annual percentage growth of total emissions (including land-use change). - df_co2["pct_growth_emissions_total_including_land_use_change"] = ( - df_co2.groupby("country")["emissions_total_including_land_use_change"].pct_change() * 100 - ) - - # Add annual absolute growth of total emissions. - df_co2["growth_emissions_total"] = df_co2.groupby("country")["emissions_total"].diff() - - # Add annual absolute growth of total emissions (including land-use change). - df_co2["growth_emissions_total_including_land_use_change"] = df_co2.groupby("country")[ - "emissions_total_including_land_use_change" - ].diff() - - # Create variable of population as a share of global population. - df_co2["population_as_share_of_global"] = df_co2["population"] / df_co2["global_population"] * 100 - - # Replace infinity values (for example when calculating growth from zero to non-zero) in the data by nan. - for column in df_co2.drop(columns=["country", "year"]).columns: - df_co2.loc[np.isinf(df_co2[column]), column] = np.nan - - # For special GCP countries/regions (e.g. "Africa (GCP)") we should keep only the original data. - # Therefore, make nan all additional variables for those countries/regions, and keep only GCP's original data. 
- added_variables = df_co2.drop(columns=["country", "year"] + COLUMNS_THAT_MUST_HAVE_DATA).columns.tolist() - df_co2.loc[(df_co2["country"].str.contains(" (GCP)", regex=False)), added_variables] = np.nan - - # Remove uninformative rows (those that have only data for, say, gdp, but not for variables related to emissions). - df_co2 = df_co2.dropna(subset=COLUMNS_THAT_MUST_HAVE_DATA, how="all").reset_index(drop=True) - - # Set an appropriate index, ensure there are no rows that only have nan, and sort conveniently. - df_co2 = df_co2.set_index(["country", "year"], verify_integrity=True) - df_co2 = df_co2.dropna(subset=df_co2.columns, how="all").sort_index().sort_index(axis=1) - - # Create a table with the generated data. - tb_combined = Table(df_co2, short_name=paths.short_name, underscore=True) - - return tb_combined - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load meadow dataset and read all its tables. - ds_meadow: Dataset = paths.load_dependency("global_carbon_budget") - tb_co2 = ds_meadow["global_carbon_budget_fossil_co2_emissions"] - tb_historical = ds_meadow["global_carbon_budget_historical_budget"] - tb_consumption = ds_meadow["global_carbon_budget_consumption_emissions"] - tb_production = ds_meadow["global_carbon_budget_production_emissions"] - tb_land_use = ds_meadow["global_carbon_budget_land_use_change"] - - # Load primary energy consumption dataset and read its main table. - ds_energy: Dataset = paths.load_dependency("primary_energy_consumption") - tb_energy = ds_energy["primary_energy_consumption"] - - # Load GDP dataset and read its main table. - ds_gdp: Dataset = paths.load_dependency("ggdc_maddison") - tb_gdp = ds_gdp["maddison_gdp"] - - # Load population dataset and read its main table. - ds_population: Dataset = paths.load_dependency("population") - tb_population = ds_population["population"] - - # Load regions dataset and read its main tables (it will be used to create region aggregates). - ds_regions: Dataset = paths.load_dependency("regions") - - # Load income groups dataset and read its main table (it will be used to create region aggregates). - ds_income_groups: Dataset = paths.load_dependency("wb_income") - - # Create a dataframe for each table. - df_co2 = pd.DataFrame(tb_co2).reset_index() - df_historical = pd.DataFrame(tb_historical).reset_index() - df_consumption = pd.DataFrame(tb_consumption).reset_index() - df_production = pd.DataFrame(tb_production).reset_index() - df_land_use = pd.DataFrame(tb_land_use).reset_index() - df_energy = pd.DataFrame(tb_energy).reset_index() - df_gdp = pd.DataFrame(tb_gdp).reset_index() - df_population = pd.DataFrame(tb_population).reset_index() - - # - # Process data. - # - # Prepare fossil CO2 emissions data. - df_co2 = prepare_fossil_co2_emissions(df_co2=df_co2) - - # Prepare consumption-based emission data. - df_consumption = prepare_consumption_emissions(df_consumption=df_consumption) - - # Prepare production-based emission data. - df_production = prepare_production_emissions(df_production=df_production) - - # Prepare land-use emission data. - df_land_use = prepare_land_use_emissions(df_land_use=df_land_use) - - # Select and rename columns from primary energy data. - df_energy = df_energy[list(PRIMARY_ENERGY_COLUMNS)].rename(columns=PRIMARY_ENERGY_COLUMNS, errors="raise") - - # Select and rename columns from GDP data. - df_gdp = df_gdp[list(GDP_COLUMNS)].rename(columns=GDP_COLUMNS, errors="raise") - - # Prepare historical emissions data.
- df_historical = prepare_historical_emissions(df_historical=df_historical) - - # Run sanity checks on input data. - sanity_checks_on_input_data( - df_production=df_production, df_consumption=df_consumption, df_historical=df_historical, df_co2=df_co2 - ) - - # For some reason, "International Transport" is included as another country that only has emissions from oil. - # Extract that data and remove it from the rest of national emissions. - df_global_emissions = extract_global_emissions(df_co2=df_co2, df_historical=df_historical) - - # Harmonize country names. - df_co2 = harmonize_country_names(df=df_co2) - df_consumption = harmonize_country_names(df=df_consumption) - df_production = harmonize_country_names(df=df_production) - df_land_use = harmonize_country_names(df=df_land_use) - - # Add new variables to main dataframe (consumption-based emissions, emission intensity, per-capita emissions, etc.). - tb_combined = combine_data_and_add_variables( - df_co2=df_co2, - df_production=df_production, - df_consumption=df_consumption, - df_global_emissions=df_global_emissions, - df_land_use=df_land_use, - df_gdp=df_gdp, - df_energy=df_energy, - df_population=df_population, - ds_regions=ds_regions, - ds_income_groups=ds_income_groups, - ) - - # Run sanity checks on output data. - sanity_checks_on_output_data(tb_combined) - - # - # Save outputs. - # - # Create a new garden dataset and use metadata from meadow dataset. - ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_combined], default_metadata=ds_meadow.metadata) - - ds_garden.save() diff --git a/etl/steps/archive/garden/gcp/2023-07-10/global_carbon_budget.countries.json b/etl/steps/archive/garden/gcp/2023-07-10/global_carbon_budget.countries.json deleted file mode 100644 index abaab52fe1b..00000000000 --- a/etl/steps/archive/garden/gcp/2023-07-10/global_carbon_budget.countries.json +++ /dev/null @@ -1,278 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Africa": "Africa (GCP)", - "Albania": "Albania", - "Algeria": "Algeria", - "American Samoa": "American Samoa", - "Andorra": "Andorra", - "Angola": "Angola", - "Anguilla": "Anguilla", - "Antarctica": "Antarctica", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Asia": "Asia (GCP)", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Bolivia (Plurinational State of)": "Bolivia", - "Bonaire, Saint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", - "Bonaire, Sint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "British Virgin Islands": "British Virgin Islands", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Bunkers": "International transport", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cape Verde": "Cape Verde", - "Central African Republic": "Central African Republic", - "Central America": "Central America (GCP)", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Christmas Island": "Christmas Island", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo": "Congo", -
"Congo, Democratic Republic of the": "Democratic Republic of Congo", - "Cook Islands": "Cook Islands", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cura\u00e7ao": "Curacao", - "Cyprus": "Cyprus", - "Czech Republic": "Czechia", - "Czechia": "Czechia", - "C\u00f4te d'Ivoire": "Cote d'Ivoire", - "Democratic Republic of the Congo": "Democratic Republic of Congo", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "EU27": "European Union (27) (GCP)", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Europe": "Europe (GCP)", - "Faeroe Islands": "Faroe Islands", - "Falkland Islands (Malvinas)": "Falkland Islands", - "Faroe Islands": "Faroe Islands", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Equatorial Africa": "French Equatorial Africa (GCP)", - "French Guiana": "French Guiana", - "French Polynesia": "French Polynesia", - "French West Africa": "French West Africa (GCP)", - "Gabon": "Gabon", - "Gambia": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Global": "World", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guatemala": "Guatemala", - "Guernsey": "Guernsey", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hong Kong": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "International Transport": "International transport", - "Iran": "Iran", - "Iran (Islamic Republic of)": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Isle of Man": "Isle of Man", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jersey": "Jersey", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Korea (Democratic People's Republic of)": "North Korea", - "Korea, Republic of": "South Korea", - "Kosovo": "Kosovo", - "Kuwait": "Kuwait", - "Kuwaiti Oil Fires": "Kuwaiti Oil Fires (GCP)", - "Kyrgyzstan": "Kyrgyzstan", - "Lao People's Democratic Republic": "Laos", - "Laos": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Leeward Islands": "Leeward Islands (GCP)", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Liechtenstein": "Liechtenstein", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macao": "Macao", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mayotte": "Mayotte", - "Mexico": "Mexico", - "Micronesia (Federated States of)": "Micronesia (country)", - "Middle East": "Middle East (GCP)", - "Moldova": "Moldova", - "Moldova, Republic of": "Moldova", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "Netherlands Antilles": "Netherlands Antilles", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": 
"Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "Non-OECD": "Non-OECD (GCP)", - "North America": "North America (GCP)", - "North Korea": "North Korea", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "OECD": "OECD (GCP)", - "Occupied Palestinian Territory": "Palestine", - "Oceania": "Oceania (GCP)", - "Oman": "Oman", - "Pacific Islands (Palau)": "Palau", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Palestine, State of": "Palestine", - "Panama": "Panama", - "Panama Canal Zone": "Panama Canal Zone (GCP)", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Romania": "Romania", - "Russia": "Russia", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "Ryukyu Islands": "Ryukyu Islands (GCP)", - "R\u00e9union": "Reunion", - "Saint Helena": "Saint Helena", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Martin (French part)": "Saint Martin (French part)", - "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South America": "South America (GCP)", - "South Korea": "South Korea", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "St. Kitts-Nevis-Anguilla": "St. 
Kitts-Nevis-Anguilla (GCP)", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Svalbard and Jan Mayen": "Svalbard and Jan Mayen", - "Swaziland": "Eswatini", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syria": "Syria", - "Syrian Arab Republic": "Syria", - "Taiwan": "Taiwan", - "Taiwan, Province of China": "Taiwan", - "Tajikistan": "Tajikistan", - "Tanzania": "Tanzania", - "Tanzania, United Republic of": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Turks and Caicos Islands": "Turks and Caicos Islands", - "Tuvalu": "Tuvalu", - "USA": "United States", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", - "United States of America": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela": "Venezuela", - "Venezuela (Bolivarian Republic of)": "Venezuela", - "Viet Nam": "Vietnam", - "Virgin Islands (U.S.)": "United States Virgin Islands", - "Wallis and Futuna Islands": "Wallis and Futuna", - "Western Sahara": "Western Sahara", - "World": "World", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "\u00c5land Islands": "Aland Islands" -} diff --git a/etl/steps/archive/garden/gcp/2023-07-10/global_carbon_budget.excluded_countries.json b/etl/steps/archive/garden/gcp/2023-07-10/global_carbon_budget.excluded_countries.json deleted file mode 100644 index e7a16636a61..00000000000 --- a/etl/steps/archive/garden/gcp/2023-07-10/global_carbon_budget.excluded_countries.json +++ /dev/null @@ -1,4 +0,0 @@ -[ - "KP Annex B", - "Non KP Annex B" -] \ No newline at end of file diff --git a/etl/steps/archive/garden/gcp/2023-07-10/global_carbon_budget.meta.yml b/etl/steps/archive/garden/gcp/2023-07-10/global_carbon_budget.meta.yml deleted file mode 100644 index 5a77ee1f6d0..00000000000 --- a/etl/steps/archive/garden/gcp/2023-07-10/global_carbon_budget.meta.yml +++ /dev/null @@ -1,477 +0,0 @@ -dataset: - title: Global Carbon Budget (Global Carbon Project, 2023b) - description: | - The Global Carbon Budget dataset is available [here](https://globalcarbonbudget.org/archive/). - - Full reference: - Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Gregor, L., Hauck, J., Le Quéré, C., Luijkx, I. T., Olsen, A., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Alkama, R., Arneth, A., Arora, V. K., Bates, N. R., Becker, M., Bellouin, N., Bittig, H. C., Bopp, L., Chevallier, F., Chini, L. P., Cronin, M., Evans, W., Falk, S., Feely, R. A., Gasser, T., Gehlen, M., Gkritzalis, T., Gloege, L., Grassi, G., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jain, A. K., Jersild, A., Kadono, K., Kato, E., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Landschützer, P., Lefèvre, N., Lindsay, K., Liu, J., Liu, Z., Marland, G., Mayot, N., McGrath, M. J., Metzl, N., Monacci, N. M., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K., Ono, T., Palmer, P. I., Pan, N., Pierrot, D., Pocock, K., Poulter, B., Resplandy, L., Robertson, E., Rödenbeck, C., Rodriguez, C., Rosan, T. M., Schwinger, J., Séférian, R., Shutler, J. 
D., Skjelvan, I., Steinhoff, T., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tanhua, T., Tans, P. P., Tian, X., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., Walker, A. P., Wanninkhof, R., Whitehead, C., Willstrand Wranne, A., Wright, R., Yuan, W., Yue, C., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2022, Earth Syst. Sci. Data, 14, 4811-4900, https://doi.org/10.5194/essd-14-4811-2022, 2022. - - Variables include each country, region and World Bank income group's share of the global population, production-based (territorial) carbon dioxide emissions, and consumption-based (trade-adjusted) carbon dioxide emissions. - - Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. - - Note that consumption-based emissions are not available for all countries, although those without complete data are a small fraction (3%) of the global total. - - Each country's share of the global population is calculated using our population dataset, based on [different sources](https://ourworldindata.org/population-sources). - - Data on global emissions has been converted by Our World in Data from tonnes of carbon to tonnes of carbon dioxide (CO₂) using a conversion factor of 3.664. - - For improved clarity, Our World in Data has renamed bunker fuels as "International transport", which includes emissions from international aviation and shipping. - - Emissions from the Kuwaiti oil fires in 1991 have been included as part of Kuwait's emissions for that year. - -tables: - global_carbon_budget: - variables: - consumption_emissions: - title: "Annual consumption-based CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. Data has been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664." - consumption_emissions_as_share_of_global: - title: "Share of global annual CO₂ consumption-based emissions" - unit: "%" - short_unit: "%" - description: "Annual consumption-based emissions of carbon dioxide (CO₂), measured as a percentage of global consumption-based emissions of CO₂ in the same year. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide." - consumption_emissions_per_capita: - title: "Annual consumption-based CO₂ emissions (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes per person. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e.
territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. Consumption-based CO₂ emissions have been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664." - consumption_emissions_per_gdp: - title: "Annual consumption-based CO₂ emissions per GDP (kg per international-$)" - unit: "kilograms per international-$" - short_unit: "kg/$" - description: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in kilograms per dollar of GDP (2011 international-$). Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. Consumption-based CO₂ emissions have been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664." - cumulative_consumption_emissions: - title: "Cumulative CO₂ consumption-based emissions" - unit: "tonnes" - short_unit: "t" - description: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of data availability, measured in tonnes. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. Consumption-based CO₂ emissions have been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664." - cumulative_consumption_emissions_as_share_of_global: - title: "Share of global cumulative CO₂ consumption-based emissions" - unit: "%" - short_unit: "%" - description: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of data availability, measured as a percentage of global cumulative consumption-based emissions of CO₂ since the first year of data availability. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide." - cumulative_emissions_from_cement: - title: "Cumulative CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from cement since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_cement_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from cement" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from cement since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from cement since the first year of data availability. 
This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from cement has been calculated by Our World in Data using global CO₂ emissions from cement provided in the Global Carbon Budget dataset." - cumulative_emissions_from_coal: - title: "Cumulative CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from coal since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_coal_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from coal" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from coal since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from coal since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from coal has been calculated by Our World in Data using global CO₂ emissions from coal provided in the Global Carbon Budget dataset." - cumulative_emissions_from_flaring: - title: "Cumulative CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from flaring since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_flaring_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from flaring" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from flaring since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from flaring since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from flaring has been calculated by Our World in Data using global CO₂ emissions from flaring provided in the Global Carbon Budget dataset." - cumulative_emissions_from_gas: - title: "Cumulative CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from gas since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_gas_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from gas" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from gas since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from gas since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from gas has been calculated by Our World in Data using global CO₂ emissions from gas provided in the Global Carbon Budget dataset." 
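The carbon-to-CO₂ conversion these descriptions refer to is a single multiplication by 3.664, which is (approximately) the molar-mass ratio of CO₂ to carbon. A minimal sketch with illustrative names, not taken from the step code:

```python
# Tonnes of carbon -> tonnes of CO2; 3.664 ~ molar mass of CO2 / molar mass of C.
TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664


def carbon_to_co2(tonnes_of_carbon: float) -> float:
    """Convert emissions measured in tonnes of carbon to tonnes of CO2."""
    return tonnes_of_carbon * TONNES_OF_CARBON_TO_TONNES_OF_CO2


assert carbon_to_co2(1.0) == 3.664
```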
- cumulative_emissions_from_land_use_change: - title: "Cumulative CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from land-use change since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_land_use_change_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from land-use change" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from land-use change since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from land-use change since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset." - cumulative_emissions_from_oil: - title: "Cumulative CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from oil since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_oil_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from oil" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from oil since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from oil since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from oil has been calculated by Our World in Data using global CO₂ emissions from oil provided in the Global Carbon Budget dataset. Global oil emissions include all country emissions as well as emissions from international aviation and shipping." - cumulative_emissions_from_other_industry: - title: "Cumulative CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from other industry sources since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_from_other_industry_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from other industry" - unit: "%" - short_unit: "%" - description: "Cumulative production-based emissions of carbon dioxide (CO₂) from other industry sources since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from other industry sources since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from other industry sources has been calculated by Our World in Data using global CO₂ emissions from other industry sources provided in the Global Carbon Budget dataset. Global emissions from other industry sources include all country emissions."
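The cumulative variables are per-country running totals over years ("since the first year of data availability"). A sketch of the kind of groupby-cumsum involved, with illustrative names:

```python
import pandas as pd


def add_cumulative(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Add a running total of `column` per country, accumulated in year order."""
    df = df.sort_values(["country", "year"]).reset_index(drop=True)
    df[f"cumulative_{column}"] = df.groupby("country")[column].cumsum()
    return df
```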
- cumulative_emissions_total: - title: "Cumulative CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description: "Total cumulative production-based emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_total_as_share_of_global: - title: "Share of global cumulative CO₂ emissions" - unit: "%" - short_unit: "%" - description: "Total cumulative production-based emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of data availability, measured as a percentage of global total cumulative production-based emissions of CO₂ since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset. Global emissions include all country emissions as well as emissions from international aviation and shipping." - cumulative_emissions_total_including_land_use_change: - title: "Cumulative CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "Total cumulative production-based emissions of carbon dioxide (CO₂), including land-use change, since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - cumulative_emissions_total_including_land_use_change_as_share_of_global: - title: "Share of global cumulative CO₂ emissions including land-use change" - unit: "%" - short_unit: "%" - description: "Total cumulative production-based emissions of carbon dioxide (CO₂), including land-use change, since the first year of data availability, measured as a percentage of global total cumulative production-based emissions of CO₂ (including land-use change) since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset. Global emissions include all country emissions as well as emissions from international aviation and shipping." - emissions_from_cement: - title: "Annual CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from cement, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_cement_as_share_of_global: - title: "Share of global annual CO₂ emissions from cement" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from cement, measured as a percentage of global production-based emissions of CO₂ from cement in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from cement has been calculated by Our World in Data using global CO₂ emissions from cement provided in the Global Carbon Budget dataset." 
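All of the "share of global" variables follow one pattern: a country's value divided by the global value for the same year, times 100. A minimal pandas sketch, assuming a long table with "country", "year", and a "World" row (a simplification; in the step, global values come from the Global Carbon Budget's global dataset):

```python
import pandas as pd


def share_of_global(df: pd.DataFrame, column: str) -> pd.Series:
    """Return `column` as a percentage of the World's value in the same year."""
    world = df.loc[df["country"] == "World"].set_index("year")[column]
    return 100 * df[column] / df["year"].map(world)
```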
- emissions_from_cement_per_capita: - title: "Annual CO₂ emissions from cement (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from cement, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_coal: - title: "Annual CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from coal, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_coal_as_share_of_global: - title: "Share of global annual CO₂ emissions from coal" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from coal, measured as a percentage of global production-based emissions of CO₂ from coal in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from coal has been calculated by Our World in Data using global CO₂ emissions from coal provided in the Global Carbon Budget dataset." - emissions_from_coal_per_capita: - title: "Annual CO₂ emissions from coal (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from coal, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_flaring: - title: "Annual CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from flaring, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_flaring_as_share_of_global: - title: "Share of global annual CO₂ emissions from flaring" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from flaring, measured as a percentage of global production-based emissions of CO₂ from flaring in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from flaring has been calculated by Our World in Data using global CO₂ emissions from flaring provided in the Global Carbon Budget dataset." - emissions_from_flaring_per_capita: - title: "Annual CO₂ emissions from flaring (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from flaring, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_gas: - title: "Annual CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from gas, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_gas_as_share_of_global: - title: "Share of global annual CO₂ emissions from gas" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from gas, measured as a percentage of global production-based emissions of CO₂ from gas in the same year. 
This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from gas has been calculated by Our World in Data using global CO₂ emissions from gas provided in the Global Carbon Budget dataset. Global gas emissions include all country emissions as well as emissions from international aviation and shipping." - emissions_from_gas_per_capita: - title: "Annual CO₂ emissions from gas (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from gas, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_land_use_change: - title: "Annual CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_land_use_change_as_share_of_global: - title: "Share of global annual CO₂ emissions from land-use change" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from land-use change, measured as a percentage of global production-based emissions of CO₂ from land-use change in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset." - emissions_from_land_use_change_per_capita: - title: "Annual CO₂ emissions from land-use change per capita" - unit: "tonnes of CO₂ per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_oil: - title: "Annual CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from oil, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_oil_as_share_of_global: - title: "Share of global annual CO₂ emissions from oil" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from oil, measured as a percentage of global production-based emissions of CO₂ from oil in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from oil has been calculated by Our World in Data using global CO₂ emissions from oil provided in the Global Carbon Budget dataset. Global oil emissions include all country emissions as well as emissions from international aviation and shipping." - emissions_from_oil_per_capita: - title: "Annual CO₂ emissions from oil (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from oil, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." 
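The per-capita variables divide each emissions column (in tonnes) by the population of the same country and year; a sketch under that assumption, with hypothetical column names:

```python
import pandas as pd


def add_per_capita(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """Divide each emissions column (tonnes) by population for the same country-year."""
    for column in columns:
        df[f"{column}_per_capita"] = df[column] / df["population"]
    return df
```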
- emissions_from_other_industry: - title: "Annual CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_from_other_industry_as_share_of_global: - title: "Share of global annual CO₂ emissions from other industry" - unit: "%" - short_unit: "%" - description: "Annual production-based emissions of carbon dioxide (CO₂) from other industry sources, measured as a percentage of global production-based emissions of CO₂ from other industry sources in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from other industry sources has been calculated by Our World in Data using global CO₂ emissions from other industry sources provided in the Global Carbon Budget dataset. Global emissions from other industry sources include all country emissions." - emissions_from_other_industry_per_capita: - title: "Annual CO₂ emissions from other industry (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total: - title: "Annual CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_as_share_of_global: - title: "Share of global annual CO₂ emissions" - unit: "%" - short_unit: "%" - description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured as a percentage of global production-based emissions of CO₂ in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset. Global emissions include all country emissions as well as emissions from international aviation and shipping." - emissions_total_including_land_use_change: - title: "Annual CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "Annual total production-based emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_including_land_use_change_as_share_of_global: - title: "Share of global annual CO₂ emissions including land-use change" - unit: "%" - short_unit: "%" - description: "Annual total production-based emissions of carbon dioxide (CO₂), including land-use change, measured as a percentage of global total production-based emissions of CO₂ in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset.
Global emissions include all country emissions as well as emissions from international aviation and shipping." - emissions_total_including_land_use_change_per_capita: - title: "Annual CO₂ emissions including land-use change per capita" - unit: "tonnes of CO₂ per capita" - short_unit: "t" - description: "Annual production-based emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_including_land_use_change_per_gdp: - title: "Annual CO₂ emissions including land-use change per GDP" - unit: "kilograms per international-$" - short_unit: "kg/$" - description: "Annual total production-based emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per dollar of GDP (2011 international-$). Production-based emissions are based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_including_land_use_change_per_unit_energy: - title: "Annual CO₂ emissions including land-use change per unit energy" - unit: "kilograms per kilowatt-hour" - short_unit: "kg/kWh" - description: "Annual total production-based emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per kilowatt-hour of primary energy consumption. Production-based emissions are based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_per_capita: - title: "Annual CO₂ emissions (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_per_gdp: - title: "Annual CO₂ emissions per GDP (kg per international-$)" - unit: "kilograms per international-$" - short_unit: "kg/$" - description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per dollar of GDP (2011 international-$). Production-based emissions are based on territorial emissions, which do not account for emissions embedded in traded goods." - emissions_total_per_unit_energy: - title: "Annual CO₂ emissions per unit energy (kg per kilowatt-hour)" - unit: "kilograms per kilowatt-hour" - short_unit: "kg/kWh" - description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per kilowatt-hour of primary energy consumption. Production-based emissions are based on territorial emissions, which do not account for emissions embedded in traded goods." - gdp: - title: "GDP" - unit: "2011 international-$" - short_unit: "$" - description: >- - Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over time (inflation) - and price differences between countries. 
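The per-GDP and per-unit-energy intensities combine two unit conversions stated in the descriptions: tonnes of CO₂ to kilograms (×1,000) and terawatt-hours to kilowatt-hours (×10⁹); both constants also appear in the deleted step below. A sketch with illustrative column names:

```python
import pandas as pd

TONNES_OF_CO2_TO_KG_OF_CO2 = 1_000  # kg per tonne
TWH_TO_KWH = 1e9  # kWh per TWh


def add_intensities(df: pd.DataFrame) -> pd.DataFrame:
    """Add kg of CO2 per international-$ of GDP and per kWh of primary energy."""
    kg = df["emissions_total"] * TONNES_OF_CO2_TO_KG_OF_CO2
    df["emissions_total_per_gdp"] = kg / df["gdp"]
    df["emissions_total_per_unit_energy"] = kg / (df["primary_energy_consumption"] * TWH_TO_KWH)
    return df
```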
- global_cumulative_emissions_from_cement: - title: "Global cumulative CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_coal: - title: "Global cumulative CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_flaring: - title: "Global cumulative CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_gas: - title: "Global cumulative CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_land_use_change: - title: "Global cumulative CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_oil: - title: "Global cumulative CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_from_other_industry: - title: "Global cumulative CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_total: - title: "Global cumulative CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description: "" - global_cumulative_emissions_total_including_land_use_change: - title: "Global cumulative CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_cement: - title: "Global annual CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_coal: - title: "Global annual CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_flaring: - title: "Global annual CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_gas: - title: "Global annual CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_international_transport: - title: "Global annual CO₂ emissions from international transport" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_land_use_change: - title: "Global annual CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_oil: - title: "Global annual CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_from_other_industry: - title: "Global annual CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_total: - title: "Global annual CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description: "" - global_emissions_total_including_land_use_change: - title: "Global annual CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "" - global_population: - title: "Global population" - unit: "persons" - short_unit: "persons" - description: "World population." - growth_emissions_total: - title: "Annual CO₂ emissions growth (abs)" - unit: "tonnes" - short_unit: "t" - description: "Annual growth in total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." 
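The growth variables are year-on-year changes of total emissions per country, in absolute tonnes and in percent; a sketch assuming one row per country-year (illustrative, not the step's code):

```python
import pandas as pd


def add_growth(df: pd.DataFrame, column: str = "emissions_total") -> pd.DataFrame:
    """Add absolute and percentage year-on-year growth of `column` per country."""
    df = df.sort_values(["country", "year"]).reset_index(drop=True)
    grouped = df.groupby("country")[column]
    df[f"growth_{column}"] = grouped.diff()
    df[f"pct_growth_{column}"] = grouped.pct_change() * 100
    return df
```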
- growth_emissions_total_including_land_use_change: - title: "Growth rate of emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "Annual growth in total production-based emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - land_use_change_quality_flag: - title: "Land-use change quality flag" - unit: "" - short_unit: "" - description: "Carbon dioxide emissions from land use change vary significantly in their degree of certainty. The quality flag is 1 if the different estimates of land-use change emissions considered by the Global Carbon Project have a reasonable agreement. Otherwise the quality flag is 0. The flag is also set to zero if not all estimates have data for a given country. For a more detailed definition, see the original paper." - pct_growth_emissions_total: - title: "Annual CO₂ emissions growth (%)" - unit: "%" - short_unit: "%" - description: "Annual percentage growth in total production-based emissions of carbon dioxide (CO₂), excluding land-use change. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - pct_growth_emissions_total_including_land_use_change: - title: "Growth rate of emissions including land-use change (%)" - unit: "%" - short_unit: "%" - description: "Annual percentage growth in total production-based emissions of carbon dioxide (CO₂), including land-use change. This is based on territorial emissions, which do not account for emissions embedded in traded goods." - pct_traded_emissions: - title: "Share of annual CO₂ emissions embedded in trade" - unit: "%" - short_unit: "%" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured as a percentage of production-based emissions of CO₂. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." - pct_traded_emissions_including_land_use_change: - title: "Traded emissions including land-use change (%)" - unit: "%" - short_unit: "%" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, including land-use change, measured as a percentage of production-based emissions of CO₂, including land-use change. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." - population: - title: "Population" - unit: "persons" - short_unit: "persons" - description: "" - population_as_share_of_global: - title: "Share of population" - unit: "%" - short_unit: "%" - description: "Population, measured as a percentage of global total population in the same year." - primary_energy_consumption: - title: "Primary energy consumption" - unit: "terawatt-hours" - short_unit: "TWh" - description: "Primary energy consumption, measured in terawatt-hours per year." - traded_emissions: - title: "Annual CO₂ emissions embedded in trade" - unit: "tonnes" - short_unit: "t" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy.
A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." - traded_emissions_including_land_use_change: - title: "Traded emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, including land-use change, measured in tonnes. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." - traded_emissions_including_land_use_change_per_capita: - title: "Traded emissions including land-use change per capita" - unit: "tonnes of CO₂ per capita" - short_unit: "t" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, including land-use change, measured in tonnes per person. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." - traded_emissions_per_capita: - title: "Annual CO₂ emissions embedded in trade (per capita)" - unit: "tonnes per capita" - short_unit: "t" - description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes per person. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." diff --git a/etl/steps/archive/garden/gcp/2023-07-10/global_carbon_budget.py b/etl/steps/archive/garden/gcp/2023-07-10/global_carbon_budget.py deleted file mode 100644 index 319d165e6fe..00000000000 --- a/etl/steps/archive/garden/gcp/2023-07-10/global_carbon_budget.py +++ /dev/null @@ -1,984 +0,0 @@ -"""This step creates the Global Carbon Budget (GCB) dataset by the Global Carbon Project (GCP). - -It harmonizes and further processes meadow data, and uses the following auxiliary datasets: -- GGDC's Maddison dataset on GDP, used to calculate emissions per GDP. -- Primary Energy Consumption (mix of sources from the 'energy' namespace) to calculate emissions per unit energy. -- Population (mix of sources), to calculate emissions per capita. -- Regions (mix of sources), to generate aggregates for different continents. -- World Bank's income groups, to generate aggregates for different income groups. - -""" - -import numpy as np -import owid.catalog.processing as pr -from owid.catalog import Dataset, Table -from owid.datautils import dataframes -from structlog import get_logger - -from etl.data_helpers import geo -from etl.helpers import PathFinder, create_dataset - -log = get_logger() - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Expected outliers in consumption-based emissions (with negative emissions in the original data, which will be removed). -OUTLIERS_IN_CONSUMPTION_DF = [ - ("Panama", 2003), - ("Panama", 2004), - ("Panama", 2005), - ("Panama", 2006), - ("Panama", 2011), - ("Panama", 2012), - ("Panama", 2013), - ("Venezuela", 2018), -] - -# Label used for international transport (emissions from oil in bunker fuels), included as a country in the -# fossil CO2 emissions dataset.
-INTERNATIONAL_TRANSPORT_LABEL = "International Transport" - -# Regions and income groups to create by aggregating contributions from member countries. -# In the following dictionary, if nothing is stated, the region is supposed to be a default continent/income group. -# Otherwise, the dictionary can have "additional_regions", "excluded_regions", "additional_members", and -# "excluded_members". The aggregates will be calculated on the resulting countries. -REGIONS = { - # Default continents. - "Africa": {}, - "Asia": {}, - "Europe": {}, - "European Union (27)": {}, - "North America": {}, - "Oceania": {}, - "South America": {}, - # Income groups. - "Low-income countries": {}, - "Upper-middle-income countries": {}, - "Lower-middle-income countries": {}, - "High-income countries": {}, - # Additional composite regions. - "Asia (excl. China and India)": { - "additional_regions": ["Asia"], - "excluded_members": ["China", "India"], - }, - "Europe (excl. EU-27)": {"additional_regions": ["Europe"], "excluded_regions": ["European Union (27)"]}, - "Europe (excl. EU-28)": { - "additional_regions": ["Europe"], - "excluded_regions": ["European Union (27)"], - "excluded_members": ["United Kingdom"], - }, - "European Union (28)": { - "additional_regions": ["European Union (27)"], - "additional_members": ["United Kingdom"], - }, - "North America (excl. USA)": { - "additional_regions": ["North America"], - "excluded_members": ["United States"], - }, -} - -# Columns to use from GCB fossil CO2 emissions data and how to rename them. -CO2_COLUMNS = { - "country": "country", - "year": "year", - "cement": "emissions_from_cement", - "coal": "emissions_from_coal", - "flaring": "emissions_from_flaring", - "gas": "emissions_from_gas", - "oil": "emissions_from_oil", - "other": "emissions_from_other_industry", - "total": "emissions_total", -} - -# List all sources of emissions considered. -EMISSION_SOURCES = [column for column in CO2_COLUMNS.values() if column not in ["country", "year"]] - -# Columns to use from primary energy consumption data and how to rename them. -PRIMARY_ENERGY_COLUMNS = { - "country": "country", - "year": "year", - "primary_energy_consumption__twh": "primary_energy_consumption", -} - -# Columns to use from historical emissions data and how to rename them. -HISTORICAL_EMISSIONS_COLUMNS = { - "country": "country", - "year": "year", - # Global fossil emissions are used only for sanity checks. - "global_fossil_emissions": "global_fossil_emissions", - "global_land_use_change_emissions": "global_emissions_from_land_use_change", -} - -# Columns to use from consumption-based emissions data and how to rename them. -CONSUMPTION_EMISSIONS_COLUMNS = { - "country": "country", - "year": "year", - "consumption_emissions": "consumption_emissions", -} - -# Conversion from terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 - -# Conversion factor to change from billion tonnes of carbon to tonnes of CO2. -BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e9 - -# Conversion factor to change from million tonnes of carbon to tonnes of CO2. -MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e6 - -# Conversion from million tonnes of CO2 to tonnes of CO2. -MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 = 1e6 - -# Conversion from tonnes of CO2 to kg of CO2 (used for emissions per GDP and per unit energy). -TONNES_OF_CO2_TO_KG_OF_CO2 = 1000 - -# In order to remove uninformative columns, keep only rows where at least one of the following columns has data.
-# All other columns are either derived variables, or global variables, or auxiliary variables from other datasets. -COLUMNS_THAT_MUST_HAVE_DATA = [ - "emissions_from_cement", - "emissions_from_coal", - "emissions_from_flaring", - "emissions_from_gas", - "emissions_from_oil", - "emissions_from_other_industry", - "emissions_total", - "consumption_emissions", - "emissions_from_land_use_change", - # 'land_use_change_quality_flag', -] - - -def sanity_checks_on_input_data( - tb_production: Table, tb_consumption: Table, tb_historical: Table, tb_co2: Table -) -> None: - """Run sanity checks on input data files. - - These checks should be used prior to country harmonization, but after basic processing of the tables. - - Parameters - ---------- - tb_production : Table - Production-based emissions from GCP's official national emissions dataset (excel file). - tb_consumption : Table - Consumption-based emissions from GCP's official national emissions dataset (excel file). - tb_historical : Table - Historical emissions from GCP's official global emissions dataset (excel file). - tb_co2 : Table - Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). - - """ - tb_production = tb_production.copy() - tb_consumption = tb_consumption.copy() - tb_historical = tb_historical.copy() - tb_co2 = tb_co2.copy() - - # In the original data, Bunkers was included in the national data file as another country. - # But I suppose it should be considered as another kind of global emission. - # In fact, bunker emissions should coincide for production and consumption emissions. - global_bunkers_emissions = ( - tb_production[tb_production["country"] == "Bunkers"][["year", "production_emissions"]] - .reset_index(drop=True) - .rename(columns={"production_emissions": "global_bunker_emissions"}, errors="raise") - ) - - # Check that we get exactly the same array of bunker emissions from the consumption emissions table - # (on years where there is data for bunker emissions in both datasets). - comparison = pr.merge( - global_bunkers_emissions, - tb_consumption[tb_consumption["country"] == "Bunkers"][["year", "consumption_emissions"]] - .reset_index(drop=True) - .rename(columns={"consumption_emissions": "global_bunker_emissions"}, errors="raise"), - how="inner", - on="year", - suffixes=("", "_check"), - ) - - error = "Bunker emissions were expected to coincide in production and consumption emissions tables." - assert (comparison["global_bunker_emissions"] == comparison["global_bunker_emissions_check"]).all(), error - - # Check that all production-based emissions are positive. - error = "There are negative emissions in tb_production (from the additional variables dataset)." - assert (tb_production.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error - - # Check that all production-based emissions from the fossil CO2 dataset are positive. - error = "There are negative emissions in tb_co2 (from the fossil CO2 dataset)." - assert (tb_co2.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error - - # Check that all consumption-based emissions are positive. - error = "There are negative emissions in tb_consumption (from the national emissions dataset)." - assert (tb_consumption.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error - - # Check that, for the World, production emissions coincide with consumption emissions (on common years). - error = "Production and consumption emissions for the world were expected to be identical."
- comparison = pr.merge( - tb_production[tb_production["country"] == "World"].reset_index(drop=True), - tb_consumption[tb_consumption["country"] == "World"].reset_index(drop=True), - how="inner", - on="year", - ) - assert (comparison["production_emissions"] == comparison["consumption_emissions"]).all(), error - - # Check that production emissions for the World coincide with global (historical) emissions (on common years). - comparison = pr.merge( - tb_production[tb_production["country"] == "World"][["year", "production_emissions"]].reset_index(drop=True), - tb_historical[["year", "global_fossil_emissions"]], - how="inner", - on="year", - ) - error = "Production emissions for the world were expected to coincide with global fossil emissions." - assert ( - abs(comparison["production_emissions"] - comparison["global_fossil_emissions"]) - / (comparison["global_fossil_emissions"]) - < 0.001 - ).all(), error - - # Check that emissions in tb_production (emissions from the national excel file) coincide with emissions in tb_co2 - # (from the Fossil CO2 emissions csv file). - # Given that country names have not yet been harmonized, rename the only countries that are present in both datasets. - comparison = pr.merge( - tb_co2[["country", "year", "emissions_total"]], - tb_production.replace({"Bunkers": "International Transport", "World": "Global"}), - on=["country", "year"], - how="inner", - ).dropna(subset=["emissions_total", "production_emissions"], how="any") - # Since we included the emissions from the Kuwaiti oil fires in Kuwait (and they are not included in tb_production), - # omit that row in the comparison. - comparison = comparison.drop( - comparison[(comparison["country"] == "Kuwait") & (comparison["year"] == 1991)].index - ).reset_index(drop=True) - - error = "Production emissions from national file were expected to coincide with the Fossil CO2 emissions dataset." - assert ( - ( - 100 - * abs(comparison["production_emissions"] - comparison["emissions_total"]) - / (comparison["emissions_total"]) - ).fillna(0) - < 0.1 - ).all(), error - - -def sanity_checks_on_output_data(tb_combined: Table) -> None: - """Run sanity checks on output data. - - These checks should be run on the very final output table (with an index) prior to storing it as a table. - - Parameters - ---------- - tb_combined : Table - Combination of all input tables, after processing, harmonization, and addition of variables. - - """ - tb_combined = tb_combined.reset_index() - error = "All variables (except traded emissions, growth, and land-use change) should be >= 0 or nan." - positive_variables = [ - col - for col in tb_combined.columns - if col != "country" - if "traded" not in col - if "growth" not in col - if "land_use" not in col - ] - assert (tb_combined[positive_variables].fillna(0) >= 0).all().all(), error - - error = "Production emissions as a share of global emissions should be 100% for 'World' (within 2% error)." - assert tb_combined[ - (tb_combined["country"] == "World") & (abs(tb_combined["emissions_total_as_share_of_global"] - 100) > 2) - ].empty, error - - error = "Consumption emissions as a share of global emissions should be 100% for 'World' (within 2% error)." - assert tb_combined[ - (tb_combined["country"] == "World") & (abs(tb_combined["consumption_emissions_as_share_of_global"] - 100) > 2) - ].empty, error - - error = "Population as a share of global population should be 100% for 'World'."
- assert tb_combined[ - (tb_combined["country"] == "World") & (tb_combined["population_as_share_of_global"].fillna(100) != 100) - ].empty, error - - error = "All shares of global emissions should be smaller than 100% (within 2% error)." - share_variables = [col for col in tb_combined.columns if "share" in col] - assert (tb_combined[share_variables].fillna(0) <= 102).all().all(), error - - # Check that cumulative variables are monotonically increasing. - # Firstly, list columns of cumulative variables, but ignoring cumulative columns as a share of global - # (since they are not necessarily monotonic) and land-use change (which can be negative). - cumulative_cols = [ - col for col in tb_combined.columns if "cumulative" in col if "share" not in col if "land_use" not in col - ] - # Using ".is_monotonic_increasing" can fail when differences between consecutive numbers are very small. - # Instead, sort data backwards in time, and check that consecutive values of cumulative variables always have - # a percentage change that is smaller than, say, 0.1%. - error = ( - "Cumulative variables (not given as a share of global) should be monotonically increasing (except when " - "including land-use change emissions, which can be negative)." - ) - assert ( - tb_combined.sort_values("year", ascending=False) - .groupby("country") - .agg({col: lambda x: ((x.pct_change().dropna() * 100) <= 0.1).all() for col in cumulative_cols}) - .all() - .all() - ), error - - error = ( - "Production emissions as a share of global production emissions for the World should always be 100% " - "(or larger than 98%, given small discrepancies)." - ) - # Consumption emissions as a share of global production emissions is allowed to be smaller than 100%. - share_variables = [col for col in tb_combined.columns if "share" in col if "consumption" not in col] - assert (tb_combined[tb_combined["country"] == "World"][share_variables].fillna(100) > 98).all().all(), error - - error = "Traded emissions for the World should be close to zero (within 2% error)." - world_mask = tb_combined["country"] == "World" - assert ( - abs( - 100 - * tb_combined[world_mask]["traded_emissions"].fillna(0) - / tb_combined[world_mask]["emissions_total"].fillna(1) - ) - < 2 - ).all(), error - - -def prepare_fossil_co2_emissions(tb_co2: Table) -> Table: - """Prepare Fossil CO2 emissions data (basic processing).""" - # Select and rename columns from fossil CO2 data. - tb_co2 = tb_co2[list(CO2_COLUMNS)].rename(columns=CO2_COLUMNS, errors="raise") - - # Ensure all emissions are given in tonnes of CO2. - tb_co2[EMISSION_SOURCES] *= MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 - - #################################################################################################################### - # NOTE: For certain years, column "emissions_from_other_industry" is not populated for "World" but it is populated - # for some countries (namely China and US). - # This causes the cumulative emissions from other industry as share of global for those countries to become larger - # than 100%. - # This temporary solution fixes the issue: We aggregate the data for China and US on those years when the world's - # data is missing (without touching other years or other columns). - # Firstly, list of years for which the world has no data for emissions_from_other_industry. - world_missing_years = ( - tb_co2[(tb_co2["country"] == "Global") & (tb_co2["emissions_from_other_industry"].isnull())]["year"] - .unique() - .tolist() # type: ignore - ) - # Data that needs to be aggregated.
- data_missing_in_world = tb_co2[ - tb_co2["year"].isin(world_missing_years) & (tb_co2["emissions_from_other_industry"].notnull()) - ] - # Check that there is indeed data to be aggregated (that is missing for the World). - error = ( - "Expected emissions_from_other_industry to be null for the world but not null for certain countries " - "(which was an issue in the original fossil CO2 data). The issue may be fixed and the code can be simplified." - ) - assert len(data_missing_in_world) > 0, error - # Create a table of aggregate data for the World, on those years when it's missing. - aggregated_missing_data = ( - data_missing_in_world.groupby("year") - .agg({"emissions_from_other_industry": "sum"}) - .reset_index() - .assign(**{"country": "Global"}) - ) - # Combine the new table of aggregate data with the main table. - tb_co2 = dataframes.combine_two_overlapping_dataframes( - df1=tb_co2, df2=aggregated_missing_data, index_columns=["country", "year"], keep_column_order=True - ) - # NOTE: The previous function currently does not properly propagate metadata, but keeps only the sources of the - # first table. But given that both tables combined have the same source, we don't need to manually change it. - #################################################################################################################### - - # We add the emissions from "Kuwaiti Oil Fires" (which is also included as a separate country) as part of the - # emissions of Kuwait. This ensures that they will be included in region aggregates. - error = "'Kuwaiti Oil Fires' was expected to only have not-null data for 1991." - assert tb_co2[ - (tb_co2["country"] == "Kuwaiti Oil Fires") - & (tb_co2["emissions_total"].notnull()) - & (tb_co2["emissions_total"] != 0) - ]["year"].tolist() == [1991], error - - tb_co2.loc[(tb_co2["country"] == "Kuwait") & (tb_co2["year"] == 1991), EMISSION_SOURCES] = ( - tb_co2[(tb_co2["country"] == "Kuwaiti Oil Fires") & (tb_co2["year"] == 1991)][EMISSION_SOURCES].values - + tb_co2[(tb_co2["country"] == "Kuwait") & (tb_co2["year"] == 1991)][EMISSION_SOURCES].values - ) - - # Check that "emissions_total" agrees with the sum of emissions from individual sources. - error = "The sum of all emissions should add up to total emissions (within 1%)." - assert ( - abs( - tb_co2.drop(columns=["country", "year", "emissions_total"]).sum(axis=1) - - tb_co2["emissions_total"].fillna(0) - ) - / (tb_co2["emissions_total"].fillna(0) + 1e-7) - < 1e-2 - ).all(), error - - # Many rows have zero total emissions, but actually the individual sources are nan. - # Total emissions in those cases should be nan, instead of zero. - no_individual_emissions = tb_co2.drop(columns=["country", "year", "emissions_total"]).isnull().all(axis=1) - tb_co2.loc[no_individual_emissions, "emissions_total"] = np.nan - - return tb_co2 - - -def prepare_consumption_emissions(tb_consumption: Table) -> Table: - """Prepare consumption-based emissions data (basic processing).""" - # Select and rename columns. - tb_consumption = tb_consumption[list(CONSUMPTION_EMISSIONS_COLUMNS)].rename( - columns=CONSUMPTION_EMISSIONS_COLUMNS, errors="raise" - ) - - # Convert units from megatonnes of carbon per year to tonnes of CO2 per year. - for column in tb_consumption.drop(columns=["country", "year"]).columns: - tb_consumption[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - # List indexes of rows in tb_consumption corresponding to outliers (defined above in OUTLIERS_IN_CONSUMPTION_DF).
- outlier_indexes = [ - tb_consumption[(tb_consumption["country"] == outlier[0]) & (tb_consumption["year"] == outlier[1])].index.item() - for outlier in OUTLIERS_IN_CONSUMPTION_DF - ] - - error = ( - "Outliers were expected to have negative consumption emissions. " - "Maybe outliers have been fixed (and should be removed from the code)." - ) - assert (tb_consumption.loc[outlier_indexes]["consumption_emissions"] < 0).all(), error - - # Remove outliers. - tb_consumption = tb_consumption.drop(outlier_indexes).reset_index(drop=True) - - return tb_consumption - - -def prepare_production_emissions(tb_production: Table) -> Table: - """Prepare production-based emissions data (basic processing).""" - # Convert units from megatonnes of carbon per year to tonnes of CO2 per year. - for column in tb_production.drop(columns=["country", "year"]).columns: - tb_production[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - return tb_production - - -def prepare_land_use_emissions(tb_land_use: Table) -> Table: - """Prepare land-use change emissions data (basic processing).""" - # Convert units from megatonnes of carbon per year to tonnes of CO2 per year. - tb_land_use["emissions"] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - return tb_land_use - - -def prepare_historical_emissions(tb_historical: Table) -> Table: - """Prepare historical emissions data.""" - # Select and rename columns from historical emissions data. - tb_historical = tb_historical[list(HISTORICAL_EMISSIONS_COLUMNS)].rename( - columns=HISTORICAL_EMISSIONS_COLUMNS, errors="raise" - ) - - # Convert units from gigatonnes of carbon per year to tonnes of CO2 per year. - for column in tb_historical.drop(columns=["country", "year"]).columns: - tb_historical[column] *= BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - return tb_historical - - -def extract_global_emissions(tb_co2: Table, tb_historical: Table, ds_population: Dataset) -> Table: - """Extract World emissions by combining data from the Fossil CO2 emissions and the global emissions dataset. - - The resulting global emissions data includes bunker and land-use change emissions. - - NOTE: This function has to be used after selecting and renaming columns in tb_co2, but before harmonizing country - names in tb_co2 (so that "International Transport" is still listed as a country). - - Parameters - ---------- - tb_co2 : Table - Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). - tb_historical : Table - Historical emissions from GCP's official global emissions dataset (excel file). - ds_population : Dataset - Population dataset. - - Returns - ------- - global_emissions : Table - World emissions. - - """ - # For some reason, "International Transport" is included as another country, which only has emissions from oil. - # We separate it as another variable (only given at the global level). - global_transport = tb_co2[tb_co2["country"] == INTERNATIONAL_TRANSPORT_LABEL].reset_index(drop=True) - - # Check that total emissions for international transport coincide with oil emissions. - error = "Total emissions from international transport do not coincide with oil emissions." - assert all((global_transport["emissions_from_oil"] - global_transport["emissions_total"]).dropna() == 0), error - - # Therefore, we can keep only one column for international transport emissions.
- global_transport = (
- global_transport[["year", "emissions_from_oil"]]
- .dropna()
- .rename(columns={"emissions_from_oil": "global_emissions_from_international_transport"}, errors="raise")
- )
-
- # Create a new table of global emissions.
- global_emissions = (
- tb_co2[tb_co2["country"].isin(["Global", "World"])][["year"] + EMISSION_SOURCES]
- .rename(columns={column: f"global_{column}" for column in EMISSION_SOURCES}, errors="raise")
- .sort_values("year")
- .reset_index(drop=True)
- )
-
- # Add bunker fuels to global emissions.
- global_emissions = pr.merge(global_emissions, global_transport, on=["year"], how="outer")
-
- # Add historical land-use change emissions to table of global emissions.
- global_emissions = pr.merge(
- global_emissions, tb_historical[["year", "global_emissions_from_land_use_change"]], how="left", on="year"
- )
-
- # Add variable of total emissions including fossil fuels and land use change.
- global_emissions["global_emissions_total_including_land_use_change"] = (
- global_emissions["global_emissions_total"] + global_emissions["global_emissions_from_land_use_change"]
- )
-
- # Calculate global cumulative emissions.
- for column in EMISSION_SOURCES + ["emissions_from_land_use_change", "emissions_total_including_land_use_change"]:
- global_emissions[f"global_cumulative_{column}"] = global_emissions[f"global_{column}"].cumsum()
-
- # Add a country column.
- global_emissions["country"] = "World"
-
- # Add global population.
- global_emissions = geo.add_population_to_table(
- tb=global_emissions, ds_population=ds_population, population_col="global_population"
- )
-
- return global_emissions
-
-
-def harmonize_country_names(df: Table) -> Table:
- """Harmonize country names, and fix known issues with certain regions.
-
- Parameters
- ----------
- df : Table
- Emissions data (either from the fossil CO2, the production-based, consumption-based, or land-use emissions
- datasets).
-
- Returns
- -------
- df : Table
- Emissions data after harmonizing country names.
-
- """
- # Harmonize country names.
- df = geo.harmonize_countries(
- df=df,
- countries_file=paths.country_mapping_path,
- excluded_countries_file=paths.excluded_countries_path,
- warn_on_missing_countries=True,
- warn_on_unused_countries=False,
- make_missing_countries_nan=False,
- warn_on_unknown_excluded_countries=False,
- )
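Aside: harmonization can map two source entities (e.g. "Pacific Islands (Palau)" and "Palau") to the same name, producing overlapping country-year rows. A toy illustration, with hypothetical data and plain pandas, of why dropping all-empty rows restores uniqueness before the check that follows:

```python
import pandas as pd

# After mapping "Pacific Islands (Palau)" -> "Palau", the year 1991 appears twice,
# but one of the two rows carries no data at all.
df = pd.DataFrame({
    "country": ["Palau", "Palau", "Palau"],
    "year": [1991, 1991, 1992],
    "emissions_total": [0.1, None, 0.2],
})

# Dropping rows that are empty in all data columns removes the spurious duplicate...
check = df.dropna(subset=["emissions_total"], how="all")
# ...so each country-year pair is unique again.
assert check[check.duplicated(subset=["country", "year"])].empty
```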
- # Check that there is only one data point for each country-year.
- # In the fossil CO2 emissions data, after harmonization, "Pacific Islands (Palau)" is mapped to "Palau", and
- # therefore there are rows with different data for the same country-year.
- # However, "Pacific Islands (Palau)" has data until 1991, and "Palau" has data from 1992 onwards.
- # After removing empty rows, there should be no overlap.
- columns_that_must_have_data = df.drop(columns=["country", "year"]).columns
- check = df.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True)
- error = "After harmonizing country names, there is more than one data point for the same country-year."
- assert check[check.duplicated(subset=["country", "year"])].empty, error
-
- df = df.dropna(subset="country").reset_index(drop=True)
-
- return df
-
-
-def combine_data_and_add_variables(
- tb_co2: Table,
- tb_production: Table,
- tb_consumption: Table,
- tb_global_emissions: Table,
- tb_land_use: Table,
- tb_energy: Table,
- ds_gdp: Dataset,
- ds_population: Dataset,
- ds_regions: Dataset,
- ds_income_groups: Dataset,
-) -> Table:
- """Combine all relevant data into one table, add region aggregates, and add custom variables (e.g. emissions per
- capita).
-
- Parameters
- ----------
- tb_co2 : Table
- Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file), after harmonization.
- tb_production : Table
- Production-based emissions from GCP's official national emissions dataset (excel file), after harmonization.
- tb_consumption : Table
- Consumption-based emissions from GCP's official national emissions dataset (excel file), after harmonization.
- tb_global_emissions : Table
- World emissions (including bunker and land-use change emissions).
- tb_land_use : Table
- National land-use change emissions from GCP's official dataset (excel file), after harmonization.
- tb_energy : Table
- Primary energy data.
- ds_gdp : Dataset
- GDP dataset.
- ds_population : Dataset
- Population dataset.
- ds_regions : Dataset
- Regions dataset.
- ds_income_groups : Dataset
- Income groups dataset.
-
- Returns
- -------
- tb_co2_with_regions : Table
- Combined data, with all additional variables and with region aggregates.
-
- """
- tb_co2_with_regions = tb_co2.copy()
-
- # Add region aggregates that were included in the national emissions file, but not in the Fossil CO2 emissions dataset.
- gcp_aggregates = sorted(set(tb_production["country"]) - set(tb_co2_with_regions["country"]))
- tb_co2_with_regions = pr.concat(
- [
- tb_co2_with_regions,
- tb_production[tb_production["country"].isin(gcp_aggregates)]
- .rename(columns={"production_emissions": "emissions_total"})
- .astype({"year": int}),
- ],
- ignore_index=True,
- ).reset_index(drop=True)
-
- # Add consumption emissions to main table (keep only the countries of the main table).
- # Given that additional GCP regions (e.g. "Africa (GCP)") have already been added to tb_co2
- # (when combining with tb_production), all countries from tb_consumption should be included in tb_co2.
- error = "Some countries in tb_consumption are not included in tb_co2."
- assert set(tb_consumption["country"]) < set(tb_co2_with_regions["country"]), error
- tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_consumption, on=["country", "year"], how="outer")
-
- # Add population to original table.
- tb_co2_with_regions = geo.add_population_to_table(
- tb=tb_co2_with_regions, ds_population=ds_population, warn_on_missing_countries=False
- )
-
- # Add GDP to main table.
- tb_co2_with_regions = geo.add_gdp_to_table(tb=tb_co2_with_regions, ds_gdp=ds_gdp)
-
- # Add primary energy to main table.
- tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_energy, on=["country", "year"], how="left")
-
- # For convenience, rename columns in land-use change emissions data.
- tb_land_use = tb_land_use.rename(
- columns={"emissions": "emissions_from_land_use_change", "quality_flag": "land_use_change_quality_flag"}
- )
-
- # Land-use change data does not include data for the World. Include it by merging with the global dataset.
- tb_land_use = pr.concat( - [ - tb_land_use, - tb_global_emissions.rename( - columns={"global_emissions_from_land_use_change": "emissions_from_land_use_change"} - )[["year", "emissions_from_land_use_change"]] - .dropna() - .assign(**{"country": "World"}), - ], - ignore_index=True, - ).astype({"year": int}) - - # Add land-use change emissions to main table. - tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_land_use, on=["country", "year"], how="outer") - - # Add total emissions (including land-use change) for each country. - tb_co2_with_regions["emissions_total_including_land_use_change"] = ( - tb_co2_with_regions["emissions_total"] + tb_co2_with_regions["emissions_from_land_use_change"] - ) - - # Create a copy of the current table, to be able to copy its metadata after adding region aggregates. - _tb_co2_with_regions = tb_co2_with_regions.copy() - - # Add region aggregates. - # Aggregate not only emissions data, but also population, gdp and primary energy. - # This way we ensure that custom regions (e.g. "North America (excl. USA)") will have all required data. - aggregations = { - column: "sum" - for column in tb_co2_with_regions.columns - if column not in ["country", "year", "land_use_change_quality_flag"] - } - for region in REGIONS: - countries_in_region = geo.list_members_of_region( - region=region, - ds_regions=ds_regions, - ds_income_groups=ds_income_groups, - additional_regions=REGIONS[region].get("additional_regions", None), - excluded_regions=REGIONS[region].get("excluded_regions", None), - additional_members=REGIONS[region].get("additional_members", None), - excluded_members=REGIONS[region].get("excluded_members", None), - include_historical_regions_in_income_groups=True, - ) - tb_co2_with_regions = geo.add_region_aggregates( - df=tb_co2_with_regions, - region=region, - countries_in_region=countries_in_region, - countries_that_must_have_data=[], - frac_allowed_nans_per_year=0.999, - aggregations=aggregations, - ) - - # NOTE: The previous operation does not preserve metadata. Copy metadata of original table. - tb_co2_with_regions = tb_co2_with_regions.copy_metadata(from_table=_tb_co2_with_regions) - - # Add global emissions and global cumulative emissions columns to main table. - tb_co2_with_regions = pr.merge( - tb_co2_with_regions, tb_global_emissions.drop(columns="country"), on=["year"], how="left" - ) - - # Ensure main table is sorted (so that cumulative emissions are properly calculated). - tb_co2_with_regions = tb_co2_with_regions.sort_values(["country", "year"]).reset_index(drop=True) - - # Temporarily add certain global emissions variables. - # This is done simply to be able to consider "consumption_emissions" as just another type of emission - # when creating additional variables. - tb_co2_with_regions["global_consumption_emissions"] = tb_co2_with_regions["global_emissions_total"] - tb_co2_with_regions["global_cumulative_consumption_emissions"] = tb_co2_with_regions[ - "global_cumulative_emissions_total" - ] - - # Add new variables for each source of emissions. - for column in EMISSION_SOURCES + [ - "consumption_emissions", - "emissions_from_land_use_change", - "emissions_total_including_land_use_change", - ]: - # Add per-capita variables. - tb_co2_with_regions[f"{column}_per_capita"] = tb_co2_with_regions[column] / tb_co2_with_regions["population"] - - # Add columns for cumulative emissions. - # Rows that had nan emissions will have nan cumulative emissions. - # But nans will not be propagated in the sum. 
- # In other words, a country with some (but not all) nan values gets the cumulative sum of its informed rows
- # (nans are skipped in the running total), while the uninformed rows themselves remain nan (see the short
- # sketch below).
- # NOTE: Currently, this operation doesn't propagate metadata properly. This has to be done manually.
- tb_co2_with_regions[f"cumulative_{column}"] = (
- tb_co2_with_regions.groupby(["country"])[column].cumsum().copy_metadata(tb_co2_with_regions[column])
- )
-
- # Add share of global emissions.
- tb_co2_with_regions[f"{column}_as_share_of_global"] = (
- 100 * tb_co2_with_regions[column] / tb_co2_with_regions[f"global_{column}"]
- )
-
- # Add share of global cumulative emissions.
- tb_co2_with_regions[f"cumulative_{column}_as_share_of_global"] = (
- 100 * tb_co2_with_regions[f"cumulative_{column}"] / tb_co2_with_regions[f"global_cumulative_{column}"]
- )
-
- # Add total emissions per unit energy (in kg of emissions per kWh).
- tb_co2_with_regions["emissions_total_per_unit_energy"] = (
- TONNES_OF_CO2_TO_KG_OF_CO2
- * tb_co2_with_regions["emissions_total"]
- / (tb_co2_with_regions["primary_energy_consumption"] * TWH_TO_KWH)
- )
-
- # Add total emissions (including land-use change) per unit energy (in kg of emissions per kWh).
- tb_co2_with_regions["emissions_total_including_land_use_change_per_unit_energy"] = (
- TONNES_OF_CO2_TO_KG_OF_CO2
- * tb_co2_with_regions["emissions_total_including_land_use_change"]
- / (tb_co2_with_regions["primary_energy_consumption"] * TWH_TO_KWH)
- )
-
- # Add total emissions per unit GDP.
- tb_co2_with_regions["emissions_total_per_gdp"] = (
- TONNES_OF_CO2_TO_KG_OF_CO2 * tb_co2_with_regions["emissions_total"] / tb_co2_with_regions["gdp"]
- )
-
- # Add total emissions (including land-use change) per unit GDP.
- tb_co2_with_regions["emissions_total_including_land_use_change_per_gdp"] = (
- TONNES_OF_CO2_TO_KG_OF_CO2
- * tb_co2_with_regions["emissions_total_including_land_use_change"]
- / tb_co2_with_regions["gdp"]
- )
-
- # Add total consumption emissions per unit GDP.
- tb_co2_with_regions["consumption_emissions_per_gdp"] = (
- TONNES_OF_CO2_TO_KG_OF_CO2 * tb_co2_with_regions["consumption_emissions"] / tb_co2_with_regions["gdp"]
- )
-
- # Add variable of emissions embedded in trade.
- tb_co2_with_regions["traded_emissions"] = (
- tb_co2_with_regions["consumption_emissions"] - tb_co2_with_regions["emissions_total"]
- )
- tb_co2_with_regions["pct_traded_emissions"] = (
- 100 * tb_co2_with_regions["traded_emissions"] / tb_co2_with_regions["emissions_total"]
- )
- tb_co2_with_regions["traded_emissions_per_capita"] = (
- tb_co2_with_regions["traded_emissions"] / tb_co2_with_regions["population"]
- )
-
- # Add variable of emissions embedded in trade, including land-use change emissions.
- tb_co2_with_regions["traded_emissions_including_land_use_change"] = (
- tb_co2_with_regions["consumption_emissions"] - tb_co2_with_regions["emissions_total_including_land_use_change"]
- )
- tb_co2_with_regions["pct_traded_emissions_including_land_use_change"] = (
- 100
- * tb_co2_with_regions["traded_emissions_including_land_use_change"]
- / tb_co2_with_regions["emissions_total_including_land_use_change"]
- )
- tb_co2_with_regions["traded_emissions_including_land_use_change_per_capita"] = (
- tb_co2_with_regions["traded_emissions_including_land_use_change"] / tb_co2_with_regions["population"]
- )
-
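Aside: a minimal pandas illustration of the nan behavior described in the cumulative-emissions comment above (toy data, plain pandas rather than owid Tables):

```python
import pandas as pd

df = pd.DataFrame({
    "country": ["A", "A", "A", "A"],
    "emissions": [1.0, None, 2.0, None],
})
# Nans are skipped in the running total but preserved in the output:
# the result is [1.0, NaN, 3.0, NaN].
df["cumulative"] = df.groupby("country")["emissions"].cumsum()
```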
- # Remove temporary columns.
- tb_co2_with_regions = tb_co2_with_regions.drop(
- columns=["global_consumption_emissions", "global_cumulative_consumption_emissions"]
- )
-
- # Add annual percentage growth of total emissions.
- # NOTE: Currently, this operation doesn't propagate metadata properly. This has to be done manually.
- tb_co2_with_regions["pct_growth_emissions_total"] = (
- tb_co2_with_regions.groupby("country")["emissions_total"].pct_change() * 100
- ).copy_metadata(tb_co2_with_regions["emissions_total"])
-
- # Add annual percentage growth of total emissions (including land-use change).
- # NOTE: Currently, this operation doesn't propagate metadata properly. This has to be done manually.
- tb_co2_with_regions["pct_growth_emissions_total_including_land_use_change"] = (
- tb_co2_with_regions.groupby("country")["emissions_total_including_land_use_change"].pct_change() * 100
- ).copy_metadata(tb_co2_with_regions["emissions_total_including_land_use_change"])
-
- # Add annual absolute growth of total emissions.
- tb_co2_with_regions["growth_emissions_total"] = tb_co2_with_regions.groupby("country")["emissions_total"].diff()
-
- # Add annual absolute growth of total emissions (including land-use change).
- tb_co2_with_regions["growth_emissions_total_including_land_use_change"] = tb_co2_with_regions.groupby("country")[
- "emissions_total_including_land_use_change"
- ].diff()
-
- # Create variable of population as a share of global population.
- tb_co2_with_regions["population_as_share_of_global"] = (
- tb_co2_with_regions["population"] / tb_co2_with_regions["global_population"] * 100
- )
-
- # Replace infinity values (for example when calculating growth from zero to non-zero) in the data with nan.
- for column in tb_co2_with_regions.drop(columns=["country", "year"]).columns:
- tb_co2_with_regions.loc[np.isinf(tb_co2_with_regions[column]), column] = np.nan
-
- # For special GCP countries/regions (e.g. "Africa (GCP)") we should keep only the original data.
- # Therefore, set all additional variables to nan for those countries/regions, and keep only GCP's original data.
- added_variables = tb_co2_with_regions.drop(
- columns=["country", "year"] + COLUMNS_THAT_MUST_HAVE_DATA
- ).columns.tolist()
- tb_co2_with_regions.loc[
- (tb_co2_with_regions["country"].str.contains(" (GCP)", regex=False)), added_variables
- ] = np.nan
-
- # Remove uninformative rows (those that have only data for, say, gdp, but not for variables related to emissions).
- tb_co2_with_regions = tb_co2_with_regions.dropna(subset=COLUMNS_THAT_MUST_HAVE_DATA, how="all").reset_index(
- drop=True
- )
-
- # Set an appropriate index, ensure there are no rows that only have nan, and sort conveniently.
- tb_co2_with_regions = tb_co2_with_regions.set_index(["country", "year"], verify_integrity=True)
- tb_co2_with_regions = (
- tb_co2_with_regions.dropna(subset=tb_co2_with_regions.columns, how="all").sort_index().sort_index(axis=1)
- )
-
- # Rename table.
- tb_co2_with_regions.metadata.short_name = paths.short_name
-
- return tb_co2_with_regions
-
-
-def run(dest_dir: str) -> None:
- #
- # Load inputs.
- #
- # Load meadow dataset and read all its tables.
- ds_meadow: Dataset = paths.load_dependency("global_carbon_budget")
- tb_co2 = ds_meadow["global_carbon_budget_fossil_co2_emissions"].reset_index()
- tb_historical = ds_meadow["global_carbon_budget_historical_budget"].reset_index()
- tb_consumption = ds_meadow["global_carbon_budget_consumption_emissions"].reset_index()
- tb_production = ds_meadow["global_carbon_budget_production_emissions"].reset_index()
- tb_land_use = ds_meadow["global_carbon_budget_land_use_change"].reset_index()
-
- # Load primary energy consumption dataset and read its main table.
- ds_energy: Dataset = paths.load_dependency("primary_energy_consumption")
- tb_energy = ds_energy["primary_energy_consumption"].reset_index()
-
- # Load GDP dataset.
- ds_gdp: Dataset = paths.load_dependency("ggdc_maddison")
-
- # Load population dataset.
- ds_population: Dataset = paths.load_dependency("population")
-
- # Load regions dataset.
- ds_regions: Dataset = paths.load_dependency("regions")
-
- # Load income groups dataset.
- ds_income_groups: Dataset = paths.load_dependency("income_groups")
-
- #
- # Process data.
- #
- # Prepare fossil CO2 emissions data.
- tb_co2 = prepare_fossil_co2_emissions(tb_co2=tb_co2)
-
- # Prepare consumption-based emissions data.
- tb_consumption = prepare_consumption_emissions(tb_consumption=tb_consumption)
-
- # Prepare production-based emissions data.
- tb_production = prepare_production_emissions(tb_production=tb_production)
-
- # Prepare land-use change emissions data.
- tb_land_use = prepare_land_use_emissions(tb_land_use=tb_land_use)
-
- # Select and rename columns from primary energy data.
- tb_energy = tb_energy[list(PRIMARY_ENERGY_COLUMNS)].rename(columns=PRIMARY_ENERGY_COLUMNS, errors="raise")
-
- # Prepare historical emissions data.
- tb_historical = prepare_historical_emissions(tb_historical=tb_historical)
-
- # Run sanity checks on input data.
- sanity_checks_on_input_data(
- tb_production=tb_production, tb_consumption=tb_consumption, tb_historical=tb_historical, tb_co2=tb_co2
- )
-
- # For some reason, "International Transport" is included as another country, which only has emissions from oil.
- # Extract that data and remove it from the rest of national emissions.
- tb_global_emissions = extract_global_emissions(
- tb_co2=tb_co2, tb_historical=tb_historical, ds_population=ds_population
- )
-
- # Harmonize country names.
- tb_co2 = harmonize_country_names(df=tb_co2)
- tb_consumption = harmonize_country_names(df=tb_consumption)
- tb_production = harmonize_country_names(df=tb_production)
- tb_land_use = harmonize_country_names(df=tb_land_use)
-
- # Add new variables to main table (consumption-based emissions, emission intensity, per-capita emissions, etc.).
- tb_combined = combine_data_and_add_variables(
- tb_co2=tb_co2,
- tb_production=tb_production,
- tb_consumption=tb_consumption,
- tb_global_emissions=tb_global_emissions,
- tb_land_use=tb_land_use,
- tb_energy=tb_energy,
- ds_gdp=ds_gdp,
- ds_population=ds_population,
- ds_regions=ds_regions,
- ds_income_groups=ds_income_groups,
- )
-
- # Run sanity checks on output data.
- sanity_checks_on_output_data(tb_combined)
-
- #
- # Save outputs.
- #
- # Create a new garden dataset and use metadata from meadow dataset.
- ds_garden = create_dataset( - dest_dir=dest_dir, tables=[tb_combined], default_metadata=ds_meadow.metadata, check_variables_metadata=True - ) - ds_garden.save() diff --git a/etl/steps/archive/garden/gcp/2023-09-28/global_carbon_budget.countries.json b/etl/steps/archive/garden/gcp/2023-09-28/global_carbon_budget.countries.json deleted file mode 100644 index abaab52fe1b..00000000000 --- a/etl/steps/archive/garden/gcp/2023-09-28/global_carbon_budget.countries.json +++ /dev/null @@ -1,278 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Africa": "Africa (GCP)", - "Albania": "Albania", - "Algeria": "Algeria", - "American Samoa": "American Samoa", - "Andorra": "Andorra", - "Angola": "Angola", - "Anguilla": "Anguilla", - "Antarctica": "Antarctica", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Asia": "Asia (GCP)", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Bolivia (Plurinational State of)": "Bolivia", - "Bonaire, Saint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", - "Bonaire, Sint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "British Virgin Islands": "British Virgin Islands", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Bunkers": "International transport", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cape Verde": "Cape Verde", - "Central African Republic": "Central African Republic", - "Central America": "Central America (GCP)", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Christmas Island": "Christmas Island", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo": "Congo", - "Congo, Democratic Republic of the": "Democratic Republic of Congo", - "Cook Islands": "Cook Islands", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cura\u00e7ao": "Curacao", - "Cyprus": "Cyprus", - "Czech Republic": "Czechia", - "Czechia": "Czechia", - "C\u00f4te d'Ivoire": "Cote d'Ivoire", - "Democratic Republic of the Congo": "Democratic Republic of Congo", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "EU27": "European Union (27) (GCP)", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Europe": "Europe (GCP)", - "Faeroe Islands": "Faroe Islands", - "Falkland Islands (Malvinas)": "Falkland Islands", - "Faroe Islands": "Faroe Islands", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Equatorial Africa": "French Equatorial Africa (GCP)", - "French Guiana": "French Guiana", - "French Polynesia": "French Polynesia", - "French West Africa": "French West Africa (GCP)", - "Gabon": "Gabon", - "Gambia": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Global": "World", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - 
"Guadeloupe": "Guadeloupe", - "Guatemala": "Guatemala", - "Guernsey": "Guernsey", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hong Kong": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "International Transport": "International transport", - "Iran": "Iran", - "Iran (Islamic Republic of)": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Isle of Man": "Isle of Man", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jersey": "Jersey", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Korea (Democratic People's Republic of)": "North Korea", - "Korea, Republic of": "South Korea", - "Kosovo": "Kosovo", - "Kuwait": "Kuwait", - "Kuwaiti Oil Fires": "Kuwaiti Oil Fires (GCP)", - "Kyrgyzstan": "Kyrgyzstan", - "Lao People's Democratic Republic": "Laos", - "Laos": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Leeward Islands": "Leeward Islands (GCP)", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Liechtenstein": "Liechtenstein", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macao": "Macao", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mayotte": "Mayotte", - "Mexico": "Mexico", - "Micronesia (Federated States of)": "Micronesia (country)", - "Middle East": "Middle East (GCP)", - "Moldova": "Moldova", - "Moldova, Republic of": "Moldova", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "Netherlands Antilles": "Netherlands Antilles", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "Non-OECD": "Non-OECD (GCP)", - "North America": "North America (GCP)", - "North Korea": "North Korea", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "OECD": "OECD (GCP)", - "Occupied Palestinian Territory": "Palestine", - "Oceania": "Oceania (GCP)", - "Oman": "Oman", - "Pacific Islands (Palau)": "Palau", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Palestine, State of": "Palestine", - "Panama": "Panama", - "Panama Canal Zone": "Panama Canal Zone (GCP)", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Romania": "Romania", - "Russia": "Russia", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "Ryukyu Islands": "Ryukyu Islands (GCP)", - "R\u00e9union": "Reunion", - "Saint Helena": "Saint Helena", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Martin (French part)": "Saint Martin (French part)", - "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", 
- "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South America": "South America (GCP)", - "South Korea": "South Korea", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "St. Kitts-Nevis-Anguilla": "St. Kitts-Nevis-Anguilla (GCP)", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Svalbard and Jan Mayen": "Svalbard and Jan Mayen", - "Swaziland": "Eswatini", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syria": "Syria", - "Syrian Arab Republic": "Syria", - "Taiwan": "Taiwan", - "Taiwan, Province of China": "Taiwan", - "Tajikistan": "Tajikistan", - "Tanzania": "Tanzania", - "Tanzania, United Republic of": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Turks and Caicos Islands": "Turks and Caicos Islands", - "Tuvalu": "Tuvalu", - "USA": "United States", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", - "United States of America": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela": "Venezuela", - "Venezuela (Bolivarian Republic of)": "Venezuela", - "Viet Nam": "Vietnam", - "Virgin Islands (U.S.)": "United States Virgin Islands", - "Wallis and Futuna Islands": "Wallis and Futuna", - "Western Sahara": "Western Sahara", - "World": "World", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "\u00c5land Islands": "Aland Islands" -} diff --git a/etl/steps/archive/garden/gcp/2023-09-28/global_carbon_budget.excluded_countries.json b/etl/steps/archive/garden/gcp/2023-09-28/global_carbon_budget.excluded_countries.json deleted file mode 100644 index e7a16636a61..00000000000 --- a/etl/steps/archive/garden/gcp/2023-09-28/global_carbon_budget.excluded_countries.json +++ /dev/null @@ -1,4 +0,0 @@ -[ - "KP Annex B", - "Non KP Annex B" -] \ No newline at end of file diff --git a/etl/steps/archive/garden/gcp/2023-09-28/global_carbon_budget.meta.yml b/etl/steps/archive/garden/gcp/2023-09-28/global_carbon_budget.meta.yml deleted file mode 100644 index cffd80650a8..00000000000 --- a/etl/steps/archive/garden/gcp/2023-09-28/global_carbon_budget.meta.yml +++ /dev/null @@ -1,600 +0,0 @@ -definitions: - production_emissions_description_key: &production_emissions_description_key - - This data is based on territorial emissions, which do not account for emissions embedded in traded goods. - traded_emissions_description_key: &traded_emissions_description_key - - Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter. - international_aviation_description_key: &international_aviation_description_key - - Emissions from international aviation and shipping are not included in any country or region's emissions. They are only included in the global total emissions. 
- consumption_emissions_description_key: &consumption_emissions_description_key
- - Consumption-based emissions attribute the emissions generated in the production of goods and services according to where they were _consumed_, rather than where they were _produced_.
- - "The data is calculated by adjusting 'production-based' emissions (emissions produced domestically) for trade: Consumption-based emissions equals production-based emissions, _minus_ emissions embedded in exports, _plus_ emissions embedded in imports."
- - If a country's consumption-based emissions are higher than its production emissions, it is a net importer of carbon dioxide. If its consumption-based emissions are lower, then it is a net exporter.
- - Consumption-based emissions are not available for all countries because not all countries have sufficient, high-quality trade data. But those without complete data are a small fraction (3%) of the global total.
- - This data measures carbon dioxide (CO₂) emissions from fossil fuels and industry and does not include emissions from land use change, deforestation, soils, or vegetation.
- per_capita_description_key: &per_capita_description_key
- - Per capita emissions represent the emissions of an average person in a country or region - they are total emissions divided by population.
- # Common fields to be used in all indicators (unless overridden for specific indicators below).
- common:
- description_processing: &description_processing |
- - Data on global emissions has been converted from tonnes of carbon to tonnes of carbon dioxide (CO₂) using a conversion factor of 3.664.
- - Emissions from the Kuwaiti oil fires in 1991 have been included as part of Kuwait's emissions for that year.
- - A country's share of the global population is calculated using our population dataset, based on [different sources](https://ourworldindata.org/population-sources).
- - Each country's share of global CO₂ emissions from flaring has been calculated using global CO₂ emissions from flaring provided in the Global Carbon Budget dataset.
- description_key:
- # NOTE: The description key points are re-defined for each indicator on consumption-based emissions and traded emissions, as well as on per-capita indicators.
- - *production_emissions_description_key
- - *international_aviation_description_key
- presentation:
- topic_tags:
- - CO2 & Greenhouse Gas Emissions
-
-dataset:
- title: Global Carbon Budget
- update_period_days: 365
-
-tables:
- global_carbon_budget:
- variables:
- consumption_emissions:
- title: "Annual consumption-based CO₂ emissions"
- unit: "tonnes"
- short_unit: "t"
- description_short: Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes.
- description_key:
- - *consumption_emissions_description_key
- - *international_aviation_description_key
- consumption_emissions_as_share_of_global:
- title: "Share of global annual CO₂ consumption-based emissions"
- unit: "%"
- short_unit: "%"
- description_short: "Annual consumption-based emissions of carbon dioxide (CO₂), measured as a percentage of global consumption-based emissions of CO₂ in the same year."
- description_key:
- - *consumption_emissions_description_key
- - *international_aviation_description_key
- ##################################################################################################################
- # Curated indicator for data page.
- consumption_emissions_per_capita: - title: Per capita consumption-based CO₂ emissions - description_short: | - Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes per person. - description_key: - - *consumption_emissions_description_key - - *per_capita_description_key - - *international_aviation_description_key - description_processing: *description_processing - unit: tonnes per person - short_unit: t/person - display: - shortUnit: t - numDecimalPlaces: 0 - presentation: - attribution_short: Global Carbon Project - topic_tags: - - CO2 & Greenhouse Gas Emissions - - Climate Change - - Energy - faqs: - - fragment_id: emissions-from-aviation-and-shipping - gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw - - fragment_id: missing-consumption-based-emissions - gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw - grapher_config: - subtitle: >- - [Consumption-based emissions](#dod:consumptionbasedemissions) are national - emissions that have been adjusted for trade. It's production-based emissions - minus emissions embedded in exports, plus emissions embedded in imports. - hideAnnotationFieldsInTitle: - time: true - entity: true - changeInPrefix: true - hideRelativeToggle: false - hasMapTab: true - tab: map - originUrl: https://ourworldindata.org/co2-and-greenhouse-gas-emissions - colorScale: - binningStrategy: equalInterval - map: - colorScale: - baseColorScheme: Reds - binningStrategy: manual - customNumericValues: - - 1 - - 2 - - 5 - - 10 - - 20 - - 50 - customNumericColors: - - null - - null - selectedEntityNames: - - United States - - United Kingdom - - European Union (27) - - China - - India - - Australia - - Brazil - - South Africa - relatedQuestions: - - url: https://ourworldindata.org/grapher/consumption-co2-per-capita#faqs - text: FAQs on this data - consumption_emissions_per_gdp: - title: "Annual consumption-based CO₂ emissions per GDP (kg per international-$)" - unit: "kilograms per international-$" - short_unit: "kg/$" - description_short: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in kilograms per dollar of GDP (2011 international-$)." - description_key: - - *consumption_emissions_description_key - - *international_aviation_description_key - cumulative_consumption_emissions: - title: "Cumulative CO₂ consumption-based emissions" - unit: "tonnes" - short_unit: "t" - description_short: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of available data, measured in tonnes." - description_key: - - *consumption_emissions_description_key - - *international_aviation_description_key - cumulative_consumption_emissions_as_share_of_global: - title: "Share of global cumulative CO₂ consumption-based emissions" - unit: "%" - short_unit: "%" - description_short: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of available data, measured as a percentage of global cumulative consumption-based emissions." - description_key: - - *consumption_emissions_description_key - - *international_aviation_description_key - cumulative_emissions_from_cement: - title: "Cumulative CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from cement since the first year of available data, measured in tonnes." 
- cumulative_emissions_from_cement_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from cement" - unit: "%" - short_unit: "%" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from cement since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from cement." - cumulative_emissions_from_coal: - title: "Cumulative CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from coal since the first year of available data, measured in tonnes." - cumulative_emissions_from_coal_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from coal" - unit: "%" - short_unit: "%" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from coal since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from coal." - cumulative_emissions_from_flaring: - title: "Cumulative CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from flaring since the first year of available data, measured in tonnes." - cumulative_emissions_from_flaring_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from flaring" - unit: "%" - short_unit: "%" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from flaring since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from flaring." - cumulative_emissions_from_gas: - title: "Cumulative CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from gas since the first year of available data, measured in tonnes." - cumulative_emissions_from_gas_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from gas" - unit: "%" - short_unit: "%" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from gas since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from gas." - cumulative_emissions_from_land_use_change: - title: "Cumulative CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from land-use change since the first year of available data, measured in tonnes." - cumulative_emissions_from_land_use_change_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from land-use change" - unit: "%" - short_unit: "%" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from land-use change since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from land-use change." - cumulative_emissions_from_oil: - title: "Cumulative CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from oil since the first year of available data, measured in tonnes." - cumulative_emissions_from_oil_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from oil" - unit: "%" - short_unit: "%" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from oil since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from oil." 
- cumulative_emissions_from_other_industry: - title: "Cumulative CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from other industry sources since the first year of available data, measured in tonnes." - cumulative_emissions_from_other_industry_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from other industry" - unit: "%" - short_unit: "%" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from other industry sources since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from other industry sources." - cumulative_emissions_total: - title: "Cumulative CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description_short: "Total cumulative emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of available data, measured in tonnes." - cumulative_emissions_total_as_share_of_global: - title: "Share of global cumulative CO₂ emissions" - unit: "%" - short_unit: "%" - description_short: "Total cumulative emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of available data, measured as a percentage of global total cumulative emissions of CO₂." - cumulative_emissions_total_including_land_use_change: - title: "Cumulative CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description_short: "Total cumulative emissions of carbon dioxide (CO₂), including land-use change, since the first year of available data, measured in tonnes." - cumulative_emissions_total_including_land_use_change_as_share_of_global: - title: "Share of global cumulative CO₂ emissions including land-use change" - unit: "%" - short_unit: "%" - description_short: "Total cumulative emissions of carbon dioxide (CO₂), including land-use change, since the first year of available data, measured as a percentage of global total cumulative emissions of CO₂ (including land-use change)." - emissions_from_cement: - title: "Annual CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - description_short: "Annual emissions of carbon dioxide (CO₂) from cement, measured in tonnes." - emissions_from_cement_as_share_of_global: - title: "Share of global annual CO₂ emissions from cement" - unit: "%" - short_unit: "%" - description_short: "Annual emissions of carbon dioxide (CO₂) from cement, measured as a percentage of global emissions of CO₂ from cement in the same year." - emissions_from_cement_per_capita: - title: "Annual CO₂ emissions from cement (per capita)" - unit: "tonnes per person" - short_unit: "t/person" - display: - shortUnit: t - description_short: "Annual emissions of carbon dioxide (CO₂) from cement, measured in tonnes per person." - description_key: - - *per_capita_description_key - - *production_emissions_description_key - - *international_aviation_description_key - emissions_from_coal: - title: "Annual CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - description_short: "Annual emissions of carbon dioxide (CO₂) from coal, measured in tonnes." - emissions_from_coal_as_share_of_global: - title: "Share of global annual CO₂ emissions from coal" - unit: "%" - short_unit: "%" - description_short: "Annual emissions of carbon dioxide (CO₂) from coal, measured as a percentage of global emissions of CO₂ from coal in the same year." 
- emissions_from_coal_per_capita: - title: "Annual CO₂ emissions from coal (per capita)" - unit: "tonnes per person" - short_unit: "t/person" - display: - shortUnit: t - description_short: "Annual emissions of carbon dioxide (CO₂) from coal, measured in tonnes per person." - description_key: - - *per_capita_description_key - - *production_emissions_description_key - - *international_aviation_description_key - emissions_from_flaring: - title: "Annual CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - description_short: "Annual emissions of carbon dioxide (CO₂) from flaring, measured in tonnes." - emissions_from_flaring_as_share_of_global: - title: "Share of global annual CO₂ emissions from flaring" - unit: "%" - short_unit: "%" - description_short: "Annual emissions of carbon dioxide (CO₂) from flaring, measured as a percentage of global emissions of CO₂ from flaring in the same year." - emissions_from_flaring_per_capita: - title: "Annual CO₂ emissions from flaring (per capita)" - unit: "tonnes per person" - short_unit: "t/person" - display: - shortUnit: t - description_short: "Annual emissions of carbon dioxide (CO₂) from flaring, measured in tonnes per person." - description_key: - - *per_capita_description_key - - *production_emissions_description_key - - *international_aviation_description_key - emissions_from_gas: - title: "Annual CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - description_short: "Annual emissions of carbon dioxide (CO₂) from gas, measured in tonnes." - emissions_from_gas_as_share_of_global: - title: "Share of global annual CO₂ emissions from gas" - unit: "%" - short_unit: "%" - description_short: "Annual emissions of carbon dioxide (CO₂) from gas, measured as a percentage of global emissions of CO₂ from gas in the same year." - emissions_from_gas_per_capita: - title: "Annual CO₂ emissions from gas (per capita)" - unit: "tonnes per person" - short_unit: "t/person" - display: - shortUnit: t - description_short: "Annual emissions of carbon dioxide (CO₂) from gas, measured in tonnes per person." - description_key: - - *per_capita_description_key - - *production_emissions_description_key - - *international_aviation_description_key - emissions_from_land_use_change: - title: "Annual CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - description_short: "Annual emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes." - emissions_from_land_use_change_as_share_of_global: - title: "Share of global annual CO₂ emissions from land-use change" - unit: "%" - short_unit: "%" - description_short: "Annual emissions of carbon dioxide (CO₂) from land-use change, measured as a percentage of global emissions of CO₂ from land-use change in the same year." - emissions_from_land_use_change_per_capita: - title: "Annual CO₂ emissions from land-use change per capita" - unit: "tonnes per person" - short_unit: "t/person" - display: - shortUnit: t - description_short: "Annual emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes per person." - description_key: - - *per_capita_description_key - - *production_emissions_description_key - - *international_aviation_description_key - emissions_from_oil: - title: "Annual CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - description_short: "Annual emissions of carbon dioxide (CO₂) from oil, measured in tonnes." 
- emissions_from_oil_as_share_of_global: - title: "Share of global annual CO₂ emissions from oil" - unit: "%" - short_unit: "%" - description_short: "Annual emissions of carbon dioxide (CO₂) from oil, measured as a percentage of global emissions of CO₂ from oil in the same year." - emissions_from_oil_per_capita: - title: "Annual CO₂ emissions from oil (per capita)" - unit: "tonnes per person" - short_unit: "t/person" - display: - shortUnit: t - description_short: "Annual emissions of carbon dioxide (CO₂) from oil, measured in tonnes per person." - description_key: - - *per_capita_description_key - - *production_emissions_description_key - - *international_aviation_description_key - emissions_from_other_industry: - title: "Annual CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - description_short: "Annual emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes." - emissions_from_other_industry_as_share_of_global: - title: "Share of global annual CO₂ emissions from other industry" - unit: "%" - short_unit: "%" - description_short: "Annual emissions of carbon dioxide (CO₂) from other industry sources, measured as a percentage of global emissions of CO₂ from other industry sources in the same year." - emissions_from_other_industry_per_capita: - title: "Annual CO₂ emissions from other industry (per capita)" - unit: "tonnes per person" - short_unit: "t/person" - display: - shortUnit: t - description_short: "Annual emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes per person." - description_key: - - *per_capita_description_key - - *production_emissions_description_key - - *international_aviation_description_key - emissions_total: - title: "Annual CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes." - emissions_total_as_share_of_global: - title: "Share of global annual CO₂ emissions" - unit: "%" - short_unit: "%" - description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured as a percentage of global emissions of CO₂ in the same year." - emissions_total_including_land_use_change: - title: "Annual CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes." - emissions_total_including_land_use_change_as_share_of_global: - title: "Share of global annual CO₂ emissions including land-use change" - unit: "%" - short_unit: "%" - description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured as a percentage of global total emissions of CO₂ in the same year." - emissions_total_including_land_use_change_per_capita: - title: "Annual CO₂ emissions including land-use change per capita" - unit: "tonnes per person" - short_unit: "t/person" - display: - shortUnit: t - description_short: "Annual emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes per person." 
- description_key: - - *per_capita_description_key - - *production_emissions_description_key - - *international_aviation_description_key - emissions_total_including_land_use_change_per_gdp: - title: "Annual CO₂ emissions including land-use change per GDP" - unit: "kilograms per international-$" - short_unit: "kg/$" - description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per dollar of GDP (2011 international-$)." - emissions_total_including_land_use_change_per_unit_energy: - title: "Annual CO₂ emissions including land-use change per unit energy" - unit: "kilograms per kilowatt-hour" - short_unit: "kg/kWh" - description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per kilowatt-hour of primary energy consumption." - emissions_total_per_capita: - title: "Annual CO₂ emissions (per capita)" - unit: "tonnes per person" - short_unit: "t/person" - display: - shortUnit: t - description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes per person." - description_key: - - *per_capita_description_key - - *production_emissions_description_key - - *international_aviation_description_key - emissions_total_per_gdp: - title: "Annual CO₂ emissions per GDP (kg per international-$)" - unit: "kilograms per international-$" - short_unit: "kg/$" - description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per dollar of GDP (2011 international-$)." - emissions_total_per_unit_energy: - title: "Annual CO₂ emissions per unit energy (kg per kilowatt-hour)" - unit: "kilograms per kilowatt-hour" - short_unit: "kg/kWh" - description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per kilowatt-hour of primary energy consumption." - gdp: - title: "GDP" - unit: "2011 international-$" - short_unit: "$" - description_short: >- - Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over time (inflation) - and price differences between countries. 
- global_cumulative_emissions_from_cement: - title: "Global cumulative CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - global_cumulative_emissions_from_coal: - title: "Global cumulative CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - global_cumulative_emissions_from_flaring: - title: "Global cumulative CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - global_cumulative_emissions_from_gas: - title: "Global cumulative CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - global_cumulative_emissions_from_land_use_change: - title: "Global cumulative CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - global_cumulative_emissions_from_oil: - title: "Global cumulative CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - global_cumulative_emissions_from_other_industry: - title: "Global cumulative CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - global_cumulative_emissions_total: - title: "Global cumulative CO₂ emissions" - unit: "tonnes" - short_unit: "t" - global_cumulative_emissions_total_including_land_use_change: - title: "Global cumulative CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - global_emissions_from_cement: - title: "Global annual CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - global_emissions_from_coal: - title: "Global annual CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - global_emissions_from_flaring: - title: "Global annual CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - global_emissions_from_gas: - title: "Global annual CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - global_emissions_from_international_transport: - title: "Global annual CO₂ emissions from international transport" - unit: "tonnes" - short_unit: "t" - global_emissions_from_land_use_change: - title: "Global annual CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - global_emissions_from_oil: - title: "Global annual CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - global_emissions_from_other_industry: - title: "Global annual CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - global_emissions_total: - title: "Global annual CO₂ emissions" - unit: "tonnes" - short_unit: "t" - global_emissions_total_including_land_use_change: - title: "Global annual CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - global_population: - title: "Global population" - unit: "persons" - short_unit: "persons" - description_short: "World population." - growth_emissions_total: - title: "Annual CO₂ emissions growth (abs)" - unit: "tonnes" - short_unit: "t" - description_short: "Annual growth in total emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes." - growth_emissions_total_including_land_use_change: - title: "Growth rate of emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description_short: "Annual growth in total emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes." - land_use_change_quality_flag: - title: "Land-use change quality flag" - unit: "" - short_unit: "" - description_short: "The quality flag is 1 if the different estimates of land-use change emissions considered by the Global Carbon Project have a reasonable agreement. Otherwise the quality flag is 0. The flag is also set to zero if not all estimates have data for a given country." 
- description_key:
- - Carbon dioxide emissions from land use change vary significantly in their degree of certainty.
- pct_growth_emissions_total:
- title: "Annual CO₂ emissions growth (%)"
- unit: "%"
- short_unit: "%"
- description_short: "Annual percentage growth in total emissions of carbon dioxide (CO₂), excluding land-use change."
- pct_growth_emissions_total_including_land_use_change:
- title: "Growth rate of emissions including land-use change (%)"
- unit: "%"
- short_unit: "%"
- description_short: "Annual percentage growth in total emissions of carbon dioxide (CO₂), including land-use change."
- pct_traded_emissions:
- title: "Share of annual CO₂ emissions embedded in trade"
- unit: "%"
- short_unit: "%"
- description_short: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured as a percentage of emissions of CO₂."
- description_key:
- - *traded_emissions_description_key
- - *international_aviation_description_key
- population:
- title: "Population"
- unit: "persons"
- short_unit: "persons"
- population_as_share_of_global:
- title: "Share of population"
- unit: "%"
- short_unit: "%"
- description_short: "Population, measured as a percentage of global total population in the same year."
- primary_energy_consumption:
- title: "Primary energy consumption"
- unit: "terawatt-hours"
- short_unit: "TWh"
- description_short: "Primary energy consumption, measured in terawatt-hours per year."
- traded_emissions:
- title: "Annual CO₂ emissions embedded in trade"
- unit: "tonnes"
- short_unit: "t"
- description_short: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes."
- description_key:
- - *traded_emissions_description_key
- - *international_aviation_description_key
- traded_emissions_per_capita:
- title: "Annual CO₂ emissions embedded in trade (per capita)"
- unit: "tonnes per person"
- short_unit: "t/person"
- display:
- shortUnit: t
- description_short: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes per person."
- description_key:
- - *per_capita_description_key
- - *traded_emissions_description_key
- - *international_aviation_description_key
diff --git a/etl/steps/archive/garden/gcp/2023-09-28/global_carbon_budget.py b/etl/steps/archive/garden/gcp/2023-09-28/global_carbon_budget.py
deleted file mode 100644
index 77f5ec9e898..00000000000
--- a/etl/steps/archive/garden/gcp/2023-09-28/global_carbon_budget.py
+++ /dev/null
@@ -1,1063 +0,0 @@
-"""This step creates the Global Carbon Budget (GCB) dataset, by the Global Carbon Project (GCP).
-
-It harmonizes and further processes meadow data, and uses the following auxiliary datasets:
-- GGDC's Maddison dataset on GDP, used to calculate emissions per GDP.
-- Primary Energy Consumption (mix of sources from the 'energy' namespace) to calculate emissions per unit energy.
-- Population (mix of sources), to calculate emissions per capita.
-- Regions (mix of sources), to generate aggregates for different continents.
-- World Bank's income groups, to generate aggregates for different income groups.
-
-"""
-import numpy as np
-import owid.catalog.processing as pr
-from owid.catalog import Dataset, Table
-from owid.datautils import dataframes
-from structlog import get_logger
-
-from etl.data_helpers import geo
-from etl.helpers import PathFinder, create_dataset
-
-log = get_logger()
-
-# Get paths and naming conventions for current step.
-paths = PathFinder(__file__) - -# Expected outliers in consumption-based emissions (with negative emissions in the original data, which will be removed). -OUTLIERS_IN_CONSUMPTION_DF = [ - ("Panama", 2003), - ("Panama", 2004), - ("Panama", 2005), - ("Panama", 2006), - ("Panama", 2011), - ("Panama", 2012), - ("Panama", 2013), - ("Venezuela", 2018), -] - -# Label used for international transport (emissions from oil in bunker fuels), included as a country in the -# fossil CO2 emissions dataset. -INTERNATIONAL_TRANSPORT_LABEL = "International Transport" - -# Regions and income groups to create by aggregating contributions from member countries. -# In the following dictionary, if nothing is stated, the region is supposed to be a default continent/income group. -# Otherwise, the dictionary can have "additional_regions", "excluded_regions", "additional_members", and -# "excluded_members". The aggregates will be calculated on the resulting countries. -REGIONS = { - # Default continents. - "Africa": {}, - "Asia": {}, - "Europe": {}, - "European Union (27)": {}, - "North America": {}, - "Oceania": {}, - "South America": {}, - # Income groups. - "Low-income countries": {}, - "Upper-middle-income countries": {}, - "Lower-middle-income countries": {}, - "High-income countries": {}, - # Additional composite regions. - "Asia (excl. China and India)": { - "additional_regions": ["Asia"], - "excluded_members": ["China", "India"], - }, - "Europe (excl. EU-27)": {"additional_regions": ["Europe"], "excluded_regions": ["European Union (27)"]}, - "Europe (excl. EU-28)": { - "additional_regions": ["Europe"], - "excluded_regions": ["European Union (27)"], - "excluded_members": ["United Kingdom"], - }, - "European Union (28)": { - "additional_regions": ["European Union (27)"], - "additional_members": ["United Kingdom"], - }, - "North America (excl. USA)": { - "additional_regions": ["North America"], - "excluded_members": ["United States"], - }, -} - -# Columns to use from GCB fossil CO2 emissions data and how to rename them. -CO2_COLUMNS = { - "country": "country", - "year": "year", - "cement": "emissions_from_cement", - "coal": "emissions_from_coal", - "flaring": "emissions_from_flaring", - "gas": "emissions_from_gas", - "oil": "emissions_from_oil", - "other": "emissions_from_other_industry", - "total": "emissions_total", -} - -# List all sources of emissions considered. -EMISSION_SOURCES = [column for column in CO2_COLUMNS.values() if column not in ["country", "year"]] - -# Columns to use from primary energy consumption data and how to rename them. -PRIMARY_ENERGY_COLUMNS = { - "country": "country", - "year": "year", - "primary_energy_consumption__twh": "primary_energy_consumption", -} - -# Columns to use from historical emissions data and how to rename them. -HISTORICAL_EMISSIONS_COLUMNS = { - "country": "country", - "year": "year", - # Global fossil emissions are used only for sanity checks. - "global_fossil_emissions": "global_fossil_emissions", - "global_land_use_change_emissions": "global_emissions_from_land_use_change", -} - -# Columns to use from consumption-based emissions data and how to rename them. -CONSUMPTION_EMISSIONS_COLUMNS = { - "country": "country", - "year": "year", - "consumption_emissions": "consumption_emissions", -} - -# Conversion from terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 - -# Conversion factor to change from billion tonnes of carbon to tonnes of CO2.
-BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e9 - -# Conversion factor to change from million tonnes of carbon to tonnes of CO2. -MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e6 - -# Conversion from million tonnes of CO2 to tonnes of CO2. -MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 = 1e6 - -# Conversion from tonnes of CO2 to kg of CO2 (used for emissions per GDP and per unit energy). -TONNES_OF_CO2_TO_KG_OF_CO2 = 1000 - -# In order to remove uninformative columns, keep only rows where at least one of the following columns has data. -# All other columns are either derived variables, or global variables, or auxiliary variables from other datasets. -COLUMNS_THAT_MUST_HAVE_DATA = [ - "emissions_from_cement", - "emissions_from_coal", - "emissions_from_flaring", - "emissions_from_gas", - "emissions_from_oil", - "emissions_from_other_industry", - "emissions_total", - "consumption_emissions", - "emissions_from_land_use_change", - # 'land_use_change_quality_flag', -] - - -def sanity_checks_on_input_data( - tb_production: Table, tb_consumption: Table, tb_historical: Table, tb_co2: Table -) -> None: - """Run sanity checks on input data files. - - These checks should be used prior to country harmonization, but after basic processing of the tables. - - Parameters - ---------- - tb_production : Table - Production-based emissions from GCP's official national emissions dataset (excel file). - tb_consumption : Table - Consumption-based emissions from GCP's official national emissions dataset (excel file). - tb_historical : Table - Historical emissions from GCP's official global emissions dataset (excel file). - tb_co2 : Table - Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). - - """ - tb_production = tb_production.copy() - tb_consumption = tb_consumption.copy() - tb_historical = tb_historical.copy() - tb_co2 = tb_co2.copy() - - # In the original data, Bunkers was included in the national data file, as another country. - # But I suppose it should be considered as another kind of global emission. - # In fact, bunker emissions should coincide for production and consumption emissions. - global_bunkers_emissions = ( - tb_production[tb_production["country"] == "Bunkers"][["year", "production_emissions"]] - .reset_index(drop=True) - .rename(columns={"production_emissions": "global_bunker_emissions"}, errors="raise") - ) - - # Check that we get exactly the same array of bunker emissions from the consumption emissions table - # (on years where there is data for bunker emissions in both datasets). - comparison = pr.merge( - global_bunkers_emissions, - tb_consumption[tb_consumption["country"] == "Bunkers"][["year", "consumption_emissions"]] - .reset_index(drop=True) - .rename(columns={"consumption_emissions": "global_bunker_emissions"}, errors="raise"), - how="inner", - on="year", - suffixes=("", "_check"), - ) - - error = "Bunker emissions were expected to coincide in production and consumption emissions tables." - assert (comparison["global_bunker_emissions"] == comparison["global_bunker_emissions_check"]).all(), error - - # Check that all production-based emissions are positive. - error = "There are negative emissions in tb_production (from the additional variables dataset)." - assert (tb_production.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error - - # Check that all production-based emissions from the fossil CO2 dataset are positive. - error = "There are negative emissions in tb_co2 (from the fossil CO2 dataset)." 
- assert (tb_co2.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error - - # Check that all consumption-based emissions are positive. - error = "There are negative emissions in tb_consumption (from the national emissions dataset)." - assert (tb_consumption.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error - - # Check that, for the World, production emissions coincide with consumption emissions (on common years). - error = "Production and consumption emissions for the world were expected to be identical." - comparison = pr.merge( - tb_production[tb_production["country"] == "World"].reset_index(drop=True), - tb_consumption[tb_consumption["country"] == "World"].reset_index(drop=True), - how="inner", - on="year", - ) - assert (comparison["production_emissions"] == comparison["consumption_emissions"]).all(), error - - # Check that production emissions for the World coincide with global (historical) emissions (on common years). - comparison = pr.merge( - tb_production[tb_production["country"] == "World"][["year", "production_emissions"]].reset_index(drop=True), - tb_historical[["year", "global_fossil_emissions"]], - how="inner", - on="year", - ) - error = "Production emissions for the world were expected to coincide with global fossil emissions." - assert ( - abs(comparison["production_emissions"] - comparison["global_fossil_emissions"]) - / (comparison["global_fossil_emissions"]) - < 0.001 - ).all(), error - - # Check that emissions in tb_production (emissions from the national excel file) coincide with emissions in tb_co2 - # (from the Fossil CO2 emissions csv file). - # Given that country names have not yet been harmonized, rename the only countries that are present in both datasets. - comparison = pr.merge( - tb_co2[["country", "year", "emissions_total"]], - tb_production.replace({"Bunkers": "International Transport", "World": "Global"}), - on=["country", "year"], - how="inner", - ).dropna(subset=["emissions_total", "production_emissions"], how="any") - # Since we included the emissions from the Kuwaiti oil fires in Kuwait (and they are not included in tb_production), - # omit that row in the comparison. - comparison = comparison.drop( - comparison[(comparison["country"] == "Kuwait") & (comparison["year"] == 1991)].index - ).reset_index(drop=True) - - error = "Production emissions from national file were expected to coincide with the Fossil CO2 emissions dataset." - assert ( - ( - 100 - * abs(comparison["production_emissions"] - comparison["emissions_total"]) - / (comparison["emissions_total"]) - ).fillna(0) - < 0.1 - ).all(), error - - -def sanity_checks_on_output_data(tb_combined: Table) -> None: - """Run sanity checks on output data. - - These checks should be run on the very final output table (with an index) prior to storing it as a table. - - Parameters - ---------- - tb_combined : Table - Combination of all input tables, after processing, harmonization, and addition of variables. - - """ - tb_combined = tb_combined.reset_index() - error = "All variables (except traded emissions, growth, and land-use change) should be >= 0 or nan." - positive_variables = [ - col - for col in tb_combined.columns - if col != "country" - if "traded" not in col - if "growth" not in col - if "land_use" not in col - ] - assert (tb_combined[positive_variables].fillna(0) >= 0).all().all(), error - - error = "Production emissions as a share of global emissions should be 100% for 'World' (within 2% error)."
- assert tb_combined[ - (tb_combined["country"] == "World") & (abs(tb_combined["emissions_total_as_share_of_global"] - 100) > 2) - ].empty, error - - error = "Consumption emissions as a share of global emissions should be 100% for 'World' (within 2% error)." - assert tb_combined[ - (tb_combined["country"] == "World") & (abs(tb_combined["consumption_emissions_as_share_of_global"] - 100) > 2) - ].empty, error - - error = "Population as a share of global population should be 100% for 'World'." - assert tb_combined[ - (tb_combined["country"] == "World") & (tb_combined["population_as_share_of_global"].fillna(100) != 100) - ].empty, error - - error = "All share of global emissions should be smaller than 100% (within 2% error)." - share_variables = [col for col in tb_combined.columns if "share" in col] - assert (tb_combined[share_variables].fillna(0) <= 102).all().all(), error - - # Check that cumulative variables are monotonically increasing. - # Firstly, list columns of cumulative variables, but ignoring cumulative columns as a share of global - # (since they are not necessarily monotonic) and land-use change (which can be negative). - cumulative_cols = [ - col for col in tb_combined.columns if "cumulative" in col if "share" not in col if "land_use" not in col - ] - # Using ".is_monotonic_increasing" can fail when differences between consecutive numbers are very small. - # Instead, sort data backwards in time, and check that consecutive values of cumulative variables always have - # a percentage change that is smaller than, say, 0.1%. - error = ( - "Cumulative variables (not given as a share of global) should be monotonically increasing (except when " - "including land-use change emissions, which can be negative)." - ) - assert ( - tb_combined.sort_values("year", ascending=False) - .groupby("country") - .agg({col: lambda x: ((x.pct_change().dropna() * 100) <= 0.1).all() for col in cumulative_cols}) - .all() - .all() - ), error - - error = ( - "Production emissions as a share of global production emissions for the World should always be 100% " - "(or larger than 98%, given small discrepancies)." - ) - # Consumption emissions as a share of global production emissions is allowed to be smaller than 100%. - share_variables = [col for col in tb_combined.columns if "share" in col if "consumption" not in col] - assert (tb_combined[tb_combined["country"] == "World"][share_variables].fillna(100) > 98).all().all(), error - - error = "Traded emissions for the World should be close to zero (within 2% error)." - world_mask = tb_combined["country"] == "World" - assert ( - abs( - 100 - * tb_combined[world_mask]["traded_emissions"].fillna(0) - / tb_combined[world_mask]["emissions_total"].fillna(1) - ) - < 2 - ).all(), error - - -def prepare_fossil_co2_emissions(tb_co2: Table) -> Table: - """Prepare Fossil CO2 emissions data (basic processing).""" - # Select and rename columns from fossil CO2 data. - tb_co2 = tb_co2[list(CO2_COLUMNS)].rename(columns=CO2_COLUMNS, errors="raise") - - # Ensure all emissions are given in tonnes of CO2. - tb_co2[EMISSION_SOURCES] *= MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 - - #################################################################################################################### - # NOTE: For certain years, column "emissions_from_other_industry" is not informed for "World" but it is informed - # for some countries (namely China and US). - # This causes the cumulative emissions from other industry as share of global for those countries to become larger - # than 100%. 
- # This temporary solution fixes the issue: We aggregate the data for China and US on those years when the world's - # data is missing (without touching other years or other columns). - # Firstly, list of years for which the world has no data for emissions_from_other_industry. - world_missing_years = ( - tb_co2[(tb_co2["country"] == "Global") & (tb_co2["emissions_from_other_industry"].isnull())]["year"] - .unique() - .tolist() # type: ignore - ) - # Data that needs to be aggregated. - data_missing_in_world = tb_co2[ - tb_co2["year"].isin(world_missing_years) & (tb_co2["emissions_from_other_industry"].notnull()) - ] - # Check that there is indeed data to be aggregated (that is missing for the World). - error = ( - "Expected emissions_from_other_industry to be null for the world but not null for certain countries " - "(which was an issue in the original fossil CO2 data). The issue may be fixed and the code can be simplified." - ) - assert len(data_missing_in_world) > 0, error - # Create a table of aggregate data for the World, on those years when it's missing. - aggregated_missing_data = ( - data_missing_in_world.groupby("year") - .agg({"emissions_from_other_industry": "sum"}) - .reset_index() - .assign(**{"country": "Global"}) - ) - # Combine the new table of aggregate data with the main table. - tb_co2 = dataframes.combine_two_overlapping_dataframes( - df1=tb_co2, df2=aggregated_missing_data, index_columns=["country", "year"], keep_column_order=True - ) - # NOTE: The previous function currently does not properly propagate metadata, but keeps only the sources of the - # first table. But given that both tables combined have the same source, we don't need to manually change it. - #################################################################################################################### - - # We add the emissions from "Kuwaiti Oil Fires" (which is also included as a separate country) as part of the - # emissions of Kuwait. This ensures that they will be included in region aggregates. - error = "'Kuwaiti Oil Fires' was expected to only have not-null data for 1991." - assert tb_co2[ - (tb_co2["country"] == "Kuwaiti Oil Fires") - & (tb_co2["emissions_total"].notnull()) - & (tb_co2["emissions_total"] != 0) - ]["year"].tolist() == [1991], error - - tb_co2.loc[(tb_co2["country"] == "Kuwait") & (tb_co2["year"] == 1991), EMISSION_SOURCES] = ( - tb_co2[(tb_co2["country"] == "Kuwaiti Oil Fires") & (tb_co2["year"] == 1991)][EMISSION_SOURCES].values - + tb_co2[(tb_co2["country"] == "Kuwait") & (tb_co2["year"] == 1991)][EMISSION_SOURCES].values - ) - - # Check that "emissions_total" agrees with the sum of emissions from individual sources. - error = "The sum of all emissions should add up to total emissions (within 1%)." - assert ( - abs( - tb_co2.drop(columns=["country", "year", "emissions_total"]).sum(axis=1) - - tb_co2["emissions_total"].fillna(0) - ) - / (tb_co2["emissions_total"].fillna(0) + 1e-7) - < 1e-2 - ).all(), error - - # Many rows have zero total emissions, but actually the individual sources are nan. - # Total emissions in those cases should be nan, instead of zero. - no_individual_emissions = tb_co2.drop(columns=["country", "year", "emissions_total"]).isnull().all(axis=1) - tb_co2.loc[no_individual_emissions, "emissions_total"] = np.nan - - return tb_co2 - - -def prepare_consumption_emissions(tb_consumption: Table) -> Table: - """Prepare consumption-based emissions data (basic processing).""" - # Select and rename columns. 
- tb_consumption = tb_consumption[list(CONSUMPTION_EMISSIONS_COLUMNS)].rename( - columns=CONSUMPTION_EMISSIONS_COLUMNS, errors="raise" - ) - - # Convert units from megatonnes of carbon per year to tonnes of CO2 per year. - for column in tb_consumption.drop(columns=["country", "year"]).columns: - tb_consumption[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - # List indexes of rows in tb_consumption corresponding to outliers (defined above in OUTLIERS_IN_CONSUMPTION_DF). - outlier_indexes = [ - tb_consumption[(tb_consumption["country"] == outlier[0]) & (tb_consumption["year"] == outlier[1])].index.item() - for outlier in OUTLIERS_IN_CONSUMPTION_DF - ] - - error = ( - "Outliers were expected to have negative consumption emissions. " - "Maybe outliers have been fixed (and should be removed from the code)." - ) - assert (tb_consumption.loc[outlier_indexes]["consumption_emissions"] < 0).all(), error - - # Remove outliers. - tb_consumption = tb_consumption.drop(outlier_indexes).reset_index(drop=True) - - return tb_consumption - - -def prepare_production_emissions(tb_production: Table) -> Table: - """Prepare production-based emissions data (basic processing).""" - # Convert units from megatonnes of carbon per year to tonnes of CO2 per year. - for column in tb_production.drop(columns=["country", "year"]).columns: - tb_production[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - return tb_production - - -def prepare_land_use_emissions(tb_land_use: Table) -> Table: - """Prepare land-use change emissions data (basic processing).""" - # Convert units from megatonnes of carbon per year to tonnes of CO2 per year. - tb_land_use["emissions"] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - return tb_land_use - - -def prepare_historical_emissions(tb_historical: Table) -> Table: - """Prepare historical emissions data.""" - # Select and rename columns from historical emissions data. - tb_historical = tb_historical[list(HISTORICAL_EMISSIONS_COLUMNS)].rename( - columns=HISTORICAL_EMISSIONS_COLUMNS, errors="raise" - ) - - # Convert units from gigatonnes of carbon per year to tonnes of CO2 per year. - for column in tb_historical.drop(columns=["country", "year"]).columns: - tb_historical[column] *= BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - return tb_historical - - -def extract_global_emissions(tb_co2: Table, tb_historical: Table, ds_population: Dataset) -> Table: - """Extract World emissions by combining data from the Fossil CO2 emissions and the global emissions dataset. - - The resulting global emissions data includes bunker and land-use change emissions. - - NOTE: This function has to be used after selecting and renaming columns in tb_co2, but before harmonizing country - names in tb_co2 (so that "International Transport" is still listed as a country). - - Parameters - ---------- - tb_co2 : Table - Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). - tb_historical : Table - Historical emissions from GCP's official global emissions dataset (excel file). - ds_population : Dataset - Population dataset. - - Returns - ------- - global_emissions : Table - World emissions. - - """ - # For some reason, "International Transport" is included as another country that only has emissions from oil. - # We separate it as another variable (only given at the global level).
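The unit conversions in the helpers above all reduce to one physical constant: the molar-mass ratio of CO₂ to carbon, 44.01 / 12.011 ≈ 3.664, scaled by the appropriate tonnage prefix. A minimal sketch with hypothetical values (not part of the deleted step):

```python
# The factor 3.664 is the molar-mass ratio of CO2 to carbon (44.01 / 12.011).
# Megatonnes (million tonnes) are brought to tonnes with a further factor 1e6.
MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e6

emissions_mtc = 2.5  # hypothetical emissions, in megatonnes of carbon per year
emissions_t_co2 = emissions_mtc * MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2
print(f"{emissions_t_co2:.3e} tonnes of CO2 per year")  # 9.160e+06
```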
- global_transport = tb_co2[tb_co2["country"] == INTERNATIONAL_TRANSPORT_LABEL].reset_index(drop=True) - - # Check that total emissions for international transport coincide with oil emissions. - error = "Total emissions from international transport do not coincide with oil emissions." - assert all((global_transport["emissions_from_oil"] - global_transport["emissions_total"]).dropna() == 0), error - - # Therefore, we can keep only one column for international transport emissions. - global_transport = ( - global_transport[["year", "emissions_from_oil"]] - .dropna() - .rename(columns={"emissions_from_oil": "global_emissions_from_international_transport"}, errors="raise") - ) - - # Create a new table of global emissions. - global_emissions = ( - tb_co2[tb_co2["country"].isin(["Global", "World"])][["year"] + EMISSION_SOURCES] - .rename(columns={column: f"global_{column}" for column in EMISSION_SOURCES}, errors="raise") - .sort_values("year") - .reset_index(drop=True) - ) - - # Add bunker fuels to global emissions. - global_emissions = pr.merge(global_emissions, global_transport, on=["year"], how="outer") - - # Add historical land-use change emissions to table of global emissions. - global_emissions = pr.merge( - global_emissions, tb_historical[["year", "global_emissions_from_land_use_change"]], how="left", on="year" - ) - - # Add variable of total emissions including fossil fuels and land use change. - global_emissions["global_emissions_total_including_land_use_change"] = ( - global_emissions["global_emissions_total"] + global_emissions["global_emissions_from_land_use_change"] - ) - - # Calculate global cumulative emissions. - for column in EMISSION_SOURCES + ["emissions_from_land_use_change", "emissions_total_including_land_use_change"]: - global_emissions[f"global_cumulative_{column}"] = global_emissions[f"global_{column}"].cumsum() - - # Add a country column and add global population. - global_emissions["country"] = "World" - - # Add global population. - global_emissions = geo.add_population_to_table( - tb=global_emissions, ds_population=ds_population, population_col="global_population" - ) - - return global_emissions - - -def harmonize_country_names(tb: Table) -> Table: - """Harmonize country names, and fix known issues with certain regions. - - Parameters - ---------- - tb : Table - Emissions data (either from the fossil CO2, the production-based, consumption-based, or land-use emissions - datasets). - - Returns - ------- - tb : Table - Emissions data after harmonizing country names. - - """ - # Harmonize country names. - tb = geo.harmonize_countries( - df=tb, - countries_file=paths.country_mapping_path, - excluded_countries_file=paths.excluded_countries_path, - warn_on_missing_countries=True, - warn_on_unused_countries=False, - make_missing_countries_nan=False, - warn_on_unknown_excluded_countries=False, - ) - - return tb - - -def fix_duplicated_palau_data(tb_co2: Table) -> Table: - tb = tb_co2.copy() - # Check that there is only one data point for each country-year. - # In the fossil CO2 emissions data, after harmonization, "Pacific Islands (Palau)" is mapped to "Palau", and - # therefore there are rows with different data for the same country-year. - # However, "Pacific Islands (Palau)" has data until 1991, and "Palau" has data from 1992 onwards. - # Check that duplicate rows are still there. - error = "Expected 'Palau' data to be duplicated. Remove temporary fix."
- assert tb[tb.duplicated(subset=["country", "year"])]["country"].unique().tolist() == ["Palau"], error - - # Select rows corresponding to "Palau" prior to 1992, and to "Pacific Islands (Palau)" from 1992 onwards. - indexes_to_drop = ( - tb[ - (tb["country"] == "Palau") & (tb["year"] < 1992) & (tb.duplicated(subset=["country", "year"], keep="first")) - ].index.tolist() - + tb[ - (tb["country"] == "Palau") & (tb["year"] >= 1992) & (tb.duplicated(subset=["country", "year"], keep="last")) - ].index.tolist() - ) - # Check that the selected rows do not overlap. - assert len(indexes_to_drop) == len(set(indexes_to_drop)) - # Remove those rows. - tb = tb.drop(indexes_to_drop).reset_index(drop=True) - # NOTE: Do not drop empty rows yet, as they will be needed to have a complete population series. - - return tb - - -def fix_consumption_emissions_for_africa(tb_co2_with_regions: Table) -> Table: - # The calculated consumption emissions for Africa differ significantly from those in the GCP dataset. - # GCP's estimate is significantly larger. The reason may be that many African countries do not have data on - # consumption emissions, so the aggregate may be underestimated. Maybe GCP has a different way to estimate Africa's - # consumption emissions. - # We therefore replace our values for Africa (calculated by summing consumption emissions from African countries) - # with those from GCP. - # At the end of the day, the reason why we keep ours and GCP's version of continents is that our definitions may - # differ. But it is unlikely that their definition of the African continent is different from ours. - - # First, check that the discrepancy exists in the current data. - tb = tb_co2_with_regions.copy() - consumption_emissions_africa = tb[(tb["country"] == "Africa") & (tb["year"] == 2020)][ - "consumption_emissions" - ].item() - consumption_emissions_africa_gcp = tb[(tb["country"] == "Africa (GCP)") & (tb["year"] == 2020)][ - "consumption_emissions" - ].item() - error = ( - "Discrepancy in consumption emissions between aggregated Africa and Africa (GCP) no longer exists. " - "Remove temporary fix" - ) - assert ( - consumption_emissions_africa_gcp - consumption_emissions_africa - ) / consumption_emissions_africa_gcp > 0.24, error - - # Replace consumption emissions for "Africa" by those by "Africa (GCP)". - consumption_emissions = tb[tb["country"] != "Africa"][["country", "year", "consumption_emissions"]].reset_index( - drop=True - ) - consumption_emissions_for_africa = ( - consumption_emissions[consumption_emissions["country"] == "Africa (GCP)"] - .reset_index(drop=True) - .replace({"Africa (GCP)": "Africa"}) - ) - consumption_emissions = pr.concat([consumption_emissions, consumption_emissions_for_africa], ignore_index=True) - # Replace consumption emissions in main table by the fixed one. - tb = tb.drop(columns="consumption_emissions").merge(consumption_emissions, on=["country", "year"], how="outer") - - # Sanity checks. - # All columns except consumption_emissions should be identical to the original. - error = "Mismatch before and after fixing consumption emissions for Africa." - for col in tb.drop(columns=["consumption_emissions"]).columns: - assert ( - tb[col].dropna().reset_index(drop=True) == tb_co2_with_regions[col].dropna().reset_index(drop=True) - ).all() - # Consumption emissions should be identical to the original except for Africa. 
- assert ( - tb[tb["country"] != "Africa"]["consumption_emissions"].dropna().reset_index(drop=True) - == tb_co2_with_regions[tb_co2_with_regions["country"] != "Africa"]["consumption_emissions"] - .dropna() - .reset_index(drop=True) - ).all() - - return tb - - -def combine_data_and_add_variables( - tb_co2: Table, - tb_production: Table, - tb_consumption: Table, - tb_global_emissions: Table, - tb_land_use: Table, - tb_energy: Table, - ds_gdp: Dataset, - ds_population: Dataset, - ds_regions: Dataset, - ds_income_groups: Dataset, -) -> Table: - """Combine all relevant data into one table, add region aggregates, and add custom variables (e.g. emissions per - capita). - - Parameters - ---------- - tb_co2 : Table - Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file), after harmonization. - tb_production : Table - Production-based emissions from GCP's official national emissions dataset (excel file), after harmonization. - tb_consumption : Table - Consumption-based emissions from GCP's official national emissions dataset (excel file), after harmonization. - tb_global_emissions : Table - World emissions (including bunker and land-use change emissions). - tb_land_use : Table - National land-use change emissions from GCP's official dataset (excel file), after harmonization. - tb_energy : Table - Primary energy data. - ds_gdp : Dataset - GDP dataset. - ds_population : Dataset - Population dataset. - ds_regions : Dataset - Regions dataset. - ds_income_groups : Dataset - Income groups dataset. - - Returns - ------- - tb_co2_with_regions : Table - Combined data, with all additional variables and with region aggregates. - - """ - tb_co2_with_regions = tb_co2.copy() - - # Add region aggregates that were included in the national emissions file, but not in the Fossil CO2 emissions file. - gcp_aggregates = sorted(set(tb_production["country"]) - set(tb_co2_with_regions["country"])) - tb_co2_with_regions = pr.concat( - [ - tb_co2_with_regions, - tb_production[tb_production["country"].isin(gcp_aggregates)] - .rename(columns={"production_emissions": "emissions_total"}) - .astype({"year": int}), - ], - ignore_index=True, - short_name=paths.short_name, - ).reset_index(drop=True) - - # Add consumption emissions to main table (keep only the countries of the main table). - # Given that additional GCP regions (e.g. "Africa (GCP)") have already been added to tb_co2 - # (when merging with tb_production), all countries from tb_consumption should be included in tb_co2. - error = "Some countries in tb_consumption are not included in tb_co2." - assert set(tb_consumption["country"]) < set(tb_co2_with_regions["country"]), error - tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_consumption, on=["country", "year"], how="outer") - - # Add population to original table. - tb_co2_with_regions = geo.add_population_to_table( - tb=tb_co2_with_regions, ds_population=ds_population, warn_on_missing_countries=False - ) - - # Add GDP to main table. - tb_co2_with_regions = geo.add_gdp_to_table(tb=tb_co2_with_regions, ds_gdp=ds_gdp) - - # Add primary energy to main table. - tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_energy, on=["country", "year"], how="left") - - # For convenience, rename columns in land-use change emissions data. - tb_land_use = tb_land_use.rename( - columns={"emissions": "emissions_from_land_use_change", "quality_flag": "land_use_change_quality_flag"} - ) - - # Land-use change data does not include data for the World. Include it by merging with the global dataset.
- tb_land_use = pr.concat( - [ - tb_land_use, - tb_global_emissions.rename( - columns={"global_emissions_from_land_use_change": "emissions_from_land_use_change"} - )[["year", "emissions_from_land_use_change"]] - .dropna() - .assign(**{"country": "World"}), - ], - ignore_index=True, - ).astype({"year": int}) - - # Add land-use change emissions to main table. - tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_land_use, on=["country", "year"], how="outer") - - # Add total emissions (including land-use change) for each country. - tb_co2_with_regions["emissions_total_including_land_use_change"] = ( - tb_co2_with_regions["emissions_total"] + tb_co2_with_regions["emissions_from_land_use_change"] - ) - - # Add region aggregates. - # Aggregate not only emissions data, but also population, gdp and primary energy. - # This way we ensure that custom regions (e.g. "North America (excl. USA)") will have all required data. - aggregations = { - column: "sum" - for column in tb_co2_with_regions.columns - if column not in ["country", "year", "land_use_change_quality_flag"] - } - for region in REGIONS: - countries_in_region = geo.list_members_of_region( - region=region, - ds_regions=ds_regions, - ds_income_groups=ds_income_groups, - additional_regions=REGIONS[region].get("additional_regions", None), - excluded_regions=REGIONS[region].get("excluded_regions", None), - additional_members=REGIONS[region].get("additional_members", None), - excluded_members=REGIONS[region].get("excluded_members", None), - include_historical_regions_in_income_groups=True, - ) - tb_co2_with_regions = geo.add_region_aggregates( - df=tb_co2_with_regions, - region=region, - countries_in_region=countries_in_region, - countries_that_must_have_data=[], - frac_allowed_nans_per_year=0.999, - aggregations=aggregations, - ) - - # Fix consumption emissions for Africa. - tb_co2_with_regions = fix_consumption_emissions_for_africa(tb_co2_with_regions=tb_co2_with_regions) - - # Add global emissions and global cumulative emissions columns to main table. - tb_co2_with_regions = pr.merge( - tb_co2_with_regions, tb_global_emissions.drop(columns="country"), on=["year"], how="left" - ) - - # Ensure main table is sorted (so that cumulative emissions are properly calculated). - tb_co2_with_regions = tb_co2_with_regions.sort_values(["country", "year"]).reset_index(drop=True) - - # Temporarily add certain global emissions variables. - # This is done simply to be able to consider "consumption_emissions" as just another type of emission - # when creating additional variables. - tb_co2_with_regions["global_consumption_emissions"] = tb_co2_with_regions["global_emissions_total"] - tb_co2_with_regions["global_cumulative_consumption_emissions"] = tb_co2_with_regions[ - "global_cumulative_emissions_total" - ] - - # Add new variables for each source of emissions. - for column in EMISSION_SOURCES + [ - "consumption_emissions", - "emissions_from_land_use_change", - "emissions_total_including_land_use_change", - ]: - # Add per-capita variables. - tb_co2_with_regions[f"{column}_per_capita"] = tb_co2_with_regions[column] / tb_co2_with_regions["population"] - - # Add columns for cumulative emissions. - # Rows that had nan emissions will have nan cumulative emissions. - # But nans will not be propagated in the sum. - # This means that countries with some (not all) nans will have the cumulative sum of the informed emissions - # (treating nans as zeros), but will have nan on those rows that were not informed. 
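The NaN behaviour described in the comment above is easy to demonstrate: pandas' grouped cumulative sum skips missing values when accumulating, but leaves the missing rows themselves as NaN. A minimal sketch with hypothetical data (not part of the deleted step):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "country": ["A", "A", "A", "B", "B"],
        "year": [2000, 2001, 2002, 2000, 2001],
        "emissions": [1.0, np.nan, 2.0, 5.0, 5.0],
    }
)
# NaNs are treated as zero in the running total but stay NaN in their own rows.
df["cumulative"] = df.groupby("country")["emissions"].cumsum()
print(df["cumulative"].tolist())  # [1.0, nan, 3.0, 5.0, 10.0]
```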
- tb_co2_with_regions[f"cumulative_{column}"] = tb_co2_with_regions.groupby(["country"])[column].cumsum() - - # Add share of global emissions. - tb_co2_with_regions[f"{column}_as_share_of_global"] = ( - 100 * tb_co2_with_regions[column] / tb_co2_with_regions[f"global_{column}"] - ) - - # Add share of global cumulative emissions. - tb_co2_with_regions[f"cumulative_{column}_as_share_of_global"] = ( - 100 * tb_co2_with_regions[f"cumulative_{column}"] / tb_co2_with_regions[f"global_cumulative_{column}"] - ) - - # Add total emissions per unit energy (in kg of emissions per kWh). - tb_co2_with_regions["emissions_total_per_unit_energy"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 - * tb_co2_with_regions["emissions_total"] - / (tb_co2_with_regions["primary_energy_consumption"] * TWH_TO_KWH) - ) - - # Add total emissions (including land-use change) per unit energy (in kg of emissions per kWh). - tb_co2_with_regions["emissions_total_including_land_use_change_per_unit_energy"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 - * tb_co2_with_regions["emissions_total_including_land_use_change"] - / (tb_co2_with_regions["primary_energy_consumption"] * TWH_TO_KWH) - ) - - # Add total emissions per unit GDP. - tb_co2_with_regions["emissions_total_per_gdp"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 * tb_co2_with_regions["emissions_total"] / tb_co2_with_regions["gdp"] - ) - - # Add total emissions (including land-use change) per unit GDP. - tb_co2_with_regions["emissions_total_including_land_use_change_per_gdp"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 - * tb_co2_with_regions["emissions_total_including_land_use_change"] - / tb_co2_with_regions["gdp"] - ) - - # Add total consumption emissions per unit GDP. - tb_co2_with_regions["consumption_emissions_per_gdp"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 * tb_co2_with_regions["consumption_emissions"] / tb_co2_with_regions["gdp"] - ) - - # Add variable of emissions embedded in trade. - tb_co2_with_regions["traded_emissions"] = ( - tb_co2_with_regions["consumption_emissions"] - tb_co2_with_regions["emissions_total"] - ) - tb_co2_with_regions["pct_traded_emissions"] = ( - 100 * tb_co2_with_regions["traded_emissions"] / tb_co2_with_regions["emissions_total"] - ) - tb_co2_with_regions["traded_emissions_per_capita"] = ( - tb_co2_with_regions["traded_emissions"] / tb_co2_with_regions["population"] - ) - - # Add variable of emissions embedded in trade, including land-use change emissions. - # NOTE: The following variables would be a little misleading, since consumption emissions do not include land-use - # change emissions, but total emissions do. - # tb_co2_with_regions["traded_emissions_including_land_use_change"] = ( - # tb_co2_with_regions["consumption_emissions"] - tb_co2_with_regions["emissions_total_including_land_use_change"] - # ) - # tb_co2_with_regions["pct_traded_emissions_including_land_use_change"] = ( - # 100 - # * tb_co2_with_regions["traded_emissions_including_land_use_change"] - # / tb_co2_with_regions["emissions_total_including_land_use_change"] - # ) - # tb_co2_with_regions["traded_emissions_including_land_use_change_per_capita"] = ( - # tb_co2_with_regions["traded_emissions_including_land_use_change"] / tb_co2_with_regions["population"] - # ) - - # Remove temporary columns. - tb_co2_with_regions = tb_co2_with_regions.drop( - columns=["global_consumption_emissions", "global_cumulative_consumption_emissions"] - ) - - # Add annual percentage growth of total emissions. 
- tb_co2_with_regions["pct_growth_emissions_total"] = ( - tb_co2_with_regions.groupby("country")["emissions_total"].pct_change() * 100 - ) - - # Add annual percentage growth of total emissions (including land-use change). - tb_co2_with_regions["pct_growth_emissions_total_including_land_use_change"] = ( - tb_co2_with_regions.groupby("country")["emissions_total_including_land_use_change"].pct_change() * 100 - ) - - # Add annual absolute growth of total emissions. - tb_co2_with_regions["growth_emissions_total"] = tb_co2_with_regions.groupby("country")["emissions_total"].diff() - - # Add annual absolute growth of total emissions (including land-use change). - tb_co2_with_regions["growth_emissions_total_including_land_use_change"] = tb_co2_with_regions.groupby("country")[ - "emissions_total_including_land_use_change" - ].diff() - - # Create variable of population as a share of global population. - tb_co2_with_regions["population_as_share_of_global"] = ( - tb_co2_with_regions["population"] / tb_co2_with_regions["global_population"] * 100 - ) - - # Replace infinity values (for example when calculating growth from zero to non-zero) in the data by nan. - for column in tb_co2_with_regions.drop(columns=["country", "year"]).columns: - tb_co2_with_regions.loc[np.isinf(tb_co2_with_regions[column]), column] = np.nan - - # For special GCP countries/regions (e.g. "Africa (GCP)") we should keep only the original data. - # Therefore, make nan all additional variables for those countries/regions, and keep only GCP's original data. - added_variables = tb_co2_with_regions.drop( - columns=["country", "year"] + COLUMNS_THAT_MUST_HAVE_DATA - ).columns.tolist() - tb_co2_with_regions.loc[ - (tb_co2_with_regions["country"].str.contains(" (GCP)", regex=False)), added_variables - ] = np.nan - - # Remove uninformative rows (those that have only data for, say, gdp, but not for variables related to emissions). - tb_co2_with_regions = tb_co2_with_regions.dropna(subset=COLUMNS_THAT_MUST_HAVE_DATA, how="all").reset_index( - drop=True - ) - - # Set an appropriate index, ensure there are no rows that only have nan, and sort conveniently. - tb_co2_with_regions = tb_co2_with_regions.set_index(["country", "year"], verify_integrity=True) - tb_co2_with_regions = ( - tb_co2_with_regions.dropna(subset=tb_co2_with_regions.columns, how="all").sort_index().sort_index(axis=1) - ) - - # Rename table. - tb_co2_with_regions.metadata.short_name = paths.short_name - - return tb_co2_with_regions - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load meadow dataset and read all its tables. - ds_meadow = paths.load_dataset("global_carbon_budget") - tb_co2 = ds_meadow["global_carbon_budget_fossil_co2_emissions"].reset_index() - tb_historical = ds_meadow["global_carbon_budget_historical_budget"].reset_index() - tb_consumption = ds_meadow["global_carbon_budget_consumption_emissions"].reset_index() - tb_production = ds_meadow["global_carbon_budget_production_emissions"].reset_index() - tb_land_use = ds_meadow["global_carbon_budget_land_use_change"].reset_index() - - # Load primary energy consumption dataset and read its main table. - ds_energy = paths.load_dataset("primary_energy_consumption") - tb_energy = ds_energy["primary_energy_consumption"].reset_index() - - #################################################################################################################### - # TODO: Remove this temporary solution once primary energy consumption dataset has origins. 
- error = "Remove temporary solution now that primary energy consumption has origins." - assert not tb_energy["primary_energy_consumption__twh"].metadata.origins, error - from etl.data_helpers.misc import add_origins_to_energy_table - - tb_energy = add_origins_to_energy_table(tb_energy=tb_energy) - #################################################################################################################### - - # Load GDP dataset. - ds_gdp = paths.load_dataset("ggdc_maddison") - - # Load population dataset. - ds_population = paths.load_dataset("population") - - # Load regions dataset. - ds_regions = paths.load_dataset("regions") - - # Load income groups dataset. - ds_income_groups = paths.load_dataset("income_groups") - - # - # Process data. - # - # Prepare fossil CO2 emissions data. - tb_co2 = prepare_fossil_co2_emissions(tb_co2=tb_co2) - - # Prepare consumption-based emission data. - tb_consumption = prepare_consumption_emissions(tb_consumption=tb_consumption) - - # Prepare production-based emission data. - tb_production = prepare_production_emissions(tb_production=tb_production) - - # Prepare land-use emission data. - tb_land_use = prepare_land_use_emissions(tb_land_use=tb_land_use) - - # Select and rename columns from primary energy data. - tb_energy = tb_energy[list(PRIMARY_ENERGY_COLUMNS)].rename(columns=PRIMARY_ENERGY_COLUMNS, errors="raise") - - # Prepare historical emissions data. - tb_historical = prepare_historical_emissions(tb_historical=tb_historical) - - # Run sanity checks on input data. - sanity_checks_on_input_data( - tb_production=tb_production, tb_consumption=tb_consumption, tb_historical=tb_historical, tb_co2=tb_co2 - ) - - # For some reason, "International Transport" is included as another country, that only has emissions from oil. - # Extract that data and remove it from the rest of national emissions. - tb_global_emissions = extract_global_emissions( - tb_co2=tb_co2, tb_historical=tb_historical, ds_population=ds_population - ) - - # Harmonize country names. - tb_co2 = harmonize_country_names(tb=tb_co2) - tb_consumption = harmonize_country_names(tb=tb_consumption) - tb_production = harmonize_country_names(tb=tb_production) - tb_land_use = harmonize_country_names(tb=tb_land_use) - - # Fix duplicated rows for Palau. - tb_co2 = fix_duplicated_palau_data(tb_co2=tb_co2) - - # Add new variables to main table (consumption-based emissions, emission intensity, per-capita emissions, etc.). - tb_combined = combine_data_and_add_variables( - tb_co2=tb_co2, - tb_production=tb_production, - tb_consumption=tb_consumption, - tb_global_emissions=tb_global_emissions, - tb_land_use=tb_land_use, - tb_energy=tb_energy, - ds_gdp=ds_gdp, - ds_population=ds_population, - ds_regions=ds_regions, - ds_income_groups=ds_income_groups, - ) - - # Run sanity checks on output data. - sanity_checks_on_output_data(tb_combined) - - # - # Save outputs. - # - # Create a new garden dataset and use metadata from meadow dataset. 
- ds_garden = create_dataset( - dest_dir=dest_dir, tables=[tb_combined], default_metadata=ds_meadow.metadata, check_variables_metadata=True - ) - ds_garden.save() diff --git a/etl/steps/archive/garden/gcp/2023-12-05/global_carbon_budget.meta.yml b/etl/steps/archive/garden/gcp/2023-12-05/global_carbon_budget.meta.yml deleted file mode 100644 index 40eb75944f0..00000000000 --- a/etl/steps/archive/garden/gcp/2023-12-05/global_carbon_budget.meta.yml +++ /dev/null @@ -1,520 +0,0 @@ -definitions: - production_emissions_description_key: &production_emissions_description_key - - This data is based on territorial emissions, which do not account for emissions embedded in traded goods. - traded_emissions_description_key: &traded_emissions_description_key - - Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter. - international_aviation_description_key: &international_aviation_description_key - - Emissions from international aviation and shipping are not included in any country or region's emissions. They are only included in the global total emissions. - consumption_emissions_description_key: &consumption_emissions_description_key - - Consumption-based emissions attribute the emissions generated in the production of goods and services according to where they were _consumed_, rather than where they were _produced_. - - "The data is calculated by adjusting 'production-based' emissions (emissions produced domestically) for trade: Consumption-based emissions equals production-based emissions, _minus_ emissions embedded in exports, _plus_ emissions embedded in imports." - - If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. If its consumption-based emissions are lower, then it is a net exporter. - - Consumption-based emissions are not available for all countries because not all countries have sufficient, high-quality trade data. But those without complete data are a small fraction (3%) of the global total. - - This data measures carbon dioxide (CO₂) emissions from fossil fuels and industry and does not include emissions from land use change, deforestation, soils, or vegetation. - per_capita_description_key: &per_capita_description_key - - Per capita emissions represent the emissions of an average person in a country or region - they are calculated as the total emissions divided by population. - # Common fields to be used in all indicators (unless overridden for specific indicators below). - common: - description_processing: &description_processing | - - Data on global emissions has been converted from tonnes of carbon to tonnes of carbon dioxide (CO₂) using a conversion factor of 3.664. - - Emissions from the Kuwaiti oil fires in 1991 have been included as part of Kuwait's emissions for that year. - - Country's share of the global population is calculated using our population dataset, based on [different sources](https://ourworldindata.org/population-sources). - - Each country's share of global CO₂ emissions from flaring has been calculated using global CO₂ emissions from flaring provided in the Global Carbon Budget dataset. - description_key: - # NOTE: The description key points are re-defined for each indicator on consumption-based emissions and traded emissions, as well as on per-capita indicators. 
- - *production_emissions_description_key - - *international_aviation_description_key - presentation: - topic_tags: - - CO2 & Greenhouse Gas Emissions - processing_level: major - -dataset: - title: Global Carbon Budget - update_period_days: 365 - -tables: - global_carbon_budget: - variables: - consumption_emissions: - title: "Annual consumption-based CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description_short: Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes. - description_key: - - *consumption_emissions_description_key - - *international_aviation_description_key - consumption_emissions_as_share_of_global: - title: "Share of global annual CO₂ consumption-based emissions" - unit: "%" - short_unit: "%" - description_short: "Annual consumption-based emissions of carbon dioxide (CO₂), measured as a percentage of global consumption-based emissions of CO₂ in the same year." - description_key: - - *consumption_emissions_description_key - - *international_aviation_description_key - ################################################################################################################## - # Curated indicator for data page. - consumption_emissions_per_capita: - title: Per capita consumption-based CO₂ emissions - description_short: | - Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes per person. - description_key: - - *consumption_emissions_description_key - - *per_capita_description_key - - *international_aviation_description_key - description_processing: *description_processing - unit: tonnes per person - short_unit: t/person - display: - shortUnit: t - numDecimalPlaces: 0 - presentation: - attribution_short: Global Carbon Project - topic_tags: - - CO2 & Greenhouse Gas Emissions - - Climate Change - - Energy - faqs: - - fragment_id: emissions-from-aviation-and-shipping - gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw - - fragment_id: missing-consumption-based-emissions - gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw - grapher_config: - subtitle: >- - [Consumption-based emissions](#dod:consumptionbasedemissions) are national - emissions that have been adjusted for trade. It's production-based emissions - minus emissions embedded in exports, plus emissions embedded in imports. - hideAnnotationFieldsInTitle: - time: true - entity: true - changeInPrefix: true - hideRelativeToggle: false - hasMapTab: true - tab: map - originUrl: https://ourworldindata.org/co2-and-greenhouse-gas-emissions - colorScale: - binningStrategy: equalInterval - map: - colorScale: - baseColorScheme: Reds - binningStrategy: manual - customNumericValues: - - 1 - - 2 - - 5 - - 10 - - 20 - - 50 - customNumericColors: - - null - - null - selectedEntityNames: - - United States - - United Kingdom - - European Union (27) - - China - - India - - Australia - - Brazil - - South Africa - relatedQuestions: - - url: https://ourworldindata.org/grapher/consumption-co2-per-capita#faqs - text: FAQs on this data - consumption_emissions_per_gdp: - title: "Annual consumption-based CO₂ emissions per GDP (kg per international-$)" - unit: "kilograms per international-$" - short_unit: "kg/$" - description_short: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in kilograms per dollar of GDP (2011 international-$)." 
- description_key: - - *consumption_emissions_description_key - - *international_aviation_description_key - cumulative_consumption_emissions: - title: "Cumulative CO₂ consumption-based emissions" - unit: "tonnes" - short_unit: "t" - description_short: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of available data, measured in tonnes." - description_key: - - *consumption_emissions_description_key - - *international_aviation_description_key - cumulative_consumption_emissions_as_share_of_global: - title: "Share of global cumulative CO₂ consumption-based emissions" - unit: "%" - short_unit: "%" - description_short: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of available data, measured as a percentage of global cumulative consumption-based emissions." - description_key: - - *consumption_emissions_description_key - - *international_aviation_description_key - cumulative_emissions_from_cement: - title: "Cumulative CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from cement since the first year of available data, measured in tonnes." - cumulative_emissions_from_cement_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from cement" - unit: "%" - short_unit: "%" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from cement since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from cement." - cumulative_emissions_from_coal: - title: "Cumulative CO₂ emissions from coal" - unit: "tonnes" - short_unit: "t" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from coal since the first year of available data, measured in tonnes." - cumulative_emissions_from_coal_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from coal" - unit: "%" - short_unit: "%" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from coal since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from coal." - cumulative_emissions_from_flaring: - title: "Cumulative CO₂ emissions from flaring" - unit: "tonnes" - short_unit: "t" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from flaring since the first year of available data, measured in tonnes." - cumulative_emissions_from_flaring_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from flaring" - unit: "%" - short_unit: "%" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from flaring since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from flaring." - cumulative_emissions_from_gas: - title: "Cumulative CO₂ emissions from gas" - unit: "tonnes" - short_unit: "t" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from gas since the first year of available data, measured in tonnes." - cumulative_emissions_from_gas_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from gas" - unit: "%" - short_unit: "%" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from gas since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from gas." 
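The `&`/`*` pairs used throughout this metadata file are standard YAML anchors and aliases: each `*some_description_key` alias expands to the text defined once under `definitions`. A minimal sketch of the mechanism, assuming PyYAML (the snippet below is hypothetical, not taken from this file):

```python
import yaml

snippet = """
definitions:
  per_capita_description_key: &per_capita_description_key
    Per capita emissions are calculated as total emissions divided by population.
indicator:
  description_key:
    - *per_capita_description_key
"""
parsed = yaml.safe_load(snippet)
# The alias resolves to the exact string stored under `definitions`.
assert parsed["indicator"]["description_key"][0] == parsed["definitions"]["per_capita_description_key"]
```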
- cumulative_emissions_from_land_use_change: - title: "Cumulative CO₂ emissions from land-use change" - unit: "tonnes" - short_unit: "t" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from land-use change since the first year of available data, measured in tonnes." - cumulative_emissions_from_land_use_change_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from land-use change" - unit: "%" - short_unit: "%" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from land-use change since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from land-use change." - cumulative_emissions_from_oil: - title: "Cumulative CO₂ emissions from oil" - unit: "tonnes" - short_unit: "t" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from oil since the first year of available data, measured in tonnes." - cumulative_emissions_from_oil_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from oil" - unit: "%" - short_unit: "%" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from oil since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from oil." - cumulative_emissions_from_other_industry: - title: "Cumulative CO₂ emissions from other industry" - unit: "tonnes" - short_unit: "t" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from other industry sources since the first year of available data, measured in tonnes." - cumulative_emissions_from_other_industry_as_share_of_global: - title: "Share of global cumulative CO₂ emissions from other industry" - unit: "%" - short_unit: "%" - description_short: "Cumulative emissions of carbon dioxide (CO₂) from other industry sources since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from other industry sources." - cumulative_emissions_total: - title: "Cumulative CO₂ emissions" - unit: "tonnes" - short_unit: "t" - description_short: "Total cumulative emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of available data, measured in tonnes." - cumulative_emissions_total_as_share_of_global: - title: "Share of global cumulative CO₂ emissions" - unit: "%" - short_unit: "%" - description_short: "Total cumulative emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of available data, measured as a percentage of global total cumulative emissions of CO₂." - cumulative_emissions_total_including_land_use_change: - title: "Cumulative CO₂ emissions including land-use change" - unit: "tonnes" - short_unit: "t" - description_short: "Total cumulative emissions of carbon dioxide (CO₂), including land-use change, since the first year of available data, measured in tonnes." - cumulative_emissions_total_including_land_use_change_as_share_of_global: - title: "Share of global cumulative CO₂ emissions including land-use change" - unit: "%" - short_unit: "%" - description_short: "Total cumulative emissions of carbon dioxide (CO₂), including land-use change, since the first year of available data, measured as a percentage of global total cumulative emissions of CO₂ (including land-use change)." - emissions_from_cement: - title: "Annual CO₂ emissions from cement" - unit: "tonnes" - short_unit: "t" - description_short: "Annual emissions of carbon dioxide (CO₂) from cement, measured in tonnes." 
-    emissions_from_cement_as_share_of_global:
-      title: "Share of global annual CO₂ emissions from cement"
-      unit: "%"
-      short_unit: "%"
-      description_short: "Annual emissions of carbon dioxide (CO₂) from cement, measured as a percentage of global emissions of CO₂ from cement in the same year."
-    emissions_from_cement_per_capita:
-      title: "Annual CO₂ emissions from cement (per capita)"
-      unit: "tonnes per person"
-      short_unit: "t/person"
-      display:
-        shortUnit: t
-      description_short: "Annual emissions of carbon dioxide (CO₂) from cement, measured in tonnes per person."
-      description_key:
-        - *per_capita_description_key
-        - *production_emissions_description_key
-        - *international_aviation_description_key
-    emissions_from_coal:
-      title: "Annual CO₂ emissions from coal"
-      unit: "tonnes"
-      short_unit: "t"
-      description_short: "Annual emissions of carbon dioxide (CO₂) from coal, measured in tonnes."
-    emissions_from_coal_as_share_of_global:
-      title: "Share of global annual CO₂ emissions from coal"
-      unit: "%"
-      short_unit: "%"
-      description_short: "Annual emissions of carbon dioxide (CO₂) from coal, measured as a percentage of global emissions of CO₂ from coal in the same year."
-    emissions_from_coal_per_capita:
-      title: "Annual CO₂ emissions from coal (per capita)"
-      unit: "tonnes per person"
-      short_unit: "t/person"
-      display:
-        shortUnit: t
-      description_short: "Annual emissions of carbon dioxide (CO₂) from coal, measured in tonnes per person."
-      description_key:
-        - *per_capita_description_key
-        - *production_emissions_description_key
-        - *international_aviation_description_key
-    emissions_from_flaring:
-      title: "Annual CO₂ emissions from flaring"
-      unit: "tonnes"
-      short_unit: "t"
-      description_short: "Annual emissions of carbon dioxide (CO₂) from flaring, measured in tonnes."
-    emissions_from_flaring_as_share_of_global:
-      title: "Share of global annual CO₂ emissions from flaring"
-      unit: "%"
-      short_unit: "%"
-      description_short: "Annual emissions of carbon dioxide (CO₂) from flaring, measured as a percentage of global emissions of CO₂ from flaring in the same year."
-    emissions_from_flaring_per_capita:
-      title: "Annual CO₂ emissions from flaring (per capita)"
-      unit: "tonnes per person"
-      short_unit: "t/person"
-      display:
-        shortUnit: t
-      description_short: "Annual emissions of carbon dioxide (CO₂) from flaring, measured in tonnes per person."
-      description_key:
-        - *per_capita_description_key
-        - *production_emissions_description_key
-        - *international_aviation_description_key
-    emissions_from_gas:
-      title: "Annual CO₂ emissions from gas"
-      unit: "tonnes"
-      short_unit: "t"
-      description_short: "Annual emissions of carbon dioxide (CO₂) from gas, measured in tonnes."
-    emissions_from_gas_as_share_of_global:
-      title: "Share of global annual CO₂ emissions from gas"
-      unit: "%"
-      short_unit: "%"
-      description_short: "Annual emissions of carbon dioxide (CO₂) from gas, measured as a percentage of global emissions of CO₂ from gas in the same year."
-    emissions_from_gas_per_capita:
-      title: "Annual CO₂ emissions from gas (per capita)"
-      unit: "tonnes per person"
-      short_unit: "t/person"
-      display:
-        shortUnit: t
-      description_short: "Annual emissions of carbon dioxide (CO₂) from gas, measured in tonnes per person."
-      description_key:
-        - *per_capita_description_key
-        - *production_emissions_description_key
-        - *international_aviation_description_key
-    emissions_from_land_use_change:
-      title: "Annual CO₂ emissions from land-use change"
-      unit: "tonnes"
-      short_unit: "t"
-      description_short: "Annual emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes."
-    emissions_from_land_use_change_as_share_of_global:
-      title: "Share of global annual CO₂ emissions from land-use change"
-      unit: "%"
-      short_unit: "%"
-      description_short: "Annual emissions of carbon dioxide (CO₂) from land-use change, measured as a percentage of global emissions of CO₂ from land-use change in the same year."
-    emissions_from_land_use_change_per_capita:
-      title: "Annual CO₂ emissions from land-use change per capita"
-      unit: "tonnes per person"
-      short_unit: "t/person"
-      display:
-        shortUnit: t
-      description_short: "Annual emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes per person."
-      description_key:
-        - *per_capita_description_key
-        - *production_emissions_description_key
-        - *international_aviation_description_key
-    emissions_from_oil:
-      title: "Annual CO₂ emissions from oil"
-      unit: "tonnes"
-      short_unit: "t"
-      description_short: "Annual emissions of carbon dioxide (CO₂) from oil, measured in tonnes."
-    emissions_from_oil_as_share_of_global:
-      title: "Share of global annual CO₂ emissions from oil"
-      unit: "%"
-      short_unit: "%"
-      description_short: "Annual emissions of carbon dioxide (CO₂) from oil, measured as a percentage of global emissions of CO₂ from oil in the same year."
-    emissions_from_oil_per_capita:
-      title: "Annual CO₂ emissions from oil (per capita)"
-      unit: "tonnes per person"
-      short_unit: "t/person"
-      display:
-        shortUnit: t
-      description_short: "Annual emissions of carbon dioxide (CO₂) from oil, measured in tonnes per person."
-      description_key:
-        - *per_capita_description_key
-        - *production_emissions_description_key
-        - *international_aviation_description_key
-    emissions_from_other_industry:
-      title: "Annual CO₂ emissions from other industry"
-      unit: "tonnes"
-      short_unit: "t"
-      description_short: "Annual emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes."
-    emissions_from_other_industry_as_share_of_global:
-      title: "Share of global annual CO₂ emissions from other industry"
-      unit: "%"
-      short_unit: "%"
-      description_short: "Annual emissions of carbon dioxide (CO₂) from other industry sources, measured as a percentage of global emissions of CO₂ from other industry sources in the same year."
-    emissions_from_other_industry_per_capita:
-      title: "Annual CO₂ emissions from other industry (per capita)"
-      unit: "tonnes per person"
-      short_unit: "t/person"
-      display:
-        shortUnit: t
-      description_short: "Annual emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes per person."
-      description_key:
-        - *per_capita_description_key
-        - *production_emissions_description_key
-        - *international_aviation_description_key
-    emissions_total:
-      title: "Annual CO₂ emissions"
-      unit: "tonnes"
-      short_unit: "t"
-      description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes."
-    emissions_total_as_share_of_global:
-      title: "Share of global annual CO₂ emissions"
-      unit: "%"
-      short_unit: "%"
-      description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured as a percentage of global emissions of CO₂ in the same year."
-    emissions_total_including_land_use_change:
-      title: "Annual CO₂ emissions including land-use change"
-      unit: "tonnes"
-      short_unit: "t"
-      description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes."
-    emissions_total_including_land_use_change_as_share_of_global:
-      title: "Share of global annual CO₂ emissions including land-use change"
-      unit: "%"
-      short_unit: "%"
-      description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured as a percentage of global total emissions of CO₂ in the same year."
-    emissions_total_including_land_use_change_per_capita:
-      title: "Annual CO₂ emissions including land-use change per capita"
-      unit: "tonnes per person"
-      short_unit: "t/person"
-      display:
-        shortUnit: t
-      description_short: "Annual emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes per person."
-      description_key:
-        - *per_capita_description_key
-        - *production_emissions_description_key
-        - *international_aviation_description_key
-    emissions_total_including_land_use_change_per_gdp:
-      title: "Annual CO₂ emissions including land-use change per GDP"
-      unit: "kilograms per international-$"
-      short_unit: "kg/$"
-      description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per dollar of GDP (2011 international-$)."
-    emissions_total_including_land_use_change_per_unit_energy:
-      title: "Annual CO₂ emissions including land-use change per unit energy"
-      unit: "kilograms per kilowatt-hour"
-      short_unit: "kg/kWh"
-      description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per kilowatt-hour of primary energy consumption."
-    emissions_total_per_capita:
-      title: "Annual CO₂ emissions (per capita)"
-      unit: "tonnes per person"
-      short_unit: "t/person"
-      display:
-        shortUnit: t
-      description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes per person."
-      description_key:
-        - *per_capita_description_key
-        - *production_emissions_description_key
-        - *international_aviation_description_key
-    emissions_total_per_gdp:
-      title: "Annual CO₂ emissions per GDP (kg per international-$)"
-      unit: "kilograms per international-$"
-      short_unit: "kg/$"
-      description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per dollar of GDP (2011 international-$)."
-    emissions_total_per_unit_energy:
-      title: "Annual CO₂ emissions per unit energy (kg per kilowatt-hour)"
-      unit: "kilograms per kilowatt-hour"
-      short_unit: "kg/kWh"
-      description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per kilowatt-hour of primary energy consumption."
-    gdp:
-      title: "GDP"
-      unit: "2011 international-$"
-      short_unit: "$"
-      description_short: >-
-        Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over time (inflation)
-        and price differences between countries.
-    growth_emissions_total:
-      title: "Annual CO₂ emissions growth (abs)"
-      unit: "tonnes"
-      short_unit: "t"
-      description_short: "Annual growth in total emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes."
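The `*..._description_key` entries throughout this metadata file are YAML aliases: each expands, at load time, into a scalar anchored elsewhere in the file (presumably in a `definitions:` block above this hunk, which is not shown). A minimal sketch of the mechanism, with a hypothetical anchor:

import yaml

# Hypothetical anchor/alias pair mirroring the `description_key` pattern above.
snippet = """
definitions:
  per_capita: &per_capita_description_key
    Per-capita figures divide total emissions by population.
variables:
  emissions_total_per_capita:
    description_key:
      - *per_capita_description_key
"""

doc = yaml.safe_load(snippet)
# The alias is resolved on load; no reference survives in the parsed object.
assert doc["variables"]["emissions_total_per_capita"]["description_key"][0].startswith("Per-capita")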
-    growth_emissions_total_including_land_use_change:
-      title: "Growth rate of emissions including land-use change"
-      unit: "tonnes"
-      short_unit: "t"
-      description_short: "Annual growth in total emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes."
-    land_use_change_quality_flag:
-      title: "Land-use change quality flag"
-      unit: ""
-      short_unit: ""
-      description_short: "The quality flag is 1 if the different estimates of land-use change emissions considered by the Global Carbon Project have a reasonable agreement. Otherwise the quality flag is 0. The flag is also set to zero if not all estimates have data for a given country."
-      description_key:
-        - Carbon dioxide emissions from land use change vary significantly in their degree of certainty.
-    pct_growth_emissions_total:
-      title: "Annual CO₂ emissions growth (%)"
-      unit: "%"
-      short_unit: "%"
-      description_short: "Annual percentage growth in total emissions of carbon dioxide (CO₂), excluding land-use change."
-    pct_growth_emissions_total_including_land_use_change:
-      title: "Growth rate of emissions including land-use change (%)"
-      unit: "%"
-      short_unit: "%"
-      description_short: "Annual percentage growth in total emissions of carbon dioxide (CO₂), including land-use change."
-    pct_traded_emissions:
-      title: "Share of annual CO₂ emissions embedded in trade"
-      unit: "%"
-      short_unit: "%"
-      description_short: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured as a percentage of emissions of CO₂."
-      description_key:
-        - *traded_emissions_description_key
-        - *international_aviation_description_key
-    population:
-      title: "Population"
-      unit: "persons"
-      short_unit: "persons"
-    population_as_share_of_global:
-      title: "Share of population"
-      unit: "%"
-      short_unit: "%"
-      description_short: "Population, measured as a percentage of global total population in the same year."
-    primary_energy_consumption:
-      title: "Primary energy consumption"
-      unit: "terawatt-hours"
-      short_unit: "TWh"
-      description_short: "Primary energy consumption, measured in terawatt-hours per year."
-    traded_emissions:
-      title: "Annual CO₂ emissions embedded in trade"
-      unit: "tonnes"
-      short_unit: "t"
-      description_short: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes."
-      description_key:
-        - *traded_emissions_description_key
-        - *international_aviation_description_key
-    traded_emissions_per_capita:
-      title: "Annual CO₂ emissions embedded in trade (per capita)"
-      unit: "tonnes per person"
-      short_unit: "t/person"
-      display:
-        shortUnit: t
-      description_short: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes per person."
-      description_key:
-        - *per_capita_description_key
-        - *traded_emissions_description_key
-        - *international_aviation_description_key
diff --git a/etl/steps/archive/garden/gcp/2023-12-05/global_carbon_budget.py b/etl/steps/archive/garden/gcp/2023-12-05/global_carbon_budget.py
deleted file mode 100644
index d11c5411cc7..00000000000
--- a/etl/steps/archive/garden/gcp/2023-12-05/global_carbon_budget.py
+++ /dev/null
@@ -1,1137 +0,0 @@
-"""This step creates the Global Carbon Budget (GCB) dataset, by the Global Carbon Project (GCP).
-
-It harmonizes and further processes meadow data, and uses the following auxiliary datasets:
-- GGDC's Maddison dataset on GDP, used to calculate emissions per GDP.
-- Primary Energy Consumption (mix of sources from the 'energy' namespace) to calculate emissions per unit energy.
-- Population (mix of sources), to calculate emissions per capita.
-- Regions (mix of sources), to generate aggregates for different continents.
-- WorldBank's Income groups, to generate aggregates for different income groups.
-
-"""
-import numpy as np
-import owid.catalog.processing as pr
-from owid.catalog import Dataset, Table
-from owid.datautils import dataframes
-from structlog import get_logger
-
-from etl.data_helpers import geo
-from etl.helpers import PathFinder, create_dataset
-
-log = get_logger()
-
-# Get paths and naming conventions for current step.
-paths = PathFinder(__file__)
-
-# Expected outliers in consumption-based emissions (with negative emissions in the original data, that will be removed).
-# NOTE: This issue has been reported to the data providers, and will hopefully be fixed in a coming version.
-OUTLIERS_IN_CONSUMPTION_DF = [
-    ("Panama", 2003),
-    ("Panama", 2004),
-    ("Panama", 2005),
-    ("Panama", 2006),
-    ("Panama", 2012),
-    ("Panama", 2013),
-    ("Venezuela", 2018),
-]
-
-# Regions and income groups to create by aggregating contributions from member countries.
-# In the following dictionary, if nothing is stated, the region is supposed to be a default continent/income group.
-# Otherwise, the dictionary can have "additional_regions", "excluded_regions", "additional_members", and
-# "excluded_members". The aggregates will be calculated on the resulting countries.
-REGIONS = {
-    # Default continents.
-    "Africa": {},
-    "Asia": {},
-    "Europe": {},
-    # We exclude GCB's EU27 data, because it appears only in a few metrics, and, when it exists, it is identical to our
-    # aggregated European Union (27).
-    "European Union (27)": {},
-    "North America": {},
-    "Oceania": {},
-    "South America": {},
-    # Income groups.
-    "Low-income countries": {},
-    "Upper-middle-income countries": {},
-    "Lower-middle-income countries": {},
-    "High-income countries": {},
-    # Additional composite regions.
-    "Asia (excl. China and India)": {
-        "additional_regions": ["Asia"],
-        "excluded_members": ["China", "India"],
-    },
-    "Europe (excl. EU-27)": {"additional_regions": ["Europe"], "excluded_regions": ["European Union (27)"]},
-    "Europe (excl. EU-28)": {
-        "additional_regions": ["Europe"],
-        "excluded_regions": ["European Union (27)"],
-        "excluded_members": ["United Kingdom"],
-    },
-    "European Union (28)": {
-        "additional_regions": ["European Union (27)"],
-        "additional_members": ["United Kingdom"],
-    },
-    "North America (excl. USA)": {
-        "additional_regions": ["North America"],
-        "excluded_members": ["United States"],
-    },
-}
-
-# Columns to use from GCB fossil CO2 emissions data and how to rename them.
-CO2_COLUMNS = {
-    "country": "country",
-    "year": "year",
-    "cement": "emissions_from_cement",
-    "coal": "emissions_from_coal",
-    "flaring": "emissions_from_flaring",
-    "gas": "emissions_from_gas",
-    "oil": "emissions_from_oil",
-    "other": "emissions_from_other_industry",
-    "total": "emissions_total",
-}
-
-# List all sources of emissions considered.
-EMISSION_SOURCES = [column for column in CO2_COLUMNS.values() if column not in ["country", "year"]]
-
-# Columns to use from primary energy consumption data and how to rename them.
-PRIMARY_ENERGY_COLUMNS = {
-    "country": "country",
-    "year": "year",
-    "primary_energy_consumption__twh": "primary_energy_consumption",
-}
-
-# Columns to use from historical emissions data and how to rename them.
-HISTORICAL_EMISSIONS_COLUMNS = {
-    "country": "country",
-    "year": "year",
-    # Global fossil emissions are used only for sanity checks.
- "global_fossil_emissions": "global_fossil_emissions", - "global_land_use_change_emissions": "global_emissions_from_land_use_change", -} - -# Columns to use from consumption-based emissions data and how to rename them. -CONSUMPTION_EMISSIONS_COLUMNS = { - "country": "country", - "year": "year", - "consumption_emissions": "consumption_emissions", -} - -# Conversion from terawatt-hours to kilowatt-hours. -TWH_TO_KWH = 1e9 - -# Conversion factor to change from billion tonnes of carbon to tonnes of CO2. -BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e9 - -# Conversion factor to change from million tonnes of carbon to tonnes of CO2. -MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e6 - -# Conversion from million tonnes of CO2 to tonnes of CO2. -MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 = 1e6 - -# Conversion from tonnes of CO2 to kg of CO2 (used for emissions per GDP and per unit energy). -TONNES_OF_CO2_TO_KG_OF_CO2 = 1000 - -# In order to remove uninformative columns, keep only rows where at least one of the following columns has data. -# All other columns are either derived variables, or global variables, or auxiliary variables from other datasets. -COLUMNS_THAT_MUST_HAVE_DATA = [ - "emissions_from_cement", - "emissions_from_coal", - "emissions_from_flaring", - "emissions_from_gas", - "emissions_from_oil", - "emissions_from_other_industry", - "emissions_total", - "consumption_emissions", - "emissions_from_land_use_change", - # 'land_use_change_quality_flag', -] - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load meadow dataset and read all its tables. - ds_meadow = paths.load_dataset("global_carbon_budget") - tb_co2 = ds_meadow["global_carbon_budget_fossil_co2_emissions"].reset_index() - tb_historical = ds_meadow["global_carbon_budget_historical_budget"].reset_index() - tb_consumption = ds_meadow["global_carbon_budget_consumption_emissions"].reset_index() - tb_production = ds_meadow["global_carbon_budget_production_emissions"].reset_index() - tb_land_use = ds_meadow["global_carbon_budget_land_use_change"].reset_index() - - # Load primary energy consumption dataset and read its main table. - ds_energy = paths.load_dataset("primary_energy_consumption") - tb_energy = ds_energy["primary_energy_consumption"].reset_index() - - #################################################################################################################### - # TODO: Remove this temporary solution once primary energy consumption dataset has origins. - error = "Remove temporary solution now that primary energy consumption has origins." - assert not tb_energy["primary_energy_consumption__twh"].metadata.origins, error - from etl.data_helpers.misc import add_origins_to_energy_table - - tb_energy = add_origins_to_energy_table(tb_energy=tb_energy) - #################################################################################################################### - - # Load GDP dataset. - ds_gdp = paths.load_dataset("ggdc_maddison") - - # Load population dataset. - ds_population = paths.load_dataset("population") - - # Load regions dataset. - ds_regions = paths.load_dataset("regions") - - # Load income groups dataset. - ds_income_groups = paths.load_dataset("income_groups") - - # - # Process data. - # - # Prepare fossil CO2 emissions data. - tb_co2 = prepare_fossil_co2_emissions(tb_co2=tb_co2) - - # Prepare consumption-based emission data. - tb_consumption = prepare_consumption_emissions(tb_consumption=tb_consumption) - - # Prepare production-based emission data. 
-    # Prepare production-based emission data.
-    tb_production = prepare_production_emissions(tb_production=tb_production)
-
-    # Prepare land-use emission data.
-    tb_land_use = prepare_land_use_emissions(tb_land_use=tb_land_use)
-
-    # Select and rename columns from primary energy data.
-    tb_energy = tb_energy[list(PRIMARY_ENERGY_COLUMNS)].rename(columns=PRIMARY_ENERGY_COLUMNS, errors="raise")
-
-    # Prepare historical emissions data.
-    tb_historical = prepare_historical_emissions(tb_historical=tb_historical)
-
-    # Run sanity checks on input data.
-    sanity_checks_on_input_data(
-        tb_production=tb_production, tb_consumption=tb_consumption, tb_historical=tb_historical, tb_co2=tb_co2
-    )
-
-    # Extract global emissions, including bunker and land-use change emissions.
-    tb_global_emissions = extract_global_emissions(
-        tb_co2=tb_co2, tb_historical=tb_historical, ds_population=ds_population
-    )
-
-    # Harmonize country names.
-    tb_co2 = harmonize_country_names(tb=tb_co2)
-    tb_consumption = harmonize_country_names(tb=tb_consumption)
-    tb_production = harmonize_country_names(tb=tb_production)
-    tb_land_use = harmonize_country_names(tb=tb_land_use)
-
-    # Fix duplicated rows for Palau.
-    tb_co2 = fix_duplicated_palau_data(tb_co2=tb_co2)
-
-    # Add new variables to main table (consumption-based emissions, emission intensity, per-capita emissions, etc.).
-    tb_combined = combine_data_and_add_variables(
-        tb_co2=tb_co2,
-        tb_production=tb_production,
-        tb_consumption=tb_consumption,
-        tb_global_emissions=tb_global_emissions,
-        tb_land_use=tb_land_use,
-        tb_energy=tb_energy,
-        ds_gdp=ds_gdp,
-        ds_population=ds_population,
-        ds_regions=ds_regions,
-        ds_income_groups=ds_income_groups,
-    )
-
-    # Run sanity checks on output data.
-    sanity_checks_on_output_data(tb_combined)
-
-    #
-    # Save outputs.
-    #
-    # Create a new garden dataset and use metadata from meadow dataset.
-    ds_garden = create_dataset(
-        dest_dir=dest_dir, tables=[tb_combined], default_metadata=ds_meadow.metadata, check_variables_metadata=True
-    )
-    ds_garden.save()
-
-
-def sanity_checks_on_input_data(
-    tb_production: Table, tb_consumption: Table, tb_historical: Table, tb_co2: Table
-) -> None:
-    """Run sanity checks on input data files.
-
-    These checks should be used prior to country harmonization, but after basic processing of the tables.
-
-    Parameters
-    ----------
-    tb_production : Table
-        Production-based emissions from GCP's official national emissions dataset (excel file).
-    tb_consumption : Table
-        Consumption-based emissions from GCP's official national emissions dataset (excel file).
-    tb_historical : Table
-        Historical emissions from GCP's official global emissions dataset (excel file).
-    tb_co2 : Table
-        Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file).
-
-    """
-    tb_production = tb_production.copy()
-    tb_consumption = tb_consumption.copy()
-    tb_historical = tb_historical.copy()
-    tb_co2 = tb_co2.copy()
-
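The checks that follow repeatedly merge two tables on year and assert element-wise equality of the merged columns. Where exact equality is too strict for floats, a tolerance-based variant along these lines could be used instead (a sketch, not part of the original step; `assert_series_close` is a hypothetical helper):

import numpy as np

def assert_series_close(a, b, rtol=1e-4, msg=""):
    # a and b are aligned pandas Series, e.g. the two sides of an inner merge on year.
    assert np.isclose(a.to_numpy(dtype=float), b.to_numpy(dtype=float), rtol=rtol).all(), msg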
-    # In the original data, Bunkers was included in the national data file, as another country.
-    # But it should arguably be considered as another kind of global emission.
-    # In fact, bunker emissions should coincide for production and consumption emissions.
-    global_bunkers_emissions = (
-        tb_production[tb_production["country"] == "Bunkers"][["year", "production_emissions"]]
-        .reset_index(drop=True)
-        .rename(columns={"production_emissions": "global_bunker_emissions"}, errors="raise")
-    )
-
-    # Check that we get exactly the same array of bunker emissions from the consumption emissions table
-    # (on years where there is data for bunker emissions in both datasets).
-    comparison = pr.merge(
-        global_bunkers_emissions,
-        tb_consumption[tb_consumption["country"] == "Bunkers"][["year", "consumption_emissions"]]
-        .reset_index(drop=True)
-        .rename(columns={"consumption_emissions": "global_bunker_emissions"}, errors="raise"),
-        how="inner",
-        on="year",
-        suffixes=("", "_check"),
-    )
-
-    error = "Bunker emissions were expected to coincide in production and consumption emissions tables."
-    assert (comparison["global_bunker_emissions"] == comparison["global_bunker_emissions_check"]).all(), error
-
-    # Check that all production-based emissions are positive.
-    error = "There are negative emissions in tb_production (from the additional variables dataset)."
-    assert (tb_production.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error
-
-    # Check that all production-based emissions from the fossil CO2 dataset are positive.
-    error = "There are negative emissions in tb_co2 (from the fossil CO2 dataset)."
-    assert (tb_co2.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error
-
-    # Check that all consumption-based emissions are positive.
-    error = "There are negative emissions in tb_consumption (from the national emissions dataset)."
-    assert (tb_consumption.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error
-
-    # Check that, for the World, production emissions coincide with consumption emissions (on common years).
-    error = "Production and consumption emissions for the world were expected to be identical."
-    comparison = pr.merge(
-        tb_production[tb_production["country"] == "World"].reset_index(drop=True),
-        tb_consumption[tb_consumption["country"] == "World"].reset_index(drop=True),
-        how="inner",
-        on="year",
-    )
-    assert (comparison["production_emissions"] == comparison["consumption_emissions"]).all(), error
-
-    # Check that production emissions for the World coincide with global (historical) emissions (on common years).
-    comparison = pr.merge(
-        tb_production[tb_production["country"] == "World"][["year", "production_emissions"]].reset_index(drop=True),
-        tb_historical[["year", "global_fossil_emissions"]],
-        how="inner",
-        on="year",
-    )
-    error = "Production emissions for the world were expected to coincide with global fossil emissions."
-    assert (
-        abs(comparison["production_emissions"] - comparison["global_fossil_emissions"])
-        / (comparison["global_fossil_emissions"])
-        < 0.001
-    ).all(), error
-
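The aviation-plus-shipping check that follows uses a pivot-then-sum pattern: reshape the two "countries" into columns indexed by year, then add them. A self-contained miniature of the same pattern, with toy numbers:

import pandas as pd

df = pd.DataFrame({
    "country": ["International Aviation", "International Shipping"] * 2,
    "year": [2000, 2000, 2001, 2001],
    "emissions_total": [1.0, 2.0, 1.5, 2.5],
})
wide = df.pivot(index="year", columns="country", values="emissions_total").reset_index()
wide["global_aviation_and_shipping"] = (
    wide["International Aviation"] + wide["International Shipping"]
)
# year 2000 -> 3.0, year 2001 -> 4.0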
-    # In the Fossil CO2 file, international transport emissions have been separated into aviation and shipping.
-    # Emissions are also separated by fuel.
-    # We'll add them to the global emissions.
-    global_aviation_and_shipping = (
-        tb_co2[tb_co2["country"].isin(["International Aviation", "International Shipping"])]
-        .dropna()
-        .pivot(index="year", columns="country", values="emissions_total")
-        .reset_index()
-    )
-    global_aviation_and_shipping["global_aviation_and_shipping"] = (
-        global_aviation_and_shipping["International Aviation"] + global_aviation_and_shipping["International Shipping"]
-    )
-    comparison = (
-        tb_production[tb_production["country"] == "Bunkers"]
-        .reset_index(drop=True)
-        .rename(columns={"production_emissions": "global_bunker_emissions"})
-        .merge(
-            global_aviation_and_shipping[["year", "global_aviation_and_shipping"]],
-            how="outer",
-            on="year",
-        )
-        .sort_values("year")
-        .reset_index(drop=True)
-    )
-    # Keep only rows where both time series are informed.
-    comparison = comparison.dropna(
-        subset=["global_bunker_emissions", "global_aviation_and_shipping"], how="any"
-    ).reset_index(drop=True)
-    error = (
-        "Bunker emissions from national emissions file should coincide (within 0.0001%) with the sum of aviation"
-        " and shipping emissions from the Fossil CO2 file."
-    )
-    assert (
-        100
-        * abs(comparison["global_bunker_emissions"] - comparison["global_aviation_and_shipping"])
-        / (comparison["global_bunker_emissions"])
-        < 0.0001
-    ).all(), error
-
-    # Now check that all other emissions (that are not from bunker fuels) in tb_production (emissions from the national
-    # excel file) coincide with emissions in tb_co2 (from the Fossil CO2 emissions csv file).
-    # Since country names have not yet been harmonized, rename the only countries that are present in both datasets.
-    comparison = pr.merge(
-        tb_co2[["country", "year", "emissions_total"]],
-        tb_production[tb_production["country"] != "Bunkers"].replace({"World": "Global"}),
-        on=["country", "year"],
-        how="inner",
-    ).dropna(subset=["emissions_total", "production_emissions"], how="any")
-    # Since we included the emissions from the Kuwaiti oil fires in Kuwait (and they are not included in tb_production),
-    # omit that row in the comparison.
-    comparison = comparison.drop(
-        comparison[(comparison["country"] == "Kuwait") & (comparison["year"] == 1991)].index
-    ).reset_index(drop=True)
-    # Check that production emissions from national file coincide with the Fossil CO2 emissions dataset.
-    # Assert that the difference is smaller than 1%.
-    error = "Production emissions from national file were expected to coincide with the Fossil CO2 emissions dataset."
-    assert (
-        (
-            100
-            * abs(comparison["production_emissions"] - comparison["emissions_total"])
-            / (comparison["emissions_total"])
-        ).fillna(0)
-        < 1
-    ).all(), error
-
-
-def sanity_checks_on_output_data(tb_combined: Table) -> None:
-    """Run sanity checks on output data.
-
-    These checks should be run on the very final output table (with an index) prior to storing it.
-
-    Parameters
-    ----------
-    tb_combined : Table
-        Combination of all input tables, after processing, harmonization, and addition of variables.
-
-    """
-    tb_combined = tb_combined.reset_index()
-    error = "All variables (except traded emissions, growth, and land-use change) should be >= 0 or nan."
-    positive_variables = [
-        col
-        for col in tb_combined.columns
-        if col != "country"
-        if "traded" not in col
-        if "growth" not in col
-        if "land_use" not in col
-    ]
-    assert (tb_combined[positive_variables].fillna(0) >= 0).all().all(), error
-
-    error = "Production emissions as a share of global emissions should be 100% for 'World' (within 2% error)."
-    assert tb_combined[
-        (tb_combined["country"] == "World") & (abs(tb_combined["emissions_total_as_share_of_global"] - 100) > 2)
-    ].empty, error
-
-    error = "Consumption emissions as a share of global emissions should be 100% for 'World' (within 2% error)."
-    assert tb_combined[
-        (tb_combined["country"] == "World") & (abs(tb_combined["consumption_emissions_as_share_of_global"] - 100) > 2)
-    ].empty, error
-
-    error = "Population as a share of global population should be 100% for 'World'."
-    assert tb_combined[
-        (tb_combined["country"] == "World") & (tb_combined["population_as_share_of_global"].fillna(100) != 100)
-    ].empty, error
-
-    error = "All share of global emissions should be smaller than 100% (within 2% error)."
-    share_variables = [col for col in tb_combined.columns if "share" in col]
-    assert (tb_combined[share_variables].fillna(0) <= 102).all().all(), error
-
-    # Check that cumulative variables are monotonically increasing.
-    # Firstly, list columns of cumulative variables, but ignoring cumulative columns as a share of global
-    # (since they are not necessarily monotonic) and land-use change (which can be negative).
-    cumulative_cols = [
-        col for col in tb_combined.columns if "cumulative" in col if "share" not in col if "land_use" not in col
-    ]
-    # Using ".is_monotonic_increasing" can fail when differences between consecutive numbers are very small.
-    # Instead, sort data backwards in time, and check that consecutive values of cumulative variables always have
-    # a percentage change that is smaller than, say, 0.1%.
-    error = (
-        "Cumulative variables (not given as a share of global) should be monotonically increasing (except when "
-        "including land-use change emissions, which can be negative)."
-    )
-    assert (
-        tb_combined.sort_values("year", ascending=False)
-        .groupby("country")
-        .agg({col: lambda x: ((x.pct_change().dropna() * 100) <= 0.1).all() for col in cumulative_cols})
-        .all()
-        .all()
-    ), error
-
-    error = (
-        "Production emissions as a share of global production emissions for the World should always be 100% "
-        "(or larger than 98%, given small discrepancies)."
-    )
-    # Consumption emissions as a share of global production emissions is allowed to be smaller than 100%.
-    share_variables = [col for col in tb_combined.columns if "share" in col if "consumption" not in col]
-    assert (tb_combined[tb_combined["country"] == "World"][share_variables].fillna(100) > 98).all().all(), error
-
-    error = "Traded emissions for the World should be close to zero (within 2% error)."
-    world_mask = tb_combined["country"] == "World"
-    assert (
-        abs(
-            100
-            * tb_combined[world_mask]["traded_emissions"].fillna(0)
-            / tb_combined[world_mask]["emissions_total"].fillna(1)
-        )
-        < 2
-    ).all(), error
-
-
-def prepare_fossil_co2_emissions(tb_co2: Table) -> Table:
-    """Prepare Fossil CO2 emissions data (basic processing)."""
-    # Select and rename columns from fossil CO2 data.
-    tb_co2 = tb_co2[list(CO2_COLUMNS)].rename(columns=CO2_COLUMNS, errors="raise")
-
-    # Ensure all emissions are given in tonnes of CO2.
-    tb_co2[EMISSION_SOURCES] *= MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2
-
-    ####################################################################################################################
-    # For certain years, column "emissions_from_other_industry" is not informed for "World" but it is informed
-    # for some countries (namely China and US).
-    # Note that this is not necessarily an issue in the original data: The data provider may have decided that it is
-    # better to leave the world uninformed where not enough countries are informed.
-    # However, "emissions_total" for the World seems to include those contributions from China and the US.
-    # This can be easily checked in the original data by selecting the year 1989 (last year for which there is data for
-    # China and US, but not for the World). The sum of emissions from all sources (namely coal, oil, gas, cement, and
-    # flaring, given that "other" is empty) does not add up to "emissions_total". But, if one includes the other
-    # emissions from China and US, then it does add up.
-    # This inconsistency causes the cumulative emissions from other industry for China and US to be larger than the
-    # global cumulative emissions. The share of global emissions for those countries hence becomes larger than 100%.
-    # To fix this issue, we aggregate the data for China and US on those years when the world's data is missing (without
-    # touching other years or other columns), and add that data to the global emissions from other industry.
-    # NOTE: This issue has been reported to the data providers, and will hopefully be fixed in a coming version.
-
-    # Firstly, list of years for which the world has no data for emissions_from_other_industry.
-    world_missing_years = (
-        tb_co2[(tb_co2["country"] == "Global") & (tb_co2["emissions_from_other_industry"].isnull())]["year"]
-        .unique()
-        .tolist()  # type: ignore
-    )
-    # Data that needs to be aggregated.
-    data_missing_in_world = tb_co2[
-        tb_co2["year"].isin(world_missing_years) & (tb_co2["emissions_from_other_industry"].notnull())
-    ]
-    # Check that there is indeed data to be aggregated (that is missing for the World).
-    error = (
-        "Expected emissions_from_other_industry to be null for the world but not null for certain countries "
-        "(which was an issue in the original fossil CO2 data). The issue may be fixed and the code can be simplified."
-    )
-    assert len(data_missing_in_world) > 0, error
-    # Create a table of aggregate data for the World, on those years when it's missing.
-    aggregated_missing_data = (
-        data_missing_in_world.groupby("year")
-        .agg({"emissions_from_other_industry": "sum"})
-        .reset_index()
-        .assign(**{"country": "Global"})
-    )
-    # Combine the new table of aggregate data with the main table.
-    tb_co2 = dataframes.combine_two_overlapping_dataframes(
-        df1=tb_co2, df2=aggregated_missing_data, index_columns=["country", "year"], keep_column_order=True
-    )
-    # NOTE: The previous function currently does not properly propagate metadata, but keeps only the sources of the
-    # first table. But given that both tables combined have the same source, we don't need to manually change it.
-    ####################################################################################################################
-
-    # We add the emissions from "Kuwaiti Oil Fires" (which is also included as a separate country) as part of the
-    # emissions of Kuwait. This ensures that they will be included in region aggregates.
-    error = "'Kuwaiti Oil Fires' was expected to only have not-null data for 1991."
-    assert tb_co2[
-        (tb_co2["country"] == "Kuwaiti Oil Fires")
-        & (tb_co2["emissions_total"].notnull())
-        & (tb_co2["emissions_total"] != 0)
-    ]["year"].tolist() == [1991], error
-
-    tb_co2.loc[(tb_co2["country"] == "Kuwait") & (tb_co2["year"] == 1991), EMISSION_SOURCES] = (
-        tb_co2[(tb_co2["country"] == "Kuwaiti Oil Fires") & (tb_co2["year"] == 1991)][EMISSION_SOURCES].values
-        + tb_co2[(tb_co2["country"] == "Kuwait") & (tb_co2["year"] == 1991)][EMISSION_SOURCES].values
-    )
-
-    # Check that "emissions_total" agrees with the sum of emissions from individual sources.
-    error = "The sum of all emissions should add up to total emissions (within 1%)."
-    assert (
-        abs(
-            tb_co2.drop(columns=["country", "year", "emissions_total"]).sum(axis=1)
-            - tb_co2["emissions_total"].fillna(0)
-        )
-        / (tb_co2["emissions_total"].fillna(0) + 1e-7)
-        < 1e-2
-    ).all(), error
-
-    # Many rows have zero total emissions, but actually the individual sources are nan.
-    # Total emissions in those cases should be nan, instead of zero.
-    no_individual_emissions = tb_co2.drop(columns=["country", "year", "emissions_total"]).isnull().all(axis=1)
-    tb_co2.loc[no_individual_emissions, "emissions_total"] = np.nan
-
-    return tb_co2
-
-
-def prepare_consumption_emissions(tb_consumption: Table) -> Table:
-    """Prepare consumption-based emissions data (basic processing)."""
-    # Select and rename columns.
-    tb_consumption = tb_consumption[list(CONSUMPTION_EMISSIONS_COLUMNS)].rename(
-        columns=CONSUMPTION_EMISSIONS_COLUMNS, errors="raise"
-    )
-
-    # Convert units from megatonnes of carbon per year to tonnes of CO2 per year.
-    for column in tb_consumption.drop(columns=["country", "year"]).columns:
-        tb_consumption[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2
-
-    # List indexes of rows in tb_consumption corresponding to outliers (defined above in OUTLIERS_IN_CONSUMPTION_DF).
-    outlier_indexes = [
-        tb_consumption[(tb_consumption["country"] == outlier[0]) & (tb_consumption["year"] == outlier[1])].index.item()
-        for outlier in OUTLIERS_IN_CONSUMPTION_DF
-    ]
-
-    error = (
-        "Outliers were expected to have negative consumption emissions. "
-        "Maybe outliers have been fixed (and should be removed from the code)."
-    )
-    assert (tb_consumption.loc[outlier_indexes]["consumption_emissions"] < 0).all(), error
-
-    # Remove outliers.
-    tb_consumption = tb_consumption.drop(outlier_indexes).reset_index(drop=True)
-
-    return tb_consumption
-
-
-def prepare_production_emissions(tb_production: Table) -> Table:
-    """Prepare production-based emissions data (basic processing)."""
-    # Convert units from megatonnes of carbon per year to tonnes of CO2 per year.
-    for column in tb_production.drop(columns=["country", "year"]).columns:
-        tb_production[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2
-
-    return tb_production
-
-
-def prepare_land_use_emissions(tb_land_use: Table) -> Table:
-    """Prepare land-use change emissions data (basic processing)."""
-    # Convert units from megatonnes of carbon per year to tonnes of CO2 per year.
-    tb_land_use["emissions"] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2
-
-    # There are two additional regions in the land-use change file, namely Global and EU27.
-    # It makes sense to extract national land-use change contributions from one of the sheets of that file (we currently
-    # do so from the "BLUE" sheet), since there are no other national land-use change emissions in other files.
-    # But for global emissions, it makes more sense to take the ones estimated by GCP, which are given in the
-    # "Historical Budget" sheet of the global emissions file.
-    # So, remove the data for "Global".
-    # We also remove EU27 data, as explained above, since we aggregate that data ourselves.
-    tb_land_use = tb_land_use[~tb_land_use["country"].isin(["Global", "EU27"])].reset_index(drop=True)
-
-    return tb_land_use
-
-
-def prepare_historical_emissions(tb_historical: Table) -> Table:
-    """Prepare historical emissions data."""
-    # Select and rename columns from historical emissions data.
-    tb_historical = tb_historical[list(HISTORICAL_EMISSIONS_COLUMNS)].rename(
-        columns=HISTORICAL_EMISSIONS_COLUMNS, errors="raise"
-    )
-
-    # Convert units from gigatonnes of carbon per year to tonnes of CO2 per year.
-    for column in tb_historical.drop(columns=["country", "year"]).columns:
-        tb_historical[column] *= BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2
-
-    return tb_historical
-
-
-def extract_global_emissions(tb_co2: Table, tb_historical: Table, ds_population: Dataset) -> Table:
-    """Extract World emissions by combining data from the Fossil CO2 emissions and the global emissions dataset.
-
-    The resulting global emissions data includes bunker and land-use change emissions.
-
-    NOTE: This function has to be used after selecting and renaming columns in tb_co2, but before harmonizing country
-    names in tb_co2 (so that "International Aviation" and "International Shipping" are still listed as countries).
-
-    Parameters
-    ----------
-    tb_co2 : Table
-        Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file).
-    tb_historical : Table
-        Historical emissions from GCP's official global emissions dataset (excel file).
-    ds_population : Dataset
-        Population dataset.
-
-    Returns
-    -------
-    global_emissions : Table
-        World emissions.
-
-    """
-    # "International Aviation" and "International Shipping" are now included as separate countries.
-    # Combine their emissions into one variable.
-    global_aviation = (
-        tb_co2[tb_co2["country"] == "International Aviation"].set_index(["year"]).drop(columns=["country"])
-    )
-    global_shipping = (
-        tb_co2[tb_co2["country"] == "International Shipping"].set_index(["year"]).drop(columns=["country"])
-    )
-    global_transport = global_aviation + global_shipping
-
-    # Check that total emissions for international aviation coincide with oil emissions.
-    # NOTE: International shipping, by contrast, includes emissions from gas, coal and oil.
-    error = "Total emissions from international aviation do not coincide with oil emissions."
-    assert all((global_aviation["emissions_from_oil"] - global_aviation["emissions_total"]).dropna() == 0), error
-
-    # Keep only total emissions from international transport.
-    global_transport = (
-        global_transport[["emissions_total"]]
-        .rename(columns={"emissions_total": "global_emissions_from_international_transport"}, errors="raise")
-        .dropna()
-        .reset_index()
-    )
-
-    # Create a new table of global emissions.
-    global_emissions = (
-        tb_co2[tb_co2["country"].isin(["Global", "World"])][["year"] + EMISSION_SOURCES]
-        .rename(columns={column: f"global_{column}" for column in EMISSION_SOURCES}, errors="raise")
-        .sort_values("year")
-        .reset_index(drop=True)
-    )
-
-    # Add bunker fuels to global emissions.
-    global_emissions = pr.merge(global_emissions, global_transport, on=["year"], how="outer")
-
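The cumulative columns built below rely on pandas' default NaN handling in `cumsum`: missing values are skipped in the running total but remain NaN in the output at their own positions. In miniature:

import pandas as pd

s = pd.Series([1.0, None, 2.0])
print(s.cumsum().tolist())  # [1.0, nan, 3.0]: the gap stays nan, the total skips it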
-    # Add historical land-use change emissions to table of global emissions.
-    global_emissions = pr.merge(
-        global_emissions, tb_historical[["year", "global_emissions_from_land_use_change"]], how="left", on="year"
-    )
-
-    # Add variable of total emissions including fossil fuels and land use change.
-    global_emissions["global_emissions_total_including_land_use_change"] = (
-        global_emissions["global_emissions_total"] + global_emissions["global_emissions_from_land_use_change"]
-    )
-
-    # Calculate global cumulative emissions.
-    for column in EMISSION_SOURCES + ["emissions_from_land_use_change", "emissions_total_including_land_use_change"]:
-        global_emissions[f"global_cumulative_{column}"] = global_emissions[f"global_{column}"].cumsum()
-
-    # Add a country column and add global population.
-    global_emissions["country"] = "World"
-
-    # Add global population.
-    global_emissions = geo.add_population_to_table(
-        tb=global_emissions, ds_population=ds_population, population_col="global_population"
-    )
-
-    return global_emissions
-
-
-def harmonize_country_names(tb: Table) -> Table:
-    """Harmonize country names, and fix known issues with certain regions.
-
-    Parameters
-    ----------
-    tb : Table
-        Emissions data (either from the fossil CO2, the production-based, consumption-based, or land-use emissions
-        datasets).
-
-    Returns
-    -------
-    tb : Table
-        Emissions data after harmonizing country names.
-
-    """
-    # Harmonize country names.
-    tb = geo.harmonize_countries(
-        df=tb,
-        countries_file=paths.country_mapping_path,
-        excluded_countries_file=paths.excluded_countries_path,
-        warn_on_missing_countries=True,
-        warn_on_unused_countries=False,
-        make_missing_countries_nan=False,
-        warn_on_unknown_excluded_countries=False,
-    )
-
-    return tb
-
-
-def fix_duplicated_palau_data(tb_co2: Table) -> Table:
-    tb = tb_co2.copy()
-    # Check that there is only one data point for each country-year.
-    # In the fossil CO2 emissions data, after harmonization, "Pacific Islands (Palau)" is mapped to "Palau", and
-    # therefore there are rows with different data for the same country-year.
-    # However, "Pacific Islands (Palau)" has data until 1991, and "Palau" has data from 1992 onwards.
-    # NOTE: this is not an issue with the original data, and it's simply caused by our harmonization of names.
-
-    # Check that duplicate rows are still there.
-    error = "Expected 'Palau' data to be duplicated. Remove temporary fix."
-    assert tb[tb.duplicated(subset=["country", "year"])]["country"].unique().tolist() == ["Palau"], error
-
-    # Select rows corresponding to "Palau" prior to 1992, and to "Pacific Islands (Palau)" from 1992 onwards.
-    indexes_to_drop = (
-        tb[
-            (tb["country"] == "Palau") & (tb["year"] < 1992) & (tb.duplicated(subset=["country", "year"], keep="first"))
-        ].index.tolist()
-        + tb[
-            (tb["country"] == "Palau") & (tb["year"] >= 1992) & (tb.duplicated(subset=["country", "year"], keep="last"))
-        ].index.tolist()
-    )
-    # Check that the selected rows do not overlap.
-    assert len(indexes_to_drop) == len(set(indexes_to_drop))
-    # Remove those rows.
-    tb = tb.drop(indexes_to_drop).reset_index(drop=True)
-    # NOTE: Do not drop empty rows yet, as they will be needed to have a complete population series.
-
-    return tb
-
-
-def fix_consumption_emissions_for_africa(tb_co2_with_regions: Table) -> Table:
-    # The calculated consumption emissions for Africa differ significantly from those in the GCP dataset.
-    # GCP's estimate is significantly larger. The reason may be that many African countries do not have data on
-    # consumption emissions, so the aggregate may be underestimated. Maybe GCP has a different way to estimate Africa's
-    # consumption emissions.
-    # We therefore replace our values for Africa (calculated by summing consumption emissions from African countries)
-    # with those from GCP.
-    # At the end of the day, the reason why we keep ours and GCP's version of continents is that our definitions may
-    # differ. But it is unlikely that their definition of the African continent is different from ours.
-    # NOTE: This issue has been reported to the data providers, and will hopefully be fixed in a coming version.
-
-    # First, check that the discrepancy exists in the current data.
-    tb = tb_co2_with_regions.copy()
-    consumption_emissions_africa = tb[(tb["country"] == "Africa") & (tb["year"] == 2020)][
-        "consumption_emissions"
-    ].item()
-    consumption_emissions_africa_gcp = tb[(tb["country"] == "Africa (GCP)") & (tb["year"] == 2020)][
-        "consumption_emissions"
-    ].item()
-    error = (
-        "Discrepancy in consumption emissions between aggregated Africa and Africa (GCP) no longer exists. "
-        "Remove temporary fix."
-    )
-    assert (
-        consumption_emissions_africa_gcp - consumption_emissions_africa
-    ) / consumption_emissions_africa_gcp > 0.23, error
-
-    # Replace consumption emissions for "Africa" by those of "Africa (GCP)".
-    consumption_emissions = tb[tb["country"] != "Africa"][["country", "year", "consumption_emissions"]].reset_index(
-        drop=True
-    )
-    consumption_emissions_for_africa = (
-        consumption_emissions[consumption_emissions["country"] == "Africa (GCP)"]
-        .reset_index(drop=True)
-        .replace({"Africa (GCP)": "Africa"})
-    )
-    consumption_emissions = pr.concat([consumption_emissions, consumption_emissions_for_africa], ignore_index=True)
-    # Replace consumption emissions in main table by the fixed one.
-    tb = tb.drop(columns="consumption_emissions").merge(consumption_emissions, on=["country", "year"], how="outer")
-
-    # Sanity checks.
-    # All columns except consumption_emissions should be identical to the original.
-    error = "Mismatch before and after fixing consumption emissions for Africa."
-    for col in tb.drop(columns=["consumption_emissions"]).columns:
-        assert (
-            tb[col].dropna().reset_index(drop=True) == tb_co2_with_regions[col].dropna().reset_index(drop=True)
-        ).all()
-    # Consumption emissions should be identical to the original except for Africa.
-    assert (
-        tb[tb["country"] != "Africa"]["consumption_emissions"].dropna().reset_index(drop=True)
-        == tb_co2_with_regions[tb_co2_with_regions["country"] != "Africa"]["consumption_emissions"]
-        .dropna()
-        .reset_index(drop=True)
-    ).all()
-
-    return tb
-
-
-def combine_data_and_add_variables(
-    tb_co2: Table,
-    tb_production: Table,
-    tb_consumption: Table,
-    tb_global_emissions: Table,
-    tb_land_use: Table,
-    tb_energy: Table,
-    ds_gdp: Dataset,
-    ds_population: Dataset,
-    ds_regions: Dataset,
-    ds_income_groups: Dataset,
-) -> Table:
-    """Combine all relevant data into one table, add region aggregates, and add custom variables (e.g. emissions per
-    capita).
-
-    Parameters
-    ----------
-    tb_co2 : Table
-        Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file), after harmonization.
-    tb_production : Table
-        Production-based emissions from GCP's official national emissions dataset (excel file), after harmonization.
-    tb_consumption : Table
-        Consumption-based emissions from GCP's official national emissions dataset (excel file), after harmonization.
-    tb_global_emissions : Table
-        World emissions (including bunker and land-use change emissions).
-    tb_land_use : Table
-        National land-use change emissions from GCP's official dataset (excel file), after harmonization.
-    tb_energy : Table
-        Primary energy data.
-    ds_gdp : Dataset
-        GDP dataset.
-    ds_population : Dataset
-        Population dataset.
-    ds_regions : Dataset
-        Regions dataset.
-    ds_income_groups : Dataset
-        Income groups dataset.
-
-    Returns
-    -------
-    tb_co2_with_regions : Table
-        Combined data, with all additional variables and with region aggregates.
-
-    """
-    tb_co2_with_regions = tb_co2.copy()
-
-    # Add region aggregates that were included in the national emissions file, but not in the Fossil CO2 emissions file.
-    gcp_aggregates = sorted(set(tb_production["country"]) - set(tb_co2_with_regions["country"]))
-    # NOTE: Here, "International transport" is included. As a result, total emissions will have data both for
-    # international aviation and shipping, and for international transport (which is the sum of the former two).
-    # International transport will be removed later from those columns where this duplication happens.
-    tb_co2_with_regions = pr.concat(
-        [
-            tb_co2_with_regions,
-            tb_production[tb_production["country"].isin(gcp_aggregates)]
-            .rename(columns={"production_emissions": "emissions_total"})
-            .astype({"year": int}),
-        ],
-        ignore_index=True,
-        short_name=paths.short_name,
-    ).reset_index(drop=True)
-
-    # Add consumption emissions to main table (keep only the countries of the main table).
-    # Given that additional GCP regions (e.g. "Africa (GCP)") have already been added to tb_co2
-    # (when merging with tb_production), all countries from tb_consumption should be included in tb_co2.
-    error = "Some countries in tb_consumption are not included in tb_co2."
-    assert set(tb_consumption["country"]) < set(tb_co2_with_regions["country"]), error
-    tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_consumption, on=["country", "year"], how="outer")
-
-    # Add population to original table.
-    tb_co2_with_regions = geo.add_population_to_table(
-        tb=tb_co2_with_regions, ds_population=ds_population, warn_on_missing_countries=False
-    )
-
-    # Add GDP to main table.
-    tb_co2_with_regions = geo.add_gdp_to_table(tb=tb_co2_with_regions, ds_gdp=ds_gdp)
-
-    # Add primary energy to main table.
-    tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_energy, on=["country", "year"], how="left")
-
-    # For convenience, rename columns in land-use change emissions data.
-    tb_land_use = tb_land_use.rename(
-        columns={"emissions": "emissions_from_land_use_change", "quality_flag": "land_use_change_quality_flag"}
-    )
-
-    # Land-use change data does not include data for the World. Include it by merging with the global dataset.
-    tb_land_use = pr.concat(
-        [
-            tb_land_use,
-            tb_global_emissions.rename(
-                columns={"global_emissions_from_land_use_change": "emissions_from_land_use_change"}
-            )[["year", "emissions_from_land_use_change"]]
-            .dropna()
-            .assign(**{"country": "World"}),
-        ],
-        ignore_index=True,
-    ).astype({"year": int})
-
-    # Add land-use change emissions to main table.
-    tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_land_use, on=["country", "year"], how="outer")
-
-    # Add total emissions (including land-use change) for each country.
-    tb_co2_with_regions["emissions_total_including_land_use_change"] = (
-        tb_co2_with_regions["emissions_total"] + tb_co2_with_regions["emissions_from_land_use_change"]
-    )
-
-    # Add region aggregates.
-    # Aggregate not only emissions data, but also population, gdp and primary energy.
-    # This way we ensure that custom regions (e.g. "North America (excl. USA)") will have all required data.
USA)") will have all required data. - aggregations = { - column: "sum" - for column in tb_co2_with_regions.columns - if column not in ["country", "year", "land_use_change_quality_flag"] - } - for region in REGIONS: - countries_in_region = geo.list_members_of_region( - region=region, - ds_regions=ds_regions, - ds_income_groups=ds_income_groups, - additional_regions=REGIONS[region].get("additional_regions", None), - excluded_regions=REGIONS[region].get("excluded_regions", None), - additional_members=REGIONS[region].get("additional_members", None), - excluded_members=REGIONS[region].get("excluded_members", None), - include_historical_regions_in_income_groups=True, - ) - tb_co2_with_regions = geo.add_region_aggregates( - df=tb_co2_with_regions, - region=region, - countries_in_region=countries_in_region, - countries_that_must_have_data=[], - frac_allowed_nans_per_year=0.999, - aggregations=aggregations, - ) - - # Fix consumption emissions for Africa. - tb_co2_with_regions = fix_consumption_emissions_for_africa(tb_co2_with_regions=tb_co2_with_regions) - - # Temporarily add global emissions and global cumulative emissions columns to main table, to be able to calculate - # indicators in terms of global emissions. - tb_co2_with_regions = pr.merge( - tb_co2_with_regions, tb_global_emissions.drop(columns="country"), on=["year"], how="left" - ) - - # Temporarily add certain global emissions variables. - # This is done simply to be able to consider "consumption_emissions" as just another type of emission - # when creating additional variables. - tb_co2_with_regions["global_consumption_emissions"] = tb_co2_with_regions["global_emissions_total"].copy() - tb_co2_with_regions["global_cumulative_consumption_emissions"] = tb_co2_with_regions[ - "global_cumulative_emissions_total" - ].copy() - - # Ensure main table is sorted (so that cumulative emissions are properly calculated). - tb_co2_with_regions = tb_co2_with_regions.sort_values(["country", "year"]).reset_index(drop=True) - - # Add new variables for each source of emissions. - for column in EMISSION_SOURCES + [ - "consumption_emissions", - "emissions_from_land_use_change", - "emissions_total_including_land_use_change", - ]: - # Add per-capita variables. - tb_co2_with_regions[f"{column}_per_capita"] = tb_co2_with_regions[column] / tb_co2_with_regions["population"] - - # Add columns for cumulative emissions. - # Rows that had nan emissions will have nan cumulative emissions. - # But nans will not be propagated in the sum. - # This means that countries with some (not all) nans will have the cumulative sum of the informed emissions - # (treating nans as zeros), but will have nan on those rows that were not informed. - tb_co2_with_regions[f"cumulative_{column}"] = tb_co2_with_regions.groupby(["country"])[column].cumsum() - - # Add share of global emissions. - tb_co2_with_regions[f"{column}_as_share_of_global"] = ( - 100 * tb_co2_with_regions[column] / tb_co2_with_regions[f"global_{column}"] - ) - - # Add share of global cumulative emissions. - tb_co2_with_regions[f"cumulative_{column}_as_share_of_global"] = ( - 100 * tb_co2_with_regions[f"cumulative_{column}"] / tb_co2_with_regions[f"global_cumulative_{column}"] - ) - - # Add total emissions per unit energy (in kg of emissions per kWh). 
- tb_co2_with_regions["emissions_total_per_unit_energy"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 - * tb_co2_with_regions["emissions_total"] - / (tb_co2_with_regions["primary_energy_consumption"] * TWH_TO_KWH) - ) - - # Add total emissions (including land-use change) per unit energy (in kg of emissions per kWh). - tb_co2_with_regions["emissions_total_including_land_use_change_per_unit_energy"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 - * tb_co2_with_regions["emissions_total_including_land_use_change"] - / (tb_co2_with_regions["primary_energy_consumption"] * TWH_TO_KWH) - ) - - # Add total emissions per unit GDP. - tb_co2_with_regions["emissions_total_per_gdp"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 * tb_co2_with_regions["emissions_total"] / tb_co2_with_regions["gdp"] - ) - - # Add total emissions (including land-use change) per unit GDP. - tb_co2_with_regions["emissions_total_including_land_use_change_per_gdp"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 - * tb_co2_with_regions["emissions_total_including_land_use_change"] - / tb_co2_with_regions["gdp"] - ) - - # Add total consumption emissions per unit GDP. - tb_co2_with_regions["consumption_emissions_per_gdp"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 * tb_co2_with_regions["consumption_emissions"] / tb_co2_with_regions["gdp"] - ) - - # Add variable of emissions embedded in trade. - tb_co2_with_regions["traded_emissions"] = ( - tb_co2_with_regions["consumption_emissions"] - tb_co2_with_regions["emissions_total"] - ) - tb_co2_with_regions["pct_traded_emissions"] = ( - 100 * tb_co2_with_regions["traded_emissions"] / tb_co2_with_regions["emissions_total"] - ) - tb_co2_with_regions["traded_emissions_per_capita"] = ( - tb_co2_with_regions["traded_emissions"] / tb_co2_with_regions["population"] - ) - - # Add annual percentage growth of total emissions. - tb_co2_with_regions["pct_growth_emissions_total"] = ( - tb_co2_with_regions.groupby("country")["emissions_total"].pct_change() * 100 - ) - - # Add annual percentage growth of total emissions (including land-use change). - tb_co2_with_regions["pct_growth_emissions_total_including_land_use_change"] = ( - tb_co2_with_regions.groupby("country")["emissions_total_including_land_use_change"].pct_change() * 100 - ) - - # Add annual absolute growth of total emissions. - tb_co2_with_regions["growth_emissions_total"] = tb_co2_with_regions.groupby("country")["emissions_total"].diff() - - # Add annual absolute growth of total emissions (including land-use change). - tb_co2_with_regions["growth_emissions_total_including_land_use_change"] = tb_co2_with_regions.groupby("country")[ - "emissions_total_including_land_use_change" - ].diff() - - # Create variable of population as a share of global population. - tb_co2_with_regions["population_as_share_of_global"] = ( - tb_co2_with_regions["population"] / tb_co2_with_regions["global_population"] * 100 - ) - - # Remove temporary columns of global emissions. - tb_co2_with_regions = tb_co2_with_regions.drop( - columns=[column for column in tb_co2_with_regions.columns if column.startswith("global_")] - ) - - # Empty rows of international transport if international aviation and shipping are already informed. - # First find the list of columns where this happens. 
- international_entities = [entity for entity in set(tb_co2_with_regions["country"]) if "International" in entity] - check = tb_co2_with_regions[tb_co2_with_regions["country"].isin(international_entities)].reset_index(drop=True) - # Check that the only columns where international transport, aviation and shipping are all informed are columns - # derived from total emissions. - columns_with_redundant_international_emissions = [ - column - for column in check.drop(columns=["country", "year"]).columns - if set(check.dropna(subset=column)["country"]) == set(international_entities) - ] - error = ( - "Unexpected columns where international transport is informed as well as international aviation and shipping." - ) - assert all(["emissions_total" in column for column in columns_with_redundant_international_emissions]), error - # Now for those columns, make international transport nan. - for column in columns_with_redundant_international_emissions: - tb_co2_with_regions.loc[tb_co2_with_regions["country"] == "International transport", column] = np.nan - - # Replace infinity values (for example when calculating growth from zero to non-zero) in the data by nan. - for column in tb_co2_with_regions.drop(columns=["country", "year"]).columns: - tb_co2_with_regions.loc[np.isinf(tb_co2_with_regions[column]), column] = np.nan - - # For special GCP countries/regions (e.g. "Europe (GCP)") we should keep only the original data. - # Therefore, make nan all additional variables for those countries/regions, and keep only GCP's original data. - added_variables = tb_co2_with_regions.drop( - columns=["country", "year"] + COLUMNS_THAT_MUST_HAVE_DATA - ).columns.tolist() - tb_co2_with_regions.loc[ - (tb_co2_with_regions["country"].str.contains(" (GCP)", regex=False)), added_variables - ] = np.nan - - # Remove uninformative rows (those that have only data for, say, gdp, but not for variables related to emissions). - tb_co2_with_regions = tb_co2_with_regions.dropna(subset=COLUMNS_THAT_MUST_HAVE_DATA, how="all").reset_index( - drop=True - ) - - # Set an appropriate index, ensure there are no rows that only have nan, and sort conveniently. - tb_co2_with_regions = tb_co2_with_regions.set_index(["country", "year"], verify_integrity=True) - tb_co2_with_regions = ( - tb_co2_with_regions.dropna(subset=tb_co2_with_regions.columns, how="all").sort_index().sort_index(axis=1) - ) - - # Rename table. 
- tb_co2_with_regions.metadata.short_name = paths.short_name - - return tb_co2_with_regions diff --git a/etl/steps/archive/garden/health/2022-12-28/deaths_karlinsky.countries.json b/etl/steps/archive/garden/health/2022-12-28/deaths_karlinsky.countries.json deleted file mode 100644 index 0d7a9133f62..00000000000 --- a/etl/steps/archive/garden/health/2022-12-28/deaths_karlinsky.countries.json +++ /dev/null @@ -1,183 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "American Samoa": "American Samoa", - "Andorra": "Andorra", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Brunei": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Canada": "Canada", - "Central African Republic": "Central African Republic", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cyprus": "Cyprus", - "Czechia": "Czechia", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Guiana": "French Guiana", - "French Polynesia": "French Polynesia", - "Georgia": "Georgia", - "Germany": "Germany", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guam": "Guam", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guyana": "Guyana", - "Honduras": "Honduras", - "Hong Kong": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Ivory Coast": "Cote d'Ivoire", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Laos": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macao": "Macao", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Malta": "Malta", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mayotte": "Mayotte", - "Mexico": "Mexico", - "Moldova": "Moldova", - "Monaco": "Monaco", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Morocco": "Morocco", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Nigeria": "Nigeria", - "North Macedonia": "North Macedonia", - "Northern Mariana Islands": "Northern Mariana Islands", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Palestine": 
"Palestine", - "Panama": "Panama", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Romania": "Romania", - "Russia": "Russia", - "Rwanda": "Rwanda", - "R\u00e9union": "Reunion", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "San Marino": "San Marino", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Singapore": "Singapore", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "South Africa": "South Africa", - "South Korea": "South Korea", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Suriname": "Suriname", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syria": "Syria", - "Taiwan": "Taiwan", - "Tajikistan": "Tajikistan", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United States": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela": "Venezuela", - "Vietnam": "Vietnam", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "Bosnia": "Bosnia and Herzegovina", - "Curaco": "Curacao", - "Democractic Republic of the Congo": "Democratic Republic of Congo", - "US Virgin Islands": "United States Virgin Islands" -} diff --git a/etl/steps/archive/garden/health/2022-12-28/deaths_karlinsky.meta.yml b/etl/steps/archive/garden/health/2022-12-28/deaths_karlinsky.meta.yml deleted file mode 100644 index 8863fd6c923..00000000000 --- a/etl/steps/archive/garden/health/2022-12-28/deaths_karlinsky.meta.yml +++ /dev/null @@ -1,44 +0,0 @@ -tables: - deaths: - title: International Completeness of Death Registration 2015-2019 – Karlinsky (2021) - description: > - Contains all metrics used in the International Completeness of Death Registration 2015-2019 paper. - - Note that there is missing data for some country-years. - variables: - # Life expectancy - reg_deaths: - title: Registered deaths - unit: "deaths" - description: > - The number of deaths registered in the vital registration system for the country-year. - expected_wpp: - title: Expected number of deaths (WPP 2019) - unit: deaths - description: > - The number of deaths expected to occur for the country-year from World Population Prospects 2019. - - expected_gbd: - title: Expected number of deaths (GBD 2019) - unit: deaths - description: > - The number of deaths expected to occur for the country-year from Global Burden of Disease 2019. - - expected_ghe: - title: Expected number of deaths (GHE 2019) - unit: deaths - description: > - The number of deaths expected to occur for the country-year from Global Health Estimates 2019. - - expected_deaths: - title: Expected number of deaths - unit: deaths - description: > - The mean of "Expected number of deaths (WPP 2019)", "Expected number of deaths (GBD 2019)" and "Expected number of deaths (GHE 2019)". 
- - death_comp: - title: Estimated completeness of death reporting - short_unit: "%" - unit: "%" - description: > - It ranges from 0 to 100. Derived as "Registered deaths" divided by "Expected number of deaths". See paper (https://www.medrxiv.org/content/10.1101/2021.08.12.21261978v1) for details. diff --git a/etl/steps/archive/garden/health/2022-12-28/deaths_karlinsky.py b/etl/steps/archive/garden/health/2022-12-28/deaths_karlinsky.py deleted file mode 100644 index 6167543bf05..00000000000 --- a/etl/steps/archive/garden/health/2022-12-28/deaths_karlinsky.py +++ /dev/null @@ -1,108 +0,0 @@ -import pandas as pd -from owid.catalog import Dataset, Table -from structlog import get_logger - -from etl.data_helpers import geo -from etl.helpers import PathFinder -from etl.paths import DATA_DIR - -log = get_logger() - -# naming conventions -N = PathFinder(__file__) -MEADOW_DATASET = "meadow/health/2022-12-28/deaths_karlinsky" -TABLE_NAME = "deaths" - - -def run(dest_dir: str) -> None: - # read dataset from meadow - log.info("deaths_karlinsky: loading meadow table...") - ds_meadow = Dataset(DATA_DIR / MEADOW_DATASET) - tb_meadow = ds_meadow[TABLE_NAME] - - # clean dataframe - log.info("karlinsky: cleaning dataframe...") - df = clean_dataframe(tb_meadow) - - # sanity checks - log.info("karlinsky: sanity checking...") - sanity_check(df) - - # create new dataset with the same metadata as meadow - ds_garden = Dataset.create_empty(dest_dir, metadata=ds_meadow.metadata) - - # create new table with the same metadata as meadow and add it to dataset - tb_garden = Table(df, short_name=TABLE_NAME) - ds_garden.add(tb_garden) - - # update metadata from yaml file - ds_garden.update_metadata(N.metadata_path) - - ds_garden.save() - - log.info("deaths_karlinsky.end") - - -def clean_dataframe(tb: Table) -> pd.DataFrame: - # convert table to dataframe - df = pd.DataFrame(tb) - # drop and rename columns - df = df.drop(columns=["continent", "source"]) - df = df.rename(columns={"country_name": "country"}) - # harmonize country names - df = harmonize_countries(df) - # set indexes - df = df.set_index(["country", "year"]).sort_index() - return df - - -def harmonize_countries(df: pd.DataFrame) -> pd.DataFrame: - unharmonized_countries = df["country"] - df = geo.harmonize_countries( - df=df, - countries_file=str(N.country_mapping_path), - warn_on_missing_countries=True, - make_missing_countries_nan=True, - ) - - missing_countries = set(unharmonized_countries[df["country"].isnull()]) - if any(missing_countries): - raise RuntimeError( - "The following raw country names have not been harmonized. " - f"Please: (a) edit {N.country_mapping_path} to include these country " - f"names; or (b) add them to {N.excluded_countries_path}." 
- f"Raw country names: {missing_countries}" - ) - - return df - - -def sanity_check(df: pd.DataFrame) -> None: - # check columns - columns_expected = { - "death_comp", - "expected_deaths", - "expected_gbd", - "expected_ghe", - "expected_wpp", - "reg_deaths", - } - columns_new = set(df.columns).difference(columns_expected) - if columns_new: - raise ValueError(f"Unexpected columns {columns_new}") - - # ensure percentages make sense (within range [0, 100]) - columns_perc = ["death_comp"] - for col in columns_perc: - assert all(df[col] <= 100), f"{col} has values larger than 100%" - assert all(df[col] >= 0), f"{col} has values lower than 0%" - - # ensure absolute values make sense (positive, lower than population) - columns_absolute = [col for col in df.columns if col not in columns_perc] - df_ = df.reset_index() - df_ = geo.add_population_to_dataframe(df_) - for col in columns_absolute: - x = df_.dropna(subset=[col]) - assert all( - x[col] < 0.2 * x["population"] - ), f"{col} contains values that might be too large (compared to population values)!" diff --git a/etl/steps/archive/garden/hmd/2022-11-04/life_tables.countries.json b/etl/steps/archive/garden/hmd/2022-11-04/life_tables.countries.json deleted file mode 100644 index 2a5e4a7cb7a..00000000000 --- a/etl/steps/archive/garden/hmd/2022-11-04/life_tables.countries.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "Australia": "Australia", - "Austria": "Austria", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Bulgaria": "Bulgaria", - "Canada": "Canada", - "Chile": "Chile", - "Croatia": "Croatia", - "Czechia": "Czechia", - "Denmark": "Denmark", - "East Germany": "East Germany", - "Estonia": "Estonia", - "Finland": "Finland", - "Germany": "Germany", - "Greece": "Greece", - "Hong Kong": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "Ireland": "Ireland", - "Japan": "Japan", - "Latvia": "Latvia", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Netherlands": "Netherlands", - "New Zealand": "New Zealand", - "Norway": "Norway", - "Poland": "Poland", - "Portugal": "Portugal", - "Republic of Korea": "South Korea", - "Russia": "Russia", - "Slovenia": "Slovenia", - "Spain": "Spain", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Taiwan": "Taiwan", - "Ukraine": "Ukraine", - "United Kingdom": "United Kingdom", - "West Germany": "West Germany", - "France, Total Population": "France", - "Israel, Total Population": "Israel", - "Italy ": "Italy", - "Slovakia ": "Slovakia", - "The United States of America": "United States" -} \ No newline at end of file diff --git a/etl/steps/archive/garden/hmd/2022-11-04/life_tables.excluded_countries.json b/etl/steps/archive/garden/hmd/2022-11-04/life_tables.excluded_countries.json deleted file mode 100644 index e1ca439c84d..00000000000 --- a/etl/steps/archive/garden/hmd/2022-11-04/life_tables.excluded_countries.json +++ /dev/null @@ -1,9 +0,0 @@ -[ - "England and Wales, Civilian National Population", - "England and Wales, Total Population", - "France, Civilian Population", - "New Zealand -- Maori", - "New Zealand -- Non-Maori", - "Northern Ireland", - "Scotland" -] diff --git a/etl/steps/archive/garden/hmd/2022-11-04/life_tables.meta.yml b/etl/steps/archive/garden/hmd/2022-11-04/life_tables.meta.yml deleted file mode 100644 index 6c9c9900bc0..00000000000 --- a/etl/steps/archive/garden/hmd/2022-11-04/life_tables.meta.yml +++ /dev/null @@ -1,66 +0,0 @@ -dataset: {} - -tables: - period_1x1: - title: Life tables (by age and year) - description: | - Data is provided in one-year age groups and 
one-year time interval. - variables: &variables-default - central_death_rate: - title: Central death rate - description: Central death rate between ages x and x+n. - probability_of_death: - title: Probability of death - description: Probability of death between ages x and x+n. - avg_survival_length: - title: Average survival length - short_unit: years - unit: years - description: Average length of survival between ages x and x+n for persons dying in the interval. - num_survivors: - title: Number of survivors - unit: survivors - description: Number of survivors at exact age x, assuming survivors at 0 years old is 100,000. - num_deaths: - title: Number of deaths - short_unit: deaths - unit: deaths - description: Number of deaths between ages x and x+n. - num_person_years_lived: - title: Number of person-years lived - unit: person-years - description: Number of person-years lived between ages x and x+n. - num_person_years_remaining: - title: Number of person-years remaining - unit: person-years - description: Number of person-years remaining after exact age x. - life_expectancy: - title: Life expectancy - short_unit: years - unit: years - description: Life expectancy at exact age x (in years). - period_1x5: - title: Life tables (by age and 5-year time interval) - description: | - Data is provided in one-year age groups and five-year time interval. - variables: *variables-default - period_1x10: - title: Life tables (by age and 10-year time interval) - description: | - Data is provided in one-year age groups and ten-year time interval. - variables: *variables-default - period_5x1: - title: Life tables (by 5-year age group and year) - description: | - Data is provided in five-year age groups and one-year time interval. - variables: *variables-default - period_5x5: - title: Life tables (by 5-year age group and 5-year time interval) - description: | - Data is provided in five-year age groups and five-year time interval. - variables: *variables-default - period_5x10: - title: Life tables (by 5-year age group and 10-year time interval) - description: | - Data is provided in five-year age groups and ten-year time interval. 
- variables: *variables-default diff --git a/etl/steps/archive/garden/hmd/2022-11-04/life_tables.py b/etl/steps/archive/garden/hmd/2022-11-04/life_tables.py deleted file mode 100644 index 8ede77cb525..00000000000 --- a/etl/steps/archive/garden/hmd/2022-11-04/life_tables.py +++ /dev/null @@ -1,87 +0,0 @@ -import json -from typing import List, cast - -import pandas as pd -from owid.catalog import Dataset, Table -from structlog import get_logger - -from etl.data_helpers import geo -from etl.helpers import PathFinder -from etl.paths import DATA_DIR - -log = get_logger() - -# naming conventions -N = PathFinder(__file__) - -SHORT_NAME = "life_tables" -MEADOW_VERSION = "2022-11-04" -MEADOW_DATASET = DATA_DIR / f"meadow/hmd/{MEADOW_VERSION}/{SHORT_NAME}" - - -def run(dest_dir: str) -> None: - log.info("life_tables.start") - - # read dataset from meadow - ds_meadow = Dataset(MEADOW_DATASET) - - # init dataset - ds_garden = Dataset.create_empty(dest_dir, metadata=ds_meadow.metadata) - - # build tables - tables_names = ds_meadow.table_names - for table_name in tables_names: - log.info(f"life_tables:{table_name}.start") - tb_garden = make_table(ds_meadow, table_name) - ds_garden.add(tb_garden) - log.info(f"life_tables:{table_name}.end") - - ds_garden.update_metadata(N.metadata_path) - ds_garden.save() - - log.info("life_tables.end") - - -def make_table(ds_meadow: Dataset, table_name: str) -> Table: - log.info(f"Building table {table_name}...") - - # Country management - tb_garden = ds_meadow[table_name].reset_index() - tb_garden = clean_countries(tb_garden) - tb_garden = tb_garden.set_index(["country", "year", "age"], verify_integrity=True) - - return tb_garden - - -def clean_countries(df: pd.DataFrame) -> pd.DataFrame: - df = exclude_countries(df) - df = harmonize_countries(df) - return df - - -def load_excluded_countries() -> List[str]: - with open(N.excluded_countries_path, "r") as f: - data = json.load(f) - assert isinstance(data, list) - return data - - -def exclude_countries(df: pd.DataFrame) -> pd.DataFrame: - excluded_countries = load_excluded_countries() - return cast(pd.DataFrame, df.loc[~df.country.isin(excluded_countries)]) - - -def harmonize_countries(df: pd.DataFrame) -> pd.DataFrame: - unharmonized_countries = df["country"] - df = geo.harmonize_countries(df=df, countries_file=str(N.country_mapping_path)) - - missing_countries = set(unharmonized_countries[df.country.isnull()]) - if any(missing_countries): - raise RuntimeError( - "The following raw country names have not been harmonized. " - f"Please: (a) edit {N.country_mapping_path} to include these country " - f"names; or (b) add them to {N.excluded_countries_path}." 
- f"Raw country names: {missing_countries}" - ) - - return df diff --git a/etl/steps/archive/garden/homicide/2023-01-04/unodc.countries.json b/etl/steps/archive/garden/homicide/2023-01-04/unodc.countries.json deleted file mode 100644 index 87d765dd225..00000000000 --- a/etl/steps/archive/garden/homicide/2023-01-04/unodc.countries.json +++ /dev/null @@ -1,212 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "American Samoa": "American Samoa", - "Andorra": "Andorra", - "Angola": "Angola", - "Anguilla": "Anguilla", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia (Plurinational State of)": "Bolivia", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "British Virgin Islands": "British Virgin Islands", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cayman Islands": "Cayman Islands", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Cook Islands": "Cook Islands", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cura\u00e7ao": "Curacao", - "Cyprus": "Cyprus", - "Czechia": "Czechia", - "Denmark": "Denmark", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Guiana": "French Guiana", - "French Polynesia": "French Polynesia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Gibraltar": "Gibraltar", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guam": "Guam", - "Guatemala": "Guatemala", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Holy See": "Vatican", - "Honduras": "Honduras", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran (Islamic Republic of)": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Isle of Man": "Isle of Man", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Liechtenstein": "Liechtenstein", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Martinique": "Martinique", - "Mauritius": "Mauritius", - "Mayotte": "Mayotte", - "Melanesia": "Melanesia", - "Mexico": "Mexico", - "Micronesia (Federated States of)": "Micronesia (country)", - "Monaco": "Monaco", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": 
"Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Republic of Korea": "South Korea", - "Republic of Moldova": "Moldova", - "Romania": "Romania", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "R\u00e9union": "Reunion", - "Saint Helena": "Saint Helena", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Martin (French Part)": "Saint Martin (French part)", - "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "San Marino": "San Marino", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "South Africa": "South Africa", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "State of Palestine": "Palestine", - "Suriname": "Suriname", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syrian Arab Republic": "Syria", - "Tajikistan": "Tajikistan", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkmenistan": "Turkmenistan", - "Turks and Caicos Islands": "Turks and Caicos Islands", - "Tuvalu": "Tuvalu", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", - "United Republic of Tanzania": "Tanzania", - "United States Virgin Islands": "United States Virgin Islands", - "United States of America": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela (Bolivarian Republic of)": "Venezuela", - "Viet Nam": "Vietnam", - "World": "World", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "China, Hong Kong Special Administrative Region": "Hong Kong", - "China, Macao Special Administrative Region": "Macao", - "China, Taiwan Province of China": "Taiwan", - "Kosovo under UNSCR 1244": "Kosovo", - "T\u00fcrkiye": "Turkey", - "All Africa": "Africa (UN)", - "All Americas": "Americas (UN)", - "All Asia": "Asia (UN)", - "All Europe": "Europe (UN)", - "All Oceania": "Oceania (UN)", - "Channel Island": "Channel Islands" -} diff --git a/etl/steps/archive/garden/homicide/2023-01-04/unodc.excluded_countries.json b/etl/steps/archive/garden/homicide/2023-01-04/unodc.excluded_countries.json deleted file mode 100644 index 0f50eedb994..00000000000 --- a/etl/steps/archive/garden/homicide/2023-01-04/unodc.excluded_countries.json +++ /dev/null @@ -1,21 +0,0 @@ -[ - "Australia and New Zealand", - "Central Asia", - "Eastern Asia", - "Eastern Europe", - "Iraq (Central Iraq)", - "Iraq (Kurdistan Region)", 
- "Latin America and the Caribbean", - "Northern Africa", - "Northern America", - "Northern Europe", - "South-eastern Asia", - "Southern Asia", - "Southern Europe", - "Sub-Saharan Africa", - "United Kingdom (England and Wales)", - "United Kingdom (Northern Ireland)", - "United Kingdom (Scotland)", - "Western Asia", - "Western Europe" -] \ No newline at end of file diff --git a/etl/steps/archive/garden/homicide/2023-01-04/unodc.meta.yml b/etl/steps/archive/garden/homicide/2023-01-04/unodc.meta.yml deleted file mode 100644 index 24cac9c29d4..00000000000 --- a/etl/steps/archive/garden/homicide/2023-01-04/unodc.meta.yml +++ /dev/null @@ -1,230 +0,0 @@ -dataset: - namespace: homicide - short_name: unodc - title: United Nations Office on Drugs and Crime - Intentional Homicides - description: > - The United Nations Office on Drugs and Crime Intentional Homicide data are sourced from either criminal justice or public health systems. In the former, data are generated by law enforcement or criminal justice authorities in the process of recording and investigating a crime event, whereas in the latter, data are produced by health authorities certifying the cause of death of an individual. - - - The criminal justice data was collected from national authorities with the annual United Nations Survey of Crime Trends and Operations of Criminal Justice Systems (UN-CTS). National focal points working in national agencies responsible for statistics on crime and the criminal justice system and nominated by the Permanent Mission to UNODC are responsible for compiling the data from the other relevant agencies before transmitting the UN-CTS to UNODC. Following the submission, UNODC checks for consistency and coherence with other data sources. - - - Data on homicide from public health sources were primarily obtained from the WHO Mortality Database.10 This dataset is a comprehensive collection of mortality data by cause of death, sex, and age group conducted yearly by the WHO with Member States. Deaths coded with Internatioanl Classification of Disease (ICD10) codes X85-Y09 (injuries inflicted by another person with intent to injure or kill), and ICD10 code Y87.1 (sequelae of assault), generally correspond to the definition of intentional homicide - - - The population data used to calculate homicide rates is sourced from the World Population Prospect, Population Division, United Nations Department of Economic and Social Affairs. - - - The statistical definition contains three elements that characterize the killing of a person as “intentional homicide”: - - - 1. The killing of a person by another person (objective element). - - - 2. The intent of the perpetrator to kill or seriously injure the victim (subjective element). - - - 3. The unlawfulness of the killing (legal element). - - - For recording purposes, all killings that meet the criteria listed above are to be considered intentional homicides, irrespective of definitions provided by national legislations or practices. Killings as a result of terrorist activities are also to be classified as a form of intentional homicide. - - - In order to compile consistent time series of total homicides back to 1990, in several cases data from multiple sources were combined to expand the number of available years within a country’s time series. Time series adjustments were performed when a country had two sources coveringdifferent year-ranges, which had very similar trends in an overlapping time period, but where these trends were at different levels. 
- - - The countries for which adjusted series for total homicide counts prior to the year 2000 have been produced were the following: Belgium, Brazil, China, Ecuador, Germany, Netherlands, New Zealand, Portugal, South Korea, Spain, Thailand, and United Kingdom. - - licenses: - - url: https://www.un.org/en/about-us/terms-of-use - version: '2023-01-04' - sources: - - name: United Nations Office on Drugs and Crime (2022) - url: https://dataunodc.un.org/dp-intentional-homicide-victims - source_data_url: https://dataunodc.un.org/sites/dataunodc.un.org/files/data_cts_intentional_homicide.xlsx - date_accessed: '2023-01-04' - publication_date: '2022-08-08' - publication_year: 2022 -tables: - unodc: - variables: - firearms_counts: - title: Total homicides by firearm - unit: homicides - display: - numDecimalPlaces: 0 - sharp_object_counts: - title: Total homicides by sharp object - unit: homicides - display: - numDecimalPlaces: 0 - unspecified_means_counts: - title: Total homicides by unspecified means - unit: homicides - display: - numDecimalPlaces: 0 - without_a_weapon__other_mechanism_counts: - title: Total homicides without a weapon - unit: homicides - display: - numDecimalPlaces: 0 - firearms_or_explosives_counts: - title: Total homicides by firearms or explosives - unit: homicides - display: - numDecimalPlaces: 0 - another_weapon_counts: - title: Total homicides by any object used as a weapon, including sharp and blunt objects and motor vehicles - unit: homicides - display: - numDecimalPlaces: 0 - firearms_rate_per_100_000_population: - title: Homicides by firearm per 100,000 population - unit: homicides per 100,000 people - display: - numDecimalPlaces: 2 - sharp_object_rate_per_100_000_population: - title: Homicides by sharp object per 100,000 population - unit: homicides per 100,000 people - display: - numDecimalPlaces: 2 - unspecified_means_rate_per_100_000_population: - title: Homicides by unspecified means per 100,000 population - unit: homicides per 100,000 people - display: - numDecimalPlaces: 2 - without_a_weapon__other_mechanism_rate_per_100_000_population: - title: Homicides without a weapon per 100,000 population - unit: homicides per 100,000 people - display: - numDecimalPlaces: 2 - firearms_or_explosives_rate_per_100_000_population: - title: Homicides by firearms or explosives per 100,000 population - unit: homicides per 100,000 people - display: - numDecimalPlaces: 2 - another_weapon_rate_per_100_000_population: - title: Homicides by sharp or blunt objects, including motor vehicles, per 100,000 population - unit: homicides per 100,000 people - display: - numDecimalPlaces: 2 - counts: - title: Homicides - unit: homicides - display: - numDecimalPlaces: 0 - female_all_ages_counts: - title: Total homicides where the victim was female - unit: homicides - display: - numDecimalPlaces: 0 - female_30_44_counts: - title: Total homicides where the victim was female and aged 30-44 - unit: homicides - display: - numDecimalPlaces: 0 - female_45_59_counts: - title: Total homicides where the victim was female and aged 45-59 - unit: homicides - display: - numDecimalPlaces: 0 - female_60_and_older_counts: - title: Total homicides where the victim was female and aged 60+ - unit: homicides - display: - numDecimalPlaces: 0 - female_0_9_counts: - title: Total homicides where the victim was female and aged 0-9 - unit: homicides - display: - numDecimalPlaces: 0 - female_10__14_counts: - title: Total homicides where the victim was female and aged 10-14 - unit: homicides - display: - numDecimalPlaces: 0 
- female_15__17_counts: - title: Total homicides where the victim was female and aged 15-17 - unit: homicides - display: - numDecimalPlaces: 0 - female_18_19_counts: - title: Total homicides where the victim was female and aged 18-19 - unit: homicides - display: - numDecimalPlaces: 0 - female_20_24_counts: - title: Total homicides where the victim was female and aged 20-24 - unit: homicides - display: - numDecimalPlaces: 0 - female_25_29_counts: - title: Total homicides where the victim was female and aged 25-29 - unit: homicides - display: - numDecimalPlaces: 0 - male_all_ages_counts: - title: Total homicides where the victim was male - unit: homicides - display: - numDecimalPlaces: 0 - male_30_44_counts: - title: Total homicides where the victim was male and aged 30-44 - unit: homicides - display: - numDecimalPlaces: 0 - male_45_59_counts: - title: Total homicides where the victim was male and aged 45-59 - unit: homicides - display: - numDecimalPlaces: 0 - male_60_and_older_counts: - title: Total homicides where the victim was male and aged 60+ - unit: homicides - display: - numDecimalPlaces: 0 - male_0_9_counts: - title: Total homicides where the victim was male and aged 0-9 - unit: homicides - display: - numDecimalPlaces: 0 - male_10__14_counts: - title: Total homicides where the victim was male and aged 10-14 - unit: homicides - display: - numDecimalPlaces: 0 - male_15__17_counts: - title: Total homicides where the victim was male and aged 15-17 - unit: homicides - display: - numDecimalPlaces: 0 - male_18_19_counts: - title: Total homicides where the victim was male and aged 18-19 - unit: homicides - display: - numDecimalPlaces: 0 - male_20_24_counts: - title: Total homicides where the victim was male and aged 20-24 - unit: homicides - display: - numDecimalPlaces: 0 - male_25_29_counts: - title: Total homicides where the victim was male and aged 25-29 - unit: homicides - display: - numDecimalPlaces: 0 - rate_per_100_000_population: - title: Homicides per 100,000 population - unit: homicides per 100,000 people - display: - numDecimalPlaces: 2 - female_all_ages_rate_per_100_000_population: - title: Total homicides where the victim was female per 100,000 population - unit: homicides per 100,000 people - display: - numDecimalPlaces: 2 - male_all_ages_rate_per_100_000_population: - title: Total homicides where the victim was male per 100,000 population - unit: homicides per 100,000 people - display: - numDecimalPlaces: 2 diff --git a/etl/steps/archive/garden/homicide/2023-01-04/unodc.py b/etl/steps/archive/garden/homicide/2023-01-04/unodc.py deleted file mode 100644 index e2446716e46..00000000000 --- a/etl/steps/archive/garden/homicide/2023-01-04/unodc.py +++ /dev/null @@ -1,166 +0,0 @@ -import json -from typing import List, cast - -import pandas as pd -from owid.catalog import Dataset, Table -from owid.catalog.utils import underscore_table -from structlog import get_logger - -from etl.data_helpers import geo -from etl.helpers import PathFinder -from etl.paths import DATA_DIR - -log = get_logger() - -# naming conventions -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - log.info("unodc.start") - - # read dataset from meadow - ds_meadow = Dataset(DATA_DIR / "meadow/homicide/2023-01-04/unodc") - tb_meadow = ds_meadow["unodc"] - - df = pd.DataFrame(tb_meadow) - - log.info("unodc.exclude_countries") - df = exclude_countries(df) - - log.info("unodc.harmonize_countries") - df = harmonize_countries(df) - - df = clean_data(df) - # create new dataset with the same metadata as 
meadow - ds_garden = Dataset.create_empty(dest_dir, metadata=ds_meadow.metadata) - - # create new table with the same metadata as meadow and add it to dataset - tb_garden = underscore_table(Table(df, short_name=tb_meadow.metadata.short_name)) - ds_garden.add(tb_garden) - - # update metadata from yaml file - ds_garden.update_metadata(paths.metadata_path) - - ds_garden.save() - - log.info("unodc.end") - - -def load_excluded_countries() -> List[str]: - with open(paths.excluded_countries_path, "r") as f: - data = json.load(f) - assert isinstance(data, list) - return data - - -def exclude_countries(df: pd.DataFrame) -> pd.DataFrame: - excluded_countries = load_excluded_countries() - return cast(pd.DataFrame, df.loc[~df.country.isin(excluded_countries)]) - - -def harmonize_countries(df: pd.DataFrame) -> pd.DataFrame: - unharmonized_countries = df["country"] - df = geo.harmonize_countries(df=df, countries_file=str(paths.country_mapping_path)) - - missing_countries = set(unharmonized_countries[df.country.isnull()]) - if any(missing_countries): - raise RuntimeError( - "The following raw country names have not been harmonized. " - f"Please: (a) edit {paths.country_mapping_path} to include these country " - f"names; or (b) add them to {paths.excluded_countries_path}." - f"Raw country names: {missing_countries}" - ) - - return df - - -def clean_data(df: pd.DataFrame) -> pd.DataFrame: - df = df.copy(deep=True) - - # Splitting the data into that which has the totals and that which is disaggregated by mechanism - df_mech = df[df["dimension"] == "by mechanisms"] - - df_mech = create_mechanism_df(df_mech) - - df_tot = df[df["dimension"] == "Total"] - - df_tot = create_total_df(df_tot) - - df = pd.merge(df_mech, df_tot, how="outer", on=["country", "year"]) - - # Reconciling the variable names with previous aggregated version - - df = df.rename( - columns={ - "Both sexes_All ages_Rate per 100,000 population": "Rate per 100,000 population", - "Both sexes_All ages_Counts": "Counts", - } - ) - - return df - - -def pivot_and_format_df(df, drop_columns, pivot_index, pivot_values, pivot_columns): - """ - - Dropping a selection of columns - - Pivoting by the desired disaggregations e.g. 
category, unit of measurement - - Tidying the column names - """ - df = df.drop(columns=drop_columns) - df = df.pivot(index=pivot_index, values=pivot_values, columns=pivot_columns) - # Make the columns nice - df.columns = df.columns.droplevel(0) - df.columns = df.columns.map("_".join) - df = df.reset_index() - return df - - -def create_total_df(df_tot: pd.DataFrame) -> pd.DataFrame: - """ - Create the total homicides dataframe where we will have total homicides/homicide rate - disaggregated by age and sex - """ - # To escape the dataframe slice warnings - df_tot = df_tot.copy(deep=True) - # There are some duplicates when sex is unknown so let's remove those rows - df_tot = df_tot[df_tot["sex"] != "Unknown"] - - # Make it more obvious what total age and total sex means - - df_tot["age"] = df_tot["age"].map({"Total": "All ages"}, na_action="ignore").fillna(df_tot["age"]) - df_tot["sex"] = df_tot["sex"].map({"Total": "Both sexes"}, na_action="ignore").fillna(df_tot["sex"]) - - df_tot = pivot_and_format_df( - df_tot, - drop_columns=["region", "subregion", "indicator", "dimension", "category", "source"], - pivot_index=["country", "year"], - pivot_values=["value"], - pivot_columns=["sex", "age", "unit_of_measurement"], - ) - return df_tot - - -def create_mechanism_df(df_mech: pd.DataFrame) -> pd.DataFrame: - """ - Create the homicides by mechanism dataframe where we will have homicides/homicide rate - disaggregated by mechanism (e.g. weapon) - """ - # df_mech = df_mech.drop(columns=["region", "subregion", "indicator", "dimension", "source", "sex", "age"]) - df_mech = df_mech.copy(deep=True) - df_mech["category"] = ( - df_mech["category"] - .map({"Firearms or explosives - firearms": "Firearms", "Another weapon - sharp object": "Sharp object"}) - .fillna(df_mech["category"]) - ) - - # Make the table wider so we have a column for each mechanism - df_mech = pivot_and_format_df( - df_mech, - drop_columns=["region", "subregion", "indicator", "dimension", "source", "sex", "age"], - pivot_index=["country", "year"], - pivot_values=["value"], - pivot_columns=["category", "unit_of_measurement"], - ) - - return df_mech diff --git a/etl/steps/archive/garden/homicide/2023-01-27/unodc.countries.json b/etl/steps/archive/garden/homicide/2023-01-27/unodc.countries.json deleted file mode 100644 index 6495c0e889e..00000000000 --- a/etl/steps/archive/garden/homicide/2023-01-27/unodc.countries.json +++ /dev/null @@ -1,213 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "American Samoa": "American Samoa", - "Andorra": "Andorra", - "Angola": "Angola", - "Anguilla": "Anguilla", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia (Plurinational State of)": "Bolivia", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "British Virgin Islands": "British Virgin Islands", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cayman Islands": "Cayman Islands", - "Chile": "Chile", - 
"China": "China", - "Colombia": "Colombia", - "Cook Islands": "Cook Islands", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cura\u00e7ao": "Curacao", - "Cyprus": "Cyprus", - "Czechia": "Czechia", - "Denmark": "Denmark", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Guiana": "French Guiana", - "French Polynesia": "French Polynesia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Gibraltar": "Gibraltar", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guam": "Guam", - "Guatemala": "Guatemala", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Holy See": "Vatican", - "Honduras": "Honduras", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran (Islamic Republic of)": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Isle of Man": "Isle of Man", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Liechtenstein": "Liechtenstein", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Martinique": "Martinique", - "Mauritius": "Mauritius", - "Mayotte": "Mayotte", - "Melanesia": "Melanesia", - "Mexico": "Mexico", - "Micronesia (Federated States of)": "Micronesia (country)", - "Monaco": "Monaco", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Republic of Korea": "South Korea", - "Republic of Moldova": "Moldova", - "Romania": "Romania", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "R\u00e9union": "Reunion", - "Saint Helena": "Saint Helena", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Martin (French Part)": "Saint Martin (French part)", - "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "San Marino": "San Marino", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Slovakia": "Slovakia", - 
"Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "South Africa": "South Africa", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "State of Palestine": "Palestine", - "Suriname": "Suriname", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syrian Arab Republic": "Syria", - "Tajikistan": "Tajikistan", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkmenistan": "Turkmenistan", - "Turks and Caicos Islands": "Turks and Caicos Islands", - "Tuvalu": "Tuvalu", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", - "United Republic of Tanzania": "Tanzania", - "United States Virgin Islands": "United States Virgin Islands", - "United States of America": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela (Bolivarian Republic of)": "Venezuela", - "Viet Nam": "Vietnam", - "World": "World", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "China, Hong Kong Special Administrative Region": "Hong Kong", - "China, Macao Special Administrative Region": "Macao", - "China, Taiwan Province of China": "Taiwan", - "Kosovo under UNSCR 1244": "Kosovo", - "T\u00fcrkiye": "Turkey", - "All Africa": "Africa (UN)", - "All Americas": "Americas (UN)", - "All Asia": "Asia (UN)", - "All Europe": "Europe (UN)", - "All Oceania": "Oceania (UN)", - "Channel Island": "Channel Islands" -} diff --git a/etl/steps/archive/garden/homicide/2023-01-27/unodc.excluded_countries.json b/etl/steps/archive/garden/homicide/2023-01-27/unodc.excluded_countries.json deleted file mode 100644 index 0f50eedb994..00000000000 --- a/etl/steps/archive/garden/homicide/2023-01-27/unodc.excluded_countries.json +++ /dev/null @@ -1,21 +0,0 @@ -[ - "Australia and New Zealand", - "Central Asia", - "Eastern Asia", - "Eastern Europe", - "Iraq (Central Iraq)", - "Iraq (Kurdistan Region)", - "Latin America and the Caribbean", - "Northern Africa", - "Northern America", - "Northern Europe", - "South-eastern Asia", - "Southern Asia", - "Southern Europe", - "Sub-Saharan Africa", - "United Kingdom (England and Wales)", - "United Kingdom (Northern Ireland)", - "United Kingdom (Scotland)", - "Western Asia", - "Western Europe" -] \ No newline at end of file diff --git a/etl/steps/archive/garden/homicide/2023-01-27/unodc.meta.yml b/etl/steps/archive/garden/homicide/2023-01-27/unodc.meta.yml deleted file mode 100644 index a31d49ab98c..00000000000 --- a/etl/steps/archive/garden/homicide/2023-01-27/unodc.meta.yml +++ /dev/null @@ -1,48 +0,0 @@ -tables: - share: - variables: - share_of_homicides_of_both_sexes_where_the_perpertrator_is_an_intimate_partner: - title: Share of total homicides where the perpertrator is an intimate partner - unit: '%' - display: - numDecimalPlaces: 2 - share_of_homicides_of_both_sexes_where_the_perpertrator_is_a_family_member: - title: Share of total homicides where the perpertrator is a family member - unit: '%' - display: - numDecimalPlaces: 2 - share_of_homicides_of_both_sexes_where_the_perpertrator_is_unknown: - title: Share of total homicides where the perpertrator is unknown - unit: '%' - display: - numDecimalPlaces: 2 - share_of_homicides_of_female_where_the_perpertrator_is_an_intimate_partner: - title: Share of homicides where the victim is female and the perpertrator is an intimate 
partner - unit: '%' - display: - numDecimalPlaces: 2 - share_of_homicides_of_female_where_the_perpertrator_is_a_family_member: - title: Share of homicides where the victim is female and the perpertrator is a family member - unit: '%' - display: - numDecimalPlaces: 2 - share_of_homicides_of_female_where_the_perpertrator_is_unknown: - title: Share of homicides where the victim is female and perpertrator is unknown - unit: '%' - display: - numDecimalPlaces: 2 - share_of_homicides_of_male_where_the_perpertrator_is_an_intimate_partner: - title: Share of homicides where the victim is male and the perpertrator is an intimate partner - unit: '%' - display: - numDecimalPlaces: 2 - share_of_homicides_of_male_where_the_perpertrator_is_a_family_member: - title: Share of homicides where the victim is male and the perpertrator is a family member - unit: '%' - display: - numDecimalPlaces: 2 - share_of_homicides_of_male_where_the_perpertrator_is_unknown: - title: Share of homicides where the victim is male and perpertrator is unknown - unit: '%' - display: - numDecimalPlaces: 2 diff --git a/etl/steps/archive/garden/homicide/2023-01-27/unodc.py b/etl/steps/archive/garden/homicide/2023-01-27/unodc.py deleted file mode 100644 index bb86622285c..00000000000 --- a/etl/steps/archive/garden/homicide/2023-01-27/unodc.py +++ /dev/null @@ -1,263 +0,0 @@ -import json -from typing import List, cast - -import numpy as np -import pandas as pd -from owid.catalog import Table, VariableMeta -from owid.catalog.utils import underscore -from structlog import get_logger - -from etl.data_helpers import geo -from etl.helpers import PathFinder, create_dataset - -log = get_logger() - -# naming conventions -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - log.info("unodc.start") - - # read dataset from meadow - ds_meadow = paths.meadow_dataset - tb_meadow = ds_meadow["unodc"] - - df = pd.DataFrame(tb_meadow) - - log.info("unodc.exclude_countries") - df = exclude_countries(df) - - log.info("unodc.harmonize_countries") - df = harmonize_countries(df) - - df = clean_up_categories(df) - tb_garden_list = clean_data(df) - - # create new dataset with the same metadata as meadow - ds_garden = create_dataset(dest_dir, tables=tb_garden_list, default_metadata=ds_meadow.metadata) - ds_garden.save() - - log.info("unodc.end") - - -def load_excluded_countries() -> List[str]: - with open(paths.excluded_countries_path, "r") as f: - data = json.load(f) - assert isinstance(data, list) - return data - - -def exclude_countries(df: pd.DataFrame) -> pd.DataFrame: - excluded_countries = load_excluded_countries() - return cast(pd.DataFrame, df.loc[~df.country.isin(excluded_countries)]) - - -def harmonize_countries(df: pd.DataFrame) -> pd.DataFrame: - unharmonized_countries = df["country"] - df = geo.harmonize_countries(df=df, countries_file=str(paths.country_mapping_path)) - - missing_countries = set(unharmonized_countries[df.country.isnull()]) - if any(missing_countries): - raise RuntimeError( - "The following raw country names have not been harmonized. " - f"Please: (a) edit {paths.country_mapping_path} to include these country " - f"names; or (b) add them to {paths.excluded_countries_path}." 
- f"Raw country names: {missing_countries}" - ) - - return df - - -def clean_data(df: pd.DataFrame) -> list[Table]: - """ - Splitting the data into four dataframes/tables based on the dimension columns: - * Total - * by mechanism - * by relationship to perpertrator - * by situational context - """ - df["age"] = df["age"].map({"Total": "All ages"}, na_action="ignore").fillna(df["age"]) - df["sex"] = df["sex"].map({"Total": "Both sexes"}, na_action="ignore").fillna(df["sex"]) - - tb_mech = create_table(df, table_name="by mechanisms") - tb_perp = create_table(df, table_name="by relationship to perpetrator") - tb_situ = create_table(df, table_name="by situational context") - tb_tot = create_total_table(df) - - tb_share = calculate_share_of_homicides(tb_tot, tb_perp) - tb_share.update_metadata_from_yaml(paths.metadata_path, "share") - tb_garden_list = [tb_mech, tb_tot, tb_perp, tb_situ, tb_share] - - return tb_garden_list - - -def create_table(df: pd.DataFrame, table_name: str) -> Table: - """ - Create the homicides by mechanism dataframe where we will have homicides/homicide rate - disaggregated by mechanism (e.g. weapon) - - """ - assert any(df["dimension"] == table_name), "table_name must be a dimension in df" - df_filter = df[df["dimension"] == table_name] - - # Make the table wider so we have a column for each mechanism - tb_filter = pivot_and_format_table( - df_filter, - drop_columns=["region", "subregion", "indicator", "dimension", "source"], - pivot_index=["country", "year"], - pivot_values=["value"], - pivot_columns=["unit_of_measurement", "category", "sex", "age"], - table_name=table_name, - ) - - return tb_filter - - -def create_total_table(df: pd.DataFrame) -> Table: - """ - Create the total homicides dataframe where we will have total homicides/homicide rate - disaggregated by age and sex - """ - df_tot = df[df["dimension"] == "Total"] - # To escape the dataframe slice warnings - df_tot = df_tot.copy(deep=True) - # There are some duplicates when sex is unknown so let's remove those rows - df_tot = df_tot[df_tot["sex"] != "Unknown"] - - # Make it more obvious what total age and total sex means - - df_tot = pivot_and_format_table( - df_tot, - drop_columns=["region", "subregion", "indicator", "dimension", "category", "source"], - pivot_index=["country", "year"], - pivot_values=["value"], - pivot_columns=["unit_of_measurement", "sex", "age"], - table_name="Total", - ) - df_tot = df_tot.dropna(how="all", axis=1) - - return df_tot - - -def pivot_and_format_table(df_piv, drop_columns, pivot_index, pivot_values, pivot_columns, table_name) -> Table: - """ - - Dropping a selection of columns - - Pivoting by the desired disaggregations e.g. 
category, unit of measurement - - Tidying the column names - """ - df_piv = df_piv.drop(columns=drop_columns) - df_piv = df_piv.pivot(index=pivot_index, columns=pivot_columns, values=pivot_values) - - df_piv.columns = df_piv.columns.droplevel(0) - tb_garden = Table(short_name=underscore(table_name)) - for col in df_piv.columns: - col_metadata = build_metadata(col, table_name=table_name) - new_col = underscore(" ".join(col).strip()) - tb_garden[new_col] = df_piv[col] - tb_garden[new_col].metadata = col_metadata - - return tb_garden - - -def build_metadata(col: tuple, table_name: str) -> VariableMeta: - """ - Building the variable level metadata for each of the age-sex-metric combinations - """ - metric_dict = { - "Counts": { - "title": "Number of homicides", - "unit": "homicides", - "short_unit": "", - "numDecimalPlaces": 0, - }, - "Rate per 100,000 population": { - "title": "Homicide rate per 100,000 population", - "unit": "homicides per 100,000 people", - "short_unit": "", - "numDecimalPlaces": 2, - }, - } - - if table_name == "by mechanisms": - title = f"{metric_dict[col[0]]['title']} - {col[1]}" - description = ( - f"The {metric_dict[col[0]]['title'].lower()}, where the homicide was carried out using {col[1].lower()}." - ) - elif table_name == "Total": - title = f"{metric_dict[col[0]]['title']} - {col[1]} - {col[2]}" - description = f"The {metric_dict[col[0]]['title'].lower()} recorded in a year." - elif table_name == "by relationship to perpetrator": - title = f"{metric_dict[col[0]]['title']} - {col[1]} - {col[2]} - {col[3]}" - description = f"The {metric_dict[col[0]]['title'].lower()} shown by the victims relationship to the perpertrator. The age and sex characteristics relate to that of the victim, rather than the perpertrator." - elif table_name == "by situational context": - title = f"{metric_dict[col[0]]['title']} - {col[1]} - {col[2]} - {col[3]}" - description = f"The {metric_dict[col[0]]['title'].lower()} shown by the situational context of the homicide." - else: - title = "" - description = "" - meta = VariableMeta( - title=title, - description=description, - unit=f"{metric_dict[col[0]]['unit']}", - short_unit=f"{metric_dict[col[0]]['short_unit']}", - ) - meta.display = { - "numDecimalPlaces": metric_dict[col[0]]["numDecimalPlaces"], - } - return meta - - -def clean_up_categories(df: pd.DataFrame) -> pd.DataFrame: - """ - Make the categories used in the dataset a bit more readable. 
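# (Editor's note: annotation, not part of the deleted step.) A minimal,
# self-contained sketch of the pivot-and-underscore pattern that
# pivot_and_format_table and build_metadata above implement, using a toy frame
# with the same shape as the UNODC data; the numbers are illustrative only:
import pandas as pd
from owid.catalog.utils import underscore

toy = pd.DataFrame(
    {
        "country": ["France", "France"],
        "year": [2020, 2020],
        "unit_of_measurement": ["Counts", "Rate per 100,000 population"],
        "sex": ["Both sexes", "Both sexes"],
        "age": ["All ages", "All ages"],
        "value": [693, 1.02],
    }
)
wide = toy.pivot(
    index=["country", "year"],
    columns=["unit_of_measurement", "sex", "age"],
    values=["value"],
)
# Drop the redundant "value" level, then flatten each column tuple, e.g.
# ("Counts", "Both sexes", "All ages") -> "counts_both_sexes_all_ages"; that
# tuple is also what build_metadata receives to compose titles and units.
wide.columns = wide.columns.droplevel(0)
wide.columns = [underscore(" ".join(col).strip()) for col in wide.columns]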
- - """ - category_dict = { - "Firearms or explosives - firearms": "firearms", - "Another weapon - sharp object": "a sharp object", - "Unspecified means": "unspecified means", - "Without a weapon/ other Mechanism": " without a weapon or by another mechanism", - "Firearms or explosives": "firearms or explosives", - "Another weapon": "sharp or blunt object, including motor vehicles", - "Intimate partner or family member": "Perpertrator is an intimate partner or family member", - "Intimate partner or family member: Intimate partner": "Perpertrator is an intimate partner", - "Intimate partner or family member: Family member": "Perpertrator is a family member", - "Other Perpetrator known to the victim": "Another known perpetrator", - "Perpetrator unknown": "Perpertrator is unknown", - "Relationship to perpetrator is not known": "Perpertrator where the relationship to the victim is not known", - "Socio-political homicide - terrorist offences": "Terrorist offences", - "Unknown types of homicide": "Unknown situational context", - } - df = df.replace({"category": category_dict}) - - assert df["category"].isna().sum() == 0 - return df - - -def calculate_share_of_homicides(total_table: Table, perp_table: Table) -> Table: - """ - Calculate the share of total homicides where: - - * The perpertrator is an intimate partner - * The perpertrator is a family member - * The perpertrator is unknown - """ - merge_table = pd.merge(total_table, perp_table, on=["country", "year"]) - - sexes = ["both_sexes", "female", "male"] - perpertrators = [ - "perpertrator_is_an_intimate_partner", - "perpertrator_is_a_family_member", - "perpertrator_is_unknown", - ] - share_df = pd.DataFrame() - for sex in sexes: - sex_select = f"counts_{sex}_all_ages" - for perp in perpertrators: - perp_select = f"counts_{perp}_{sex}_all_ages" - new_col = underscore(f"Share of homicides of {sex} where the {perp}") - share_df[new_col] = (merge_table[perp_select] / merge_table[sex_select]) * 100 - share_df[new_col] = share_df[new_col].replace(np.inf, np.nan) - share_table = Table(share_df, short_name="share") - return share_table diff --git a/etl/steps/archive/garden/ihme_gbd/2023-03-29/gbd_drug_disorders.countries.json b/etl/steps/archive/garden/ihme_gbd/2023-03-29/gbd_drug_disorders.countries.json deleted file mode 100644 index d843d289d3b..00000000000 --- a/etl/steps/archive/garden/ihme_gbd/2023-03-29/gbd_drug_disorders.countries.json +++ /dev/null @@ -1,224 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "American Samoa": "American Samoa", - "Andorra": "Andorra", - "Angola": "Angola", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia (Plurinational State of)": "Bolivia", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": 
"Colombia", - "Comoros": "Comoros", - "Congo": "Congo", - "Cook Islands": "Cook Islands", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cyprus": "Cyprus", - "Czechia": "Czechia", - "C\u00f4te d'Ivoire": "Cote d'Ivoire", - "Democratic People's Republic of Korea": "North Korea", - "Democratic Republic of the Congo": "Democratic Republic of Congo", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "Gabon": "Gabon", - "Gambia": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Global": "World", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guam": "Guam", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran (Islamic Republic of)": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Lao People's Democratic Republic": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mexico": "Mexico", - "Micronesia (Federated States of)": "Micronesia (country)", - "Monaco": "Monaco", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "North America": "North America (WB)", - "North Macedonia": "North Macedonia", - "Northern Mariana Islands": "Northern Mariana Islands", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Palestine": "Palestine", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Republic of Korea": "South Korea", - "Republic of Moldova": "Moldova", - "Romania": "Romania", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "San Marino": "San Marino", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - 
"Singapore": "Singapore", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syrian Arab Republic": "Syria", - "Taiwan (Province of China)": "Taiwan", - "Tajikistan": "Tajikistan", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tokelau": "Tokelau", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Tuvalu": "Tuvalu", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United Republic of Tanzania": "Tanzania", - "United States Virgin Islands": "United States Virgin Islands", - "United States of America": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela (Bolivarian Republic of)": "Venezuela", - "Viet Nam": "Vietnam", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "African Region": "African Region (WHO)", - "East Asia & Pacific - WB": "East Asia & Pacific (WB)", - "Eastern Mediterranean Region": "Eastern Mediterranean Region (WHO)", - "Europe & Central Asia - WB": "Europe & Central Asia", - "European Region": "European Region (WHO)", - "Latin America & Caribbean - WB": "Latin America & Caribbean", - "Middle East & North Africa - WB": "Middle East & North Africa (WB)", - "Region of the Americas": "Region of the Americas (WHO)", - "South Asia - WB": "South Asia (WB)", - "South-East Asia Region": "South-East Asia Region (WHO)", - "Sub-Saharan Africa - WB": "Sub-Saharan Africa (WB)", - "Western Pacific Region": "Western Pacific Region (WHO)", - "World Bank High Income": "World Bank High Income", - "World Bank Low Income": "World Bank Low Income", - "World Bank Lower Middle Income": "World Bank Lower Middle Income", - "World Bank Upper Middle Income": "World Bank Upper Middle Income" -} diff --git a/etl/steps/archive/garden/ihme_gbd/2023-03-29/gbd_drug_disorders.meta.yml b/etl/steps/archive/garden/ihme_gbd/2023-03-29/gbd_drug_disorders.meta.yml deleted file mode 100644 index ee97b2d0cb1..00000000000 --- a/etl/steps/archive/garden/ihme_gbd/2023-03-29/gbd_drug_disorders.meta.yml +++ /dev/null @@ -1,15 +0,0 @@ -dataset: - title: Drug Use Disorders - Global Burden of Disease Study 2019 (GBD 2019) - description: 'This aggregate cause incorporates death and disability resulting from opioid use disorder, amphetamine use disorder, cocaine use disorder, cannabis use disorder, and a residual category of other drug use disorders.' 
- licenses: - - name: Free-of-Charge Non-Commercial-User Agreement - url: https://www.healthdata.org/data-tools-practices/data-practices/ihme-free-charge-non-commercial-user-agreement - sources: - - name: IHME, Global Burden of Disease (2019) - - url: https://vizhub.healthdata.org/gbd-results/ -tables: - gbd_drug_disorders: - variables: - val: - title: The annual number of deaths from drug use disorders per 100,000 people aged 15- to 49-years for both sexes - unit: 'deaths per 100,000 population' \ No newline at end of file diff --git a/etl/steps/archive/garden/ihme_gbd/2023-03-29/gbd_drug_disorders.py b/etl/steps/archive/garden/ihme_gbd/2023-03-29/gbd_drug_disorders.py deleted file mode 100644 index ff460f6d9c2..00000000000 --- a/etl/steps/archive/garden/ihme_gbd/2023-03-29/gbd_drug_disorders.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Load a meadow dataset and create a garden dataset.""" - -import pandas as pd -from owid.catalog import Dataset, Table -from structlog import get_logger - -from etl.data_helpers import geo -from etl.helpers import PathFinder, create_dataset - -log = get_logger() - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - log.info("gbd_drug_disorders.start") - - # - # Load inputs. - # - # Load meadow dataset. - ds_meadow: Dataset = paths.load_dependency("gbd_drug_disorders") - - # Read table from meadow dataset. - tb_meadow = ds_meadow["gbd_drug_disorders"] - - # Create a dataframe with data from the table. - df = pd.DataFrame(tb_meadow) - - # - # Process data. - # - log.info("gbd_drug_disorders.harmonize_countries") - df = geo.harmonize_countries( - df=df, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path - ) - - # Create a new table with the processed data. - tb_garden = Table(df, like=tb_meadow) - - # - # Save outputs. - # - # Create a new garden dataset with the same metadata as the meadow dataset. - ds_garden = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_meadow.metadata) - - # Save changes in the new garden dataset. 
- ds_garden.save() - - log.info("gbd_drug_disorders.end") diff --git a/etl/steps/archive/garden/irena/2022-10-20/renewable_electricity_capacity.countries.json b/etl/steps/archive/garden/irena/2022-10-20/renewable_electricity_capacity.countries.json deleted file mode 100644 index b683f347c60..00000000000 --- a/etl/steps/archive/garden/irena/2022-10-20/renewable_electricity_capacity.countries.json +++ /dev/null @@ -1,237 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Africa": "Africa (IRENA)", - "Albania": "Albania", - "Algeria": "Algeria", - "Andorra": "Andorra", - "Angola": "Angola", - "Anguilla": "Anguilla", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Asia": "Asia (IRENA)", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Curacao": "Curacao", - "Cyprus": "Cyprus", - "Czechia": "Czechia", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Europe": "Europe (IRENA)", - "Faroe Islands": "Faroe Islands", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "Gabon": "Gabon", - "Gambia": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guam": "Guam", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Lao PDR": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mayotte": "Mayotte", - "Mexico": "Mexico", - "Micronesia": "Micronesia (country)", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "North Macedonia": "North 
Macedonia", - "Norway": "Norway", - "Oceania": "Oceania (IRENA)", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Palestine": "Palestine", - "Panama": "Panama", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Reunion": "Reunion", - "Romania": "Romania", - "Rwanda": "Rwanda", - "Samoa": "Samoa", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Tajikistan": "Tajikistan", - "Tanzania": "Tanzania", - "Thailand": "Thailand", - "Togo": "Togo", - "Tokelau": "Tokelau", - "Tonga": "Tonga", - "Tunisia": "Tunisia", - "Turkmenistan": "Turkmenistan", - "Tuvalu": "Tuvalu", - "USA": "United States", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela": "Venezuela", - "Viet Nam": "Vietnam", - "World": "World", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "Amer Samoa": "American Samoa", - "Antigua Barb": "Antigua and Barbuda", - "BES Islands": "Bonaire Sint Eustatius and Saba", - "Bosnia Herzg": "Bosnia and Herzegovina", - "Br Virgin Is": "British Virgin Islands", - "Brunei Darsm": "Brunei", - "C America + Carib": "Central America and the Caribbean (IRENA)", - "Cayman Is": "Cayman Islands", - "Cent Afr Rep": "Central African Republic", - "China HK SAR": "Hong Kong", - "Chinese Taipei": "Taiwan", - "Congo DR": "Democratic Republic of Congo", - "Congo Rep": "Congo", - "Cook Is": "Cook Islands", - "Cote d Ivoire": "Cote d'Ivoire", - "Dominican Rep": "Dominican Republic", - "EU 27": "European Union (27)", - "Eq Guinea": "Equatorial Guinea", - "Eurasia": "Eurasia (IRENA)", - "European Union": "European Union (IRENA)", - "Falklands Malv": "Falkland Islands", - "Fr Guiana": "French Guiana", - "Fr Polynesia": "French Polynesia", - "Guinea Bissau": "Guinea-Bissau", - "Iran IR": "Iran", - "Korea DPR": "North Korea", - "Korea Rep": "South Korea", - "Kosovo*": "Kosovo", - "Marshall Is": "Marshall Islands", - "Middle East": "Middle East (IRENA)", - "Moldova Rep": "Moldova", - "N America": "North America (IRENA)", - "New Caledon": "New Caledonia", - "Papua N Guin": "Papua New Guinea", - "Russian Fed": "Russia", - "S America": "South America (IRENA)", - "Sao Tome Prn": "Sao Tome and Principe", - "Solomon Is": "Solomon Islands", - "South Georgia": "South Georgia and the South Sandwich Islands", - "St Barth": "Saint Barthelemy", - "St Kitts Nevis": "Saint Kitts and Nevis", - "St Lucia": "Saint Lucia", - "St Martin": "Saint Martin (French part)", - "St Pierre Mq": "Saint Pierre and Miquelon", - "St Vincent Gren": "Saint Vincent and the Grenadines", - "Syrian AR": "Syria", - "Timor Leste": "East Timor", - "Trinidad Tobago": "Trinidad and Tobago", - "Turks Caicos": "Turks and Caicos Islands", - "T\u00fcrkiye": "Turkey", - "UK": "United Kingdom", - "US Virgin Is": "United States Virgin Islands", - "United Arab Em": "United Arab Emirates" -} diff --git a/etl/steps/archive/garden/irena/2022-10-20/renewable_electricity_capacity.meta.yml 
b/etl/steps/archive/garden/irena/2022-10-20/renewable_electricity_capacity.meta.yml deleted file mode 100644 index 62d3e9cb140..00000000000 --- a/etl/steps/archive/garden/irena/2022-10-20/renewable_electricity_capacity.meta.yml +++ /dev/null @@ -1,118 +0,0 @@ -dataset: - short_name: renewable_electricity_capacity - description: | - IRENA publishes detailed statistics on renewable energy capacity, power generation and renewable energy balances. This data is collected directly from members using the IRENA Renewable Energy Statistics questionnaire and is also supplemented by desk research where official statistics are not available. - - Pumped storage is included under the "Hydropower" category but not in the "Total renewable energy". Generation from mixed plants is split between hydropower and pumped storage as appropriate. - -tables: - renewable_electricity_capacity: - variables: - bagasse: - title: Bagasse - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - bioenergy: - title: Bioenergy - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - biogas: - title: Biogas - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - concentrated_solar_power: - title: Concentrated solar power - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - geothermal_energy: - title: Geothermal energy - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - hydropower: - title: Hydropower - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - liquid_biofuels: - title: Liquid biofuels - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - marine_energy: - title: Marine energy - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - offshore_wind_energy: - title: Offshore wind energy - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - onshore_wind_energy: - title: Onshore wind energy - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - other_solid_biofuels: - title: Other solid biofuels - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - pure_pumped_storage: - title: Pure pumped storage - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - renewable_hydropower_including_mixed_plants: - title: Renewable hydropower including mixed plants - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - solar_energy: - title: Solar energy - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - solar_photovoltaic: - title: Solar photovoltaic - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - solid_biofuels_and_renewable_waste: - title: Solid biofuels and renewable waste - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - total_renewable_energy: - title: Total renewable energy - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 - wind_energy: - title: Wind energy - short_unit: MW - unit: Megawatts - display: - numDecimalPlaces: 0 diff --git a/etl/steps/archive/garden/irena/2022-10-20/renewable_electricity_capacity.py b/etl/steps/archive/garden/irena/2022-10-20/renewable_electricity_capacity.py deleted file mode 100644 index 75093d30818..00000000000 --- a/etl/steps/archive/garden/irena/2022-10-20/renewable_electricity_capacity.py +++ /dev/null @@ -1,67 +0,0 @@ -"""Create a dataset of renewable electricity capacity using IRENA's Renewable Electricity Capacity and Generation. 
- -""" - -import pandas as pd -from owid.catalog import Dataset, Table -from owid.catalog.utils import underscore_table - -from etl.data_helpers import geo -from etl.paths import DATA_DIR, STEP_DIR - -# Details of inputs. -MEADOW_DATASET_NAME = "renewable_electricity_capacity_and_generation" -MEADOW_VERSION = "2022-10-20" -MEADOW_DATASET_PATH = DATA_DIR / f"meadow/irena/{MEADOW_VERSION}/{MEADOW_DATASET_NAME}" -# Details of outputs. -DATASET_NAME = "renewable_electricity_capacity" -TABLE_NAME = DATASET_NAME -VERSION = MEADOW_VERSION -COUNTRIES_PATH = STEP_DIR / f"data/garden/irena/{VERSION}/{DATASET_NAME}.countries.json" -METADATA_PATH = STEP_DIR / f"data/garden/irena/{VERSION}/{DATASET_NAME}.meta.yml" - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load dataset from Meadow. - ds_meadow = Dataset(MEADOW_DATASET_PATH) - # Load main table from dataset. - tb_meadow = ds_meadow[ds_meadow.table_names[0]] - # Create a dataframe out of the main table. - df = pd.DataFrame(tb_meadow).reset_index() - - # - # Process data. - # - # Harmonize country names. - df = geo.harmonize_countries(df=df, countries_file=COUNTRIES_PATH) - - # Reshape dataframe to have each technology as a separate column, and sort conveniently. - df = ( - df.pivot(index=["country", "year"], columns=["technology"], values="capacity") - .rename_axis(None, axis=1) - .sort_index() - .sort_index(axis=1) - ) - - # For convenience, remove parentheses from column names. - df = df.rename(columns={column: column.replace("(", "").replace(")", "") for column in df.columns}) - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = Dataset.create_empty(dest_dir) - ds_garden.metadata = ds_meadow.metadata - # Ensure all columns are snake, lower case. - tb_garden = underscore_table(Table(df)) - tb_garden.short_name = TABLE_NAME - # Load dataset's metadata from yaml file. - ds_garden.metadata.update_from_yaml(METADATA_PATH) - # Load main table's metadata from yaml file. - tb_garden.update_metadata_from_yaml(METADATA_PATH, TABLE_NAME) - # Add table to dataset and save dataset. 
- ds_garden.add(tb_garden) - ds_garden.save() diff --git a/etl/steps/archive/garden/irena/2022-10-20/renewable_power_generation_costs.countries.json b/etl/steps/archive/garden/irena/2022-10-20/renewable_power_generation_costs.countries.json deleted file mode 100644 index fda7f074228..00000000000 --- a/etl/steps/archive/garden/irena/2022-10-20/renewable_power_generation_costs.countries.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "Australia": "Australia", - "Brazil": "Brazil", - "Canada": "Canada", - "China": "China", - "Denmark": "Denmark", - "France": "France", - "Germany": "Germany", - "India": "India", - "Italy": "Italy", - "Japan": "Japan", - "Mexico": "Mexico", - "Netherlands": "Netherlands", - "Republic of Korea": "South Korea", - "Spain": "Spain", - "Sweden": "Sweden", - "Ukraine": "Ukraine", - "United Kingdom": "United Kingdom", - "United States": "United States", - "Viet Nam": "Vietnam", - "World": "World", - "T\u00fcrkiye": "Turkey" -} \ No newline at end of file diff --git a/etl/steps/archive/garden/irena/2022-10-20/renewable_power_generation_costs.meta.yml b/etl/steps/archive/garden/irena/2022-10-20/renewable_power_generation_costs.meta.yml deleted file mode 100644 index f3511dbd1f0..00000000000 --- a/etl/steps/archive/garden/irena/2022-10-20/renewable_power_generation_costs.meta.yml +++ /dev/null @@ -1,45 +0,0 @@ -dataset: - namespace: irena - title: Renewable power generation costs - short_name: renewable_power_generation_costs - version: 2022-10-20 - description: | - Levelized cost of energy (LCOE) estimates the average cost per unit of energy generated across the lifetime of a new power plant. It is measured in 2021 US$ per kilowatt-hour. - sources: - - - name: International Renewable Energy Agency (IRENA) (2022) - published_by: © 2022 by International Renewable Energy Agency (IRENA) - url: https://irena.org/publications/2022/Jul/Renewable-Power-Generation-Costs-in-2021 - date_accessed: 2022-10-20 - -tables: - renewable_power_generation_costs: - variables: - bioenergy: - title: Bioenergy levelized cost of energy - short_unit: $/kWh - unit: $ per kilowatt-hour - concentrated_solar_power: - title: Concentrated solar power levelized cost of energy - short_unit: $/kWh - unit: $ per kilowatt-hour - geothermal: - title: Geothermal levelized cost of energy - short_unit: $/kWh - unit: $ per kilowatt-hour - hydropower: - title: Hydropower levelized cost of energy - short_unit: $/kWh - unit: $ per kilowatt-hour - offshore_wind: - title: Offshore wind levelized cost of energy - short_unit: $/kWh - unit: $ per kilowatt-hour - onshore_wind: - title: Onshore wind levelized cost of energy - short_unit: $/kWh - unit: $ per kilowatt-hour - solar_photovoltaic: - title: Solar photovoltaic levelized cost of energy - short_unit: $/kWh - unit: $ per kilowatt-hour diff --git a/etl/steps/archive/garden/irena/2022-10-20/renewable_power_generation_costs.py b/etl/steps/archive/garden/irena/2022-10-20/renewable_power_generation_costs.py deleted file mode 100644 index 63a3a3e4e17..00000000000 --- a/etl/steps/archive/garden/irena/2022-10-20/renewable_power_generation_costs.py +++ /dev/null @@ -1,42 +0,0 @@ -import pandas as pd -from owid.catalog import Dataset, Table -from owid.catalog.utils import underscore_table - -from etl.data_helpers import geo -from etl.helpers import PathFinder - -# Get naming conventions. -N = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load dataset from Meadow. - ds_meadow = N.meadow_dataset - # Load main table from dataset. 
- tb_meadow = ds_meadow[ds_meadow.table_names[0]] - - # Create a dataframe out of the table. - df = pd.DataFrame(tb_meadow).reset_index() - - # Harmonize country names. - df = geo.harmonize_countries(df=df, countries_file=N.country_mapping_path) - - # - # Save outputs. - # - # Create a new Garden dataset. - ds_garden = Dataset.create_empty(dest_dir) - - # Ensure all columns are snake, lower case. - tb_garden = underscore_table(Table(df)) - - # Load metadata from yaml file. - ds_garden.metadata.update_from_yaml(N.metadata_path, if_source_exists="append") - tb_garden.update_metadata_from_yaml(N.metadata_path, ds_meadow.table_names[0]) - - # Add table to dataset and save dataset. - ds_garden.add(tb_garden) - ds_garden.save() diff --git a/etl/steps/archive/garden/irena/2023-01-04/renewable_electricity_capacity.countries.json b/etl/steps/archive/garden/irena/2023-01-04/renewable_electricity_capacity.countries.json deleted file mode 100644 index b683f347c60..00000000000 --- a/etl/steps/archive/garden/irena/2023-01-04/renewable_electricity_capacity.countries.json +++ /dev/null @@ -1,237 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Africa": "Africa (IRENA)", - "Albania": "Albania", - "Algeria": "Algeria", - "Andorra": "Andorra", - "Angola": "Angola", - "Anguilla": "Anguilla", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Asia": "Asia (IRENA)", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Curacao": "Curacao", - "Cyprus": "Cyprus", - "Czechia": "Czechia", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Europe": "Europe (IRENA)", - "Faroe Islands": "Faroe Islands", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "Gabon": "Gabon", - "Gambia": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guam": "Guam", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Lao PDR": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - 
"Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mayotte": "Mayotte", - "Mexico": "Mexico", - "Micronesia": "Micronesia (country)", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "Oceania": "Oceania (IRENA)", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Palestine": "Palestine", - "Panama": "Panama", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Reunion": "Reunion", - "Romania": "Romania", - "Rwanda": "Rwanda", - "Samoa": "Samoa", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Tajikistan": "Tajikistan", - "Tanzania": "Tanzania", - "Thailand": "Thailand", - "Togo": "Togo", - "Tokelau": "Tokelau", - "Tonga": "Tonga", - "Tunisia": "Tunisia", - "Turkmenistan": "Turkmenistan", - "Tuvalu": "Tuvalu", - "USA": "United States", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela": "Venezuela", - "Viet Nam": "Vietnam", - "World": "World", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "Amer Samoa": "American Samoa", - "Antigua Barb": "Antigua and Barbuda", - "BES Islands": "Bonaire Sint Eustatius and Saba", - "Bosnia Herzg": "Bosnia and Herzegovina", - "Br Virgin Is": "British Virgin Islands", - "Brunei Darsm": "Brunei", - "C America + Carib": "Central America and the Caribbean (IRENA)", - "Cayman Is": "Cayman Islands", - "Cent Afr Rep": "Central African Republic", - "China HK SAR": "Hong Kong", - "Chinese Taipei": "Taiwan", - "Congo DR": "Democratic Republic of Congo", - "Congo Rep": "Congo", - "Cook Is": "Cook Islands", - "Cote d Ivoire": "Cote d'Ivoire", - "Dominican Rep": "Dominican Republic", - "EU 27": "European Union (27)", - "Eq Guinea": "Equatorial Guinea", - "Eurasia": "Eurasia (IRENA)", - "European Union": "European Union (IRENA)", - "Falklands Malv": "Falkland Islands", - "Fr Guiana": "French Guiana", - "Fr Polynesia": "French Polynesia", - "Guinea Bissau": "Guinea-Bissau", - "Iran IR": "Iran", - "Korea DPR": "North Korea", - "Korea Rep": "South Korea", - "Kosovo*": "Kosovo", - "Marshall Is": "Marshall Islands", - "Middle East": "Middle East (IRENA)", - "Moldova Rep": "Moldova", - "N America": "North America (IRENA)", - "New Caledon": "New Caledonia", - "Papua N Guin": "Papua New Guinea", - "Russian Fed": "Russia", - "S America": "South America (IRENA)", - "Sao Tome Prn": "Sao Tome and Principe", - "Solomon Is": "Solomon Islands", - "South Georgia": "South Georgia and the South Sandwich Islands", - "St 
Barth": "Saint Barthelemy", - "St Kitts Nevis": "Saint Kitts and Nevis", - "St Lucia": "Saint Lucia", - "St Martin": "Saint Martin (French part)", - "St Pierre Mq": "Saint Pierre and Miquelon", - "St Vincent Gren": "Saint Vincent and the Grenadines", - "Syrian AR": "Syria", - "Timor Leste": "East Timor", - "Trinidad Tobago": "Trinidad and Tobago", - "Turks Caicos": "Turks and Caicos Islands", - "T\u00fcrkiye": "Turkey", - "UK": "United Kingdom", - "US Virgin Is": "United States Virgin Islands", - "United Arab Em": "United Arab Emirates" -} diff --git a/etl/steps/archive/garden/irena/2023-01-04/renewable_electricity_capacity.meta.yml b/etl/steps/archive/garden/irena/2023-01-04/renewable_electricity_capacity.meta.yml deleted file mode 100644 index 1d72bddd673..00000000000 --- a/etl/steps/archive/garden/irena/2023-01-04/renewable_electricity_capacity.meta.yml +++ /dev/null @@ -1,146 +0,0 @@ -dataset: - short_name: renewable_electricity_capacity - description: | - IRENA publishes detailed statistics on renewable energy capacity, power generation and renewable energy balances. This data is collected directly from members using the IRENA Renewable Energy Statistics questionnaire and is also supplemented by desk research where official statistics are not available. - - Some technologies include others, following this schema: - - Total renewable energy - 1. Hydropower - 1.1 Renewable hydropower - 1.2 Pumped storage * - 2. Marine - 3. Wind - 3.1 Onshore wind energy - 3.2 Offshore wind energy - 4. Solar - 4.1 Solar photovoltaic - 4.2 Concentrated solar power - 5. Bioenergy - 5.1 Solid biofuels - 5.1.1 Bagasse - 5.1.2 Renewable municipal waste - 5.1.3 Other solid biofuels - 5.2 Liquid biofuels - 5.3 Biogas - 6. Geothermal - - * Pumped storage is included under the "Hydropower" category but not in the "Total renewable energy". Generation from mixed plants is split between hydropower and pumped storage as appropriate. 
- -tables: - renewable_electricity_capacity: - variables: - bagasse: - title: Bagasse - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - bioenergy: - title: Bioenergy - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - biogas: - title: Biogas - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - concentrated_solar_power: - title: Concentrated solar power - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - geothermal_energy: - title: Geothermal energy - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - hydropower: - title: Hydropower - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - liquid_biofuels: - title: Liquid biofuels - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - marine_energy: - title: Marine energy - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - offshore_wind_energy: - title: Offshore wind energy - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - onshore_wind_energy: - title: Onshore wind energy - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - other_solid_biofuels: - title: Other solid biofuels - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - pure_pumped_storage: - title: Pure pumped storage - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - renewable_hydropower_including_mixed_plants: - title: Renewable hydropower including mixed plants - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - renewable_municipal_waste: - title: Renewable municipal waste - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - solar_energy: - title: Solar energy - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - solar_photovoltaic: - title: Solar photovoltaic - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - solid_biofuels_and_renewable_waste: - title: Solid biofuels and renewable waste - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - total_renewable_energy: - title: Total renewable energy - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 - wind_energy: - title: Wind energy - short_unit: MW - unit: megawatts - display: - numDecimalPlaces: 0 diff --git a/etl/steps/archive/garden/irena/2023-01-04/renewable_electricity_capacity.py b/etl/steps/archive/garden/irena/2023-01-04/renewable_electricity_capacity.py deleted file mode 100644 index 816dfdf580a..00000000000 --- a/etl/steps/archive/garden/irena/2023-01-04/renewable_electricity_capacity.py +++ /dev/null @@ -1,58 +0,0 @@ -"""Create a dataset of renewable electricity capacity using IRENA's Renewable Electricity Capacity and Generation. - -""" - -import pandas as pd -from owid import catalog - -from etl.data_helpers import geo -from etl.helpers import PathFinder - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load dataset from Meadow. - ds_meadow: catalog.Dataset = paths.load_dependency("renewable_electricity_capacity_and_generation") - # Load main table from dataset. - tb_meadow = ds_meadow["renewable_electricity_capacity_and_generation"] - # Create a dataframe out of the main table. - df = pd.DataFrame(tb_meadow).reset_index() - - # - # Process data. - # - # Harmonize country names. 
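# (Editor's note: annotation, not part of the deleted step.) geo.harmonize_countries
# maps raw IRENA spellings to OWID names using the adjacent .countries.json shown
# earlier in this diff, e.g. "USA" -> "United States", "Lao PDR" -> "Laos",
# "Chinese Taipei" -> "Taiwan".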
- df = geo.harmonize_countries(df=df, countries_file=paths.country_mapping_path) - - # Reshape dataframe to have each technology as a separate column, and sort conveniently. - df = ( - df.pivot(index=["country", "year"], columns=["technology"], values="capacity") - .rename_axis(None, axis=1) - .sort_index() - .sort_index(axis=1) - ) - - # For convenience, remove parentheses from column names. - df = df.rename(columns={column: column.replace("(", "").replace(")", "") for column in df.columns}) - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = catalog.Dataset.create_empty(dest_dir, metadata=ds_meadow.metadata) - ds_garden.metadata.version = paths.version - - # Create a new table. - tb_garden = catalog.Table(df, underscore=True, short_name=paths.short_name) - - # Add table to dataset. - ds_garden.add(tb_garden) - - # Update metadata and save dataset. - ds_garden.update_metadata(paths.metadata_path) - ds_garden.save() diff --git a/etl/steps/archive/garden/irena/2023-01-04/renewable_power_generation_costs.countries.json b/etl/steps/archive/garden/irena/2023-01-04/renewable_power_generation_costs.countries.json deleted file mode 100644 index fda7f074228..00000000000 --- a/etl/steps/archive/garden/irena/2023-01-04/renewable_power_generation_costs.countries.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "Australia": "Australia", - "Brazil": "Brazil", - "Canada": "Canada", - "China": "China", - "Denmark": "Denmark", - "France": "France", - "Germany": "Germany", - "India": "India", - "Italy": "Italy", - "Japan": "Japan", - "Mexico": "Mexico", - "Netherlands": "Netherlands", - "Republic of Korea": "South Korea", - "Spain": "Spain", - "Sweden": "Sweden", - "Ukraine": "Ukraine", - "United Kingdom": "United Kingdom", - "United States": "United States", - "Viet Nam": "Vietnam", - "World": "World", - "T\u00fcrkiye": "Turkey" -} \ No newline at end of file diff --git a/etl/steps/archive/garden/irena/2023-01-04/renewable_power_generation_costs.meta.yml b/etl/steps/archive/garden/irena/2023-01-04/renewable_power_generation_costs.meta.yml deleted file mode 100644 index 626b782e2e0..00000000000 --- a/etl/steps/archive/garden/irena/2023-01-04/renewable_power_generation_costs.meta.yml +++ /dev/null @@ -1,53 +0,0 @@ -dataset: - namespace: irena - title: Renewable power generation costs (IRENA, 2023) - short_name: renewable_power_generation_costs - version: 2023-01-04 - description: | - Levelized cost of energy (LCOE) estimates the average cost per unit of energy generated across the lifetime of a new power plant. It is measured in 2021 US$ per kilowatt-hour. 
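(Editor's note.) LCOE divides discounted lifetime costs by discounted lifetime generation. A minimal sketch of the standard formula; the discount rate and the cost/energy series below are illustrative assumptions, not IRENA's inputs:

def lcoe(costs, energy_kwh, rate):
    """Levelized cost of energy in $ per kWh.

    costs[t] and energy_kwh[t] are totals for year t of the plant's lifetime;
    year 0 typically carries the capital expenditure.
    """
    discounted_costs = sum(c / (1 + rate) ** t for t, c in enumerate(costs))
    discounted_energy = sum(e / (1 + rate) ** t for t, e in enumerate(energy_kwh))
    return discounted_costs / discounted_energy

# Illustrative only: $1,000 upfront plus $50/year to run, 5,000 kWh/year, 5% rate.
print(round(lcoe([1050, 50, 50], [5000, 5000, 5000], 0.05), 4))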
- sources: - - - name: International Renewable Energy Agency (IRENA) - published_by: International Renewable Energy Agency © IRENA 2022 - url: https://irena.org/publications/2022/Jul/Renewable-Power-Generation-Costs-in-2021 - date_accessed: 2022-10-20 - -tables: - renewable_power_generation_costs: - variables: - bioenergy: - title: Bioenergy levelized cost of energy - short_unit: $/kWh - unit: 2021 US$ per kilowatt-hour - concentrated_solar_power: - title: Concentrated solar power levelized cost of energy - short_unit: $/kWh - unit: 2021 US$ per kilowatt-hour - geothermal: - title: Geothermal levelized cost of energy - short_unit: $/kWh - unit: 2021 US$ per kilowatt-hour - hydropower: - title: Hydropower levelized cost of energy - short_unit: $/kWh - unit: 2021 US$ per kilowatt-hour - offshore_wind: - title: Offshore wind levelized cost of energy - short_unit: $/kWh - unit: 2021 US$ per kilowatt-hour - onshore_wind: - title: Onshore wind levelized cost of energy - short_unit: $/kWh - unit: 2021 US$ per kilowatt-hour - solar_photovoltaic: - title: Solar photovoltaic levelized cost of energy - short_unit: $/kWh - unit: 2021 US$ per kilowatt-hour - solar_photovoltaic_module_prices: - variables: - cost: - title: Solar photovoltaic module prices - short_unit: "$/W" - unit: "2021 US$ per Watt" - description: | - Global average price of solar photovoltaic modules. diff --git a/etl/steps/archive/garden/irena/2023-01-04/renewable_power_generation_costs.py b/etl/steps/archive/garden/irena/2023-01-04/renewable_power_generation_costs.py deleted file mode 100644 index 4cdbc19c9ce..00000000000 --- a/etl/steps/archive/garden/irena/2023-01-04/renewable_power_generation_costs.py +++ /dev/null @@ -1,45 +0,0 @@ -import pandas as pd -from owid import catalog - -from etl.data_helpers import geo -from etl.helpers import PathFinder - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load dataset from Meadow. - ds_meadow: catalog.Dataset = paths.load_dependency("renewable_power_generation_costs") - # Load main table from dataset. - tb_meadow = ds_meadow["renewable_power_generation_costs"] - # Load table on solar photovoltaic module prices. - tb_meadow_solar_pv = ds_meadow["solar_photovoltaic_module_prices"] - - # Create dataframes out of the tables. - df = pd.DataFrame(tb_meadow).reset_index() - df_pv = pd.DataFrame(tb_meadow_solar_pv).reset_index() - - # Harmonize country names. - df = geo.harmonize_countries(df=df, countries_file=paths.country_mapping_path) - - # - # Save outputs. - # - # Create a new Garden dataset. - ds_garden = catalog.Dataset.create_empty(dest_dir) - - # Create a new table of LCOE and add it to the dataset. - tb_garden = catalog.Table(df, underscore=True, short_name=paths.short_name) - ds_garden.add(tb_garden) - - # Create a new table of solar PV module prices and add it to the dataset. - tb_garden_pv = catalog.Table(df_pv, underscore=True, short_name="solar_photovoltaic_module_prices") - ds_garden.add(tb_garden_pv) - - # Update metadata and save dataset. 
- ds_garden.update_metadata(paths.metadata_path) - ds_garden.save() diff --git a/etl/steps/archive/garden/met_office_hadley_centre/2023-01-02/near_surface_temperature.meta.yml b/etl/steps/archive/garden/met_office_hadley_centre/2023-01-02/near_surface_temperature.meta.yml deleted file mode 100644 index e7612fac673..00000000000 --- a/etl/steps/archive/garden/met_office_hadley_centre/2023-01-02/near_surface_temperature.meta.yml +++ /dev/null @@ -1,35 +0,0 @@ -dataset: - short_name: near_surface_temperature - title: Near-surface temperature anomaly (Met Office Hadley Centre, 2022) - description: | - Temperature anomalies are based on the HadCRUT5 near-surface temperature dataset as published by the Met Office Hadley Centre. Temperature anomalies are given in degrees celsius relative to the average temperature over the period 1961-1990. - - These are available for the Northern Hemisphere and the Southern Hemisphere. The global mean has been calculated by averaging anomalies for northern and southern hemispheres. - sources: - - - name: Met Office Hadley Centre - published_by: | - Morice, C.P., J.J. Kennedy, N.A. Rayner, J.P. Winn, E. Hogan, R.E. Killick, R.J.H. Dunn, T.J. Osborn, P.D. Jones and I.R. Simpson (in press) An updated assessment of near-surface temperature change from 1850: the HadCRUT5 dataset. Journal of Geophysical Research (Atmospheres) doi:10.1029/2019JD032361 (supporting information). - date_accessed: 2023-01-02 - publication_year: 2022 - url: "https://www.metoffice.gov.uk/hadobs/hadcrut5/" - -tables: - near_surface_temperature: - variables: - temperature_anomaly: - title: Global average temperature anomaly relative to 1961-1990 - short_unit: °C - unit: degrees Celsius - description: | - Annual average temperature anomaly (relative to 1961-90). - - The global mean has been calculated by averaging anomalies for northern and southern hemispheres. - upper_limit: - title: Upper bound (95% confidence interval) of the annual temperature anomaly - short_unit: °C - unit: degrees Celsius - lower_limit: - title: Lower bound (95% confidence interval) of the annual temperature anomaly - short_unit: °C - unit: degrees Celsius diff --git a/etl/steps/archive/garden/met_office_hadley_centre/2023-01-02/near_surface_temperature.py b/etl/steps/archive/garden/met_office_hadley_centre/2023-01-02/near_surface_temperature.py deleted file mode 100644 index ce613426dce..00000000000 --- a/etl/steps/archive/garden/met_office_hadley_centre/2023-01-02/near_surface_temperature.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Curate near surface temperature dataset by Met Office Hadley Centre. - -""" - -import pandas as pd -from owid.catalog import Dataset, Table - -from etl.helpers import PathFinder -from etl.paths import DATA_DIR - -# Get naming conventions. -N = PathFinder(__file__) - -# Meadow and garden dataset versions. -MEADOW_VERSION = "2023-01-02" -GARDEN_VERSION = MEADOW_VERSION - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load dataset from meadow. - ds_meadow = Dataset(DATA_DIR / f"meadow/met_office_hadley_centre/{MEADOW_VERSION}/near_surface_temperature") - tb_meadow = ds_meadow["near_surface_temperature"] - df = pd.DataFrame(tb_meadow) - - # - # Save outputs. - # - # Create a new garden dataset with the same metadata as in meadow. - ds_garden = Dataset.create_empty(dest_dir, metadata=ds_meadow.metadata) - - # Create a new table with the same metadata as in meadow and add it to the dataset. 
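# (Editor's note: annotation, not part of the deleted step.) `like=tb_meadow` on
# the next line is what the comment above refers to: the new Table takes its
# metadata from the template table, so only the underlying data is replaced.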
- tb_garden = Table(df, like=tb_meadow) - ds_garden.add(tb_garden) - - # Update dataset metadata and save dataset. - ds_garden.update_metadata(N.metadata_path) - ds_garden.save() diff --git a/etl/steps/archive/garden/met_office_hadley_centre/2023-01-17/near_surface_temperature.meta.yml b/etl/steps/archive/garden/met_office_hadley_centre/2023-01-17/near_surface_temperature.meta.yml deleted file mode 100644 index 4a5e3abb658..00000000000 --- a/etl/steps/archive/garden/met_office_hadley_centre/2023-01-17/near_surface_temperature.meta.yml +++ /dev/null @@ -1,23 +0,0 @@ -dataset: - title: Near-surface temperature anomaly (Met Office Hadley Centre, 2023) - description: "" - -tables: - near_surface_temperature: - variables: - temperature_anomaly: - title: Global average temperature anomaly relative to 1961-1990 - short_unit: °C - unit: degrees Celsius - description: | - Annual average temperature anomaly (relative to 1961-90). - - The global mean has been calculated by averaging anomalies for northern and southern hemispheres. - upper_limit: - title: Upper bound (95% confidence interval) of the annual temperature anomaly - short_unit: °C - unit: degrees Celsius - lower_limit: - title: Lower bound (95% confidence interval) of the annual temperature anomaly - short_unit: °C - unit: degrees Celsius \ No newline at end of file diff --git a/etl/steps/archive/garden/met_office_hadley_centre/2023-01-17/near_surface_temperature.py b/etl/steps/archive/garden/met_office_hadley_centre/2023-01-17/near_surface_temperature.py deleted file mode 100644 index e11436d549d..00000000000 --- a/etl/steps/archive/garden/met_office_hadley_centre/2023-01-17/near_surface_temperature.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Load a meadow dataset and create a garden dataset.""" - -import pandas as pd -from owid.catalog import Dataset, Table -from structlog import get_logger - -from etl.helpers import PathFinder - -log = get_logger() - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - log.info("near_surface_temperature.start") - - # - # Load inputs. - # - # Load meadow dataset. - ds_meadow: Dataset = paths.load_dependency("near_surface_temperature") - - # Read table from meadow dataset. - tb_meadow = ds_meadow["near_surface_temperature"] - - # Create a dataframe with data from the table. - df = pd.DataFrame(tb_meadow) - - # - # Process data. - # - # Create a new table with the processed data. - tb_garden = Table(df, like=tb_meadow) - - # - # Save outputs. - # - # Create a new garden dataset with the same metadata as the meadow dataset. - ds_garden = Dataset.create_empty(dest_dir, metadata=ds_meadow.metadata) - - # Add table of processed data to the new dataset. - ds_garden.add(tb_garden) - - # Update dataset and table metadata using the adjacent yaml file. - ds_garden.update_metadata(paths.metadata_path) - - # Save changes in the new garden dataset. 
- ds_garden.save() - - log.info("near_surface_temperature.end") diff --git a/etl/steps/archive/garden/papers/2022-11-04/riley_2005/__init__.py b/etl/steps/archive/garden/papers/2022-11-04/riley_2005/__init__.py deleted file mode 100644 index e41420165b9..00000000000 --- a/etl/steps/archive/garden/papers/2022-11-04/riley_2005/__init__.py +++ /dev/null @@ -1,81 +0,0 @@ -import json -from typing import List, cast - -import pandas as pd -from owid.catalog import Dataset, Table -from owid.catalog.utils import underscore_table -from structlog import get_logger - -from etl.data_helpers import geo -from etl.helpers import PathFinder -from etl.paths import DATA_DIR - -log = get_logger() - -# naming conventions -N = PathFinder(__file__) -COUNTRY_MAPPING_PATH = N.directory / "countries.json" -EXCLUDED_COUNTRIES_PATH = N.directory / "excluded_countries.json" -METADATA_PATH = N.directory / "meta.yml" - - -def run(dest_dir: str) -> None: - log.info("riley_2005.start") - - # read dataset from meadow - ds_meadow = Dataset(DATA_DIR / "meadow/papers/2022-11-04/riley_2005") - tb_meadow = ds_meadow["riley_2005"] - - df = pd.DataFrame(tb_meadow) - - log.info("riley_2005.exclude_countries") - df = exclude_countries(df) - - log.info("riley_2005.harmonize_countries") - df = harmonize_countries(df) - - ds_garden = Dataset.create_empty(dest_dir) - ds_garden.metadata = ds_meadow.metadata - - tb_garden = underscore_table(Table(df)) - tb_garden.metadata = tb_meadow.metadata - for col in tb_garden.columns: - tb_garden[col].metadata = tb_meadow[col].metadata - - ds_garden.metadata.update_from_yaml(METADATA_PATH) - tb_garden.update_metadata_from_yaml(METADATA_PATH, "riley_2005") - - tb_garden = tb_garden.set_index(["entity", "year"]) - - ds_garden.add(tb_garden) - ds_garden.save() - - log.info("riley_2005.end") - - -def load_excluded_countries() -> List[str]: - with open(EXCLUDED_COUNTRIES_PATH, "r") as f: - data = json.load(f) - assert isinstance(data, list) - return data - - -def exclude_countries(df: pd.DataFrame) -> pd.DataFrame: - excluded_countries = load_excluded_countries() - return cast(pd.DataFrame, df.loc[~df.entity.isin(excluded_countries)]) - - -def harmonize_countries(df: pd.DataFrame) -> pd.DataFrame: - unharmonized_countries = df["entity"] - df = geo.harmonize_countries(df=df, countries_file=str(COUNTRY_MAPPING_PATH), country_col="entity") - - missing_countries = set(unharmonized_countries[df.entity.isnull()]) - if any(missing_countries): - raise RuntimeError( - "The following raw country names have not been harmonized. " - f"Please: (a) edit {COUNTRY_MAPPING_PATH} to include these country " - f"names; or (b) add them to {EXCLUDED_COUNTRIES_PATH}." 
- f"Raw country names: {missing_countries}" - ) - - return df diff --git a/etl/steps/archive/garden/papers/2022-11-04/riley_2005/countries.json b/etl/steps/archive/garden/papers/2022-11-04/riley_2005/countries.json deleted file mode 100644 index 51acfbdf297..00000000000 --- a/etl/steps/archive/garden/papers/2022-11-04/riley_2005/countries.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "Africa": "Africa (Riley 2005)", - "Asia": "Asia (Riley 2005)", - "Europe": "Europe (Riley 2005)", - "Oceania": "Oceania (Riley 2005)", - "World": "World", - "Americas": "Americas (Riley 2005)" -} \ No newline at end of file diff --git a/etl/steps/archive/garden/papers/2022-11-04/riley_2005/excluded_countries.json b/etl/steps/archive/garden/papers/2022-11-04/riley_2005/excluded_countries.json deleted file mode 100644 index 7a88417e503..00000000000 --- a/etl/steps/archive/garden/papers/2022-11-04/riley_2005/excluded_countries.json +++ /dev/null @@ -1,3 +0,0 @@ -[ - "Soviet Union" -] \ No newline at end of file diff --git a/etl/steps/archive/garden/papers/2022-11-04/riley_2005/meta.yml b/etl/steps/archive/garden/papers/2022-11-04/riley_2005/meta.yml deleted file mode 100644 index b73908d0f67..00000000000 --- a/etl/steps/archive/garden/papers/2022-11-04/riley_2005/meta.yml +++ /dev/null @@ -1,33 +0,0 @@ -dataset: - namespace: papers - short_name: riley_2005 - title: Estimates of Regional and Global Life Expectancy, 1800-2001 - Riley (2005) - description: | - Historians and demographers have gone to considerable trouble to reconstruct life expectancy in the past in individual countries. - - This overview collects information from a large body of that work and links estimates for historical populations to those provided by the United Nations, the World Bank, and other sources for 1950–2001. The result is a picture of regional and global life expectancy at birth for selected years from 1800 to 2001. The bibliography of more than 700 sources is published separately on the web. - - Continents definitions may differ from OWID's definitions. The individual countries included are those that had populations in 2000 of at least 400,000, plus Iceland. Smaller countries have been excluded chiefly because of the difficulty of finding estimates for life expectancy before the 1940s. For 2001 the population included in the estimates is 99.6 percent of the total midyear world population. - - For more details please read the 'Notes' section from the source. - licenses: - - name: JSTOR - url: https://about.jstor.org/terms/ - version: "2022-11-04" - sources: - - name: Riley (2005) - url: https://doi.org/10.1111/j.1728-4457.2005.00083.x - source_data_url: https://u.demog.berkeley.edu/~jrw/Biblio/Eprints/%20P-S/riley.2005_estimates.global.e0.pdf - owid_data_url: https://walden.nyc3.digitaloceanspaces.com/papers/2022-11-01/riley_2005.pdf - date_accessed: "2022-11-01" - publication_date: "2005-10-21" - publication_year: 2005 -tables: - riley_2005: - title: Life expectancy (at birth) - description: Life expectancy at birth estimates. 
- variables: - life_expectancy: - title: life_expectancy - short_unit: "years" - unit: "years" diff --git a/etl/steps/archive/garden/papers/2023-01-04/farmer_lafond_2016.meta.yml b/etl/steps/archive/garden/papers/2023-01-04/farmer_lafond_2016.meta.yml deleted file mode 100644 index 1ddc2bf061d..00000000000 --- a/etl/steps/archive/garden/papers/2023-01-04/farmer_lafond_2016.meta.yml +++ /dev/null @@ -1,419 +0,0 @@ -dataset: - namespace: papers - short_name: farmer_lafond_2016 - title: Cost of different technologies (Farmer & Lafond (2016), 2023) - description: | - Cost of each technology, expressed in different units, that have been chosen for visualization purposes, namely: - + Acrylic fiber is measured in 1966 USD/lbs. - + Acrylonitrile is measured in 1966 USD/lbs. - + Aluminum is measured in 1966 USD/lbs. - + Ammonia is measured in 1966 USD/lbs. - + Aniline is measured in 1966 USD/lbs. - + Automotive (US) is measured in Gallons/Mile. - + Beer (Japan) is measured in 1955 Yen. - + Benzene is measured in 1958 USD. - + Bisphenol A is measured in 1966 USD/lbs. - + Caprolactam is measured in 1966 USD/lbs. - + Carbon black is measured in 1966 USD/lbs. - + Carbon disulfide is measured in 1966 USD/lbs. - + CCGT power is measured in 1990 USD/kW. - + Concentrating solar is measured in US cents/kWh. - + Corn (US) is measured in acres/1000 bushels. - + Crude oil is measured in 1958 USD. - + Cyclohexane is measured in 1966 USD/lbs. - + DNA sequencing is measured in 2013 USD/human-size genome. - + DRAM is measured in 2005 USD/thousand bits. - + Electric range is measured in 1958 USD. - + Ethanol (Brazil) is measured in 2002 USD/GJ. - + Ethanolamine is measured in 1966 USD/lbs. - + Ethylene is measured in 1966 USD/lbs. - + Formaldehyde is measured in 1966 USD/lbs. - + Free standing gas range is measured in 1958 USD. - + Geothermal electricity is measured in 2005 US cents/kWh. - + Hard disk drive is measured in 2005 USD/megabyte. - + Hydrofluoric acid is measured in 1966 USD/lbs. - + Isopropyl alcohol is measured in 1966 USD/lbs. - + Laser diode is measured in Yen. - + Low-density polyethylene is measured in 1958 USD/pound. - + Magnesium is measured in 1966 USD/lbs. - + Maleic anhydride is measured in 1966 USD/lbs. - + Methanol is measured in 1966 USD/lbs. - + Milk (US) is measured in Heads/Mil.lbs. - + Monochrome television is measured in 1958 USD per unit. - + Motor gasoline is measured in 1958 USD/Gallon. - + Neoprene rubber is measured in 1966 USD/lbs. - + Nuclear electricity is measured in 2004 USD/Watt. - + Onshore gas pipeline is measured in dollar/mile-inch. - + Paraxylene is measured in 1958 USD. - + Pentaerythritol is measured in 1966 USD/lbs. - + Phenol is measured in 1966 USD/lbs. - + Photovoltaics is measured in 2013 USD/Wp. - + Phthalic anhydride is measured in 1966 USD/lbs. - + Polyester fiber is measured in 1966 USD/lbs. - + Polyethylene HD is measured in 1966 USD/lbs. - + Polyethylene LD is measured in 1966 USD/lbs. - + Polypropylene is measured in 1958 USD/pound. - + Polystyrene is measured in 1958 USD/pound. - + Polyvinylchloride is measured in 1958 USD/pound. - + Primary aluminum is measured in 1958 USD/pound. - + Primary magnesium is measured in 1958 USD/pound. - + Refined cane sugar is measured in 1958 USD. - + Sodium is measured in 1966 USD/lbs. - + Sodium chlorate is measured in 1966 USD/lbs. - + Sodium hydrosulfite is measured in 1966 USD/lbs. - + Sorbitol is measured in 1966 USD/lbs. - + Styrene is measured in 1966 USD/lbs. - + Titanium sponge is measured in 1958 USD/lbs. 
- + Titanium dioxide is measured in 1966 USD/lbs. - + Transistor is measured in 2005 USD. - + Urea is measured in 1966 USD/lbs. - + Vinyl acetate is measured in 1966 USD/lbs. - + Vinyl chloride is measured in 1966 USD/lbs. - + Wind turbine (Denmark) is measured in DKK/kW. - - According to Farmer & Lafond (2016), the data are mostly taken from the Santa-Fe [Performance Curve Database](https://pcdb.santafe.edu/). The database has been constructed from personal communications and from [Colpier and Cornland (2002)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0095), [Goldemberg et al. (2004)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0130), [Lieberman (1984)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0180), [Lipman and Sperling (1999)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0190), [Zhao (1999)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0310), [McDonald and Schrattenholzer (2001)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0205), [Neij et al. (2003)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0235), [Moore (2006)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0215), [Nemet (2006)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0240), [Schilling and Esmundo (2009)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0265). The data on photovoltaic prices has been collected from public releases of Strategies Unlimited, Navigant and SPV Market Research. The data on nuclear energy is from [Koomey and Hultman (2007)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0165) and [Cooper (2009)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0100). The DNA sequencing data is from [Wetterstrand (2015)](https://www.sciencedirect.com/science/article/pii/S0048733315001699#bib0290) (cost per human-size genome), and for each year the last available month (September for 2001-2002 and October afterwards) was taken and corrected for inflation using the US GDP deflator. - licenses: - - name: Creative Commons 4.0 - url: https://www.sciencedirect.com/science/article/pii/S0048733315001699 - version: '2023-01-04' - sources: - - name: J. D. Farmer & F. Lafond (2016) - published_by: | - How predictable is technological progress? J. D. Farmer & F. Lafond, Research Policy Volume 45, Issue 3, April 2016, Pages 647-665. - url: https://www.sciencedirect.com/science/article/pii/S0048733315001699 - date_accessed: '2023-01-04' - publication_date: '2016-04-01' - publication_year: 2016 -tables: - farmer_lafond_2016: - variables: - acrylic_fiber: - title: Acrylic fiber - description: Cost of acrylic fiber, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - acrylonitrile: - title: Acrylonitrile - description: Cost of acrylonitrile, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - aluminum: - title: Aluminum - description: Cost of aluminum, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - ammonia: - title: Ammonia - description: Cost of ammonia, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - aniline: - title: Aniline - description: Cost of aniline, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - automotive_us: - title: Automotive (US) - description: Cost of automotive (US), measured in Gallons/Mile. 
- unit: Gallons/Mile - short_unit: '' - beer_japan: - title: Beer (Japan) - description: Cost of beer (Japan), measured in 1955 Yen. - unit: 1955 Yen - short_unit: '' - benzene: - title: Benzene - description: Cost of benzene, measured in 1958 USD. - unit: 1958 USD - short_unit: '' - bisphenol_a: - title: Bisphenol A - description: Cost of bisphenol A, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - caprolactam: - title: Caprolactam - description: Cost of caprolactam, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - carbon_black: - title: Carbon black - description: Cost of carbon black, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - carbon_disulfide: - title: Carbon disulfide - description: Cost of carbon disulfide, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - ccgt_power: - title: CCGT power - description: Cost of CCGT power, measured in 1990 USD/kW. - unit: 1990 USD/kW - short_unit: '' - concentrating_solar: - title: Concentrating solar - description: Cost of concentrating solar, measured in US cents/kWh. - unit: US cents/kWh - short_unit: '' - corn_us: - title: Corn (US) - description: Cost of corn (US), measured in acres/1000 bushels. - unit: acres/1000 bushels - short_unit: '' - crude_oil: - title: Crude oil - description: Cost of crude oil, measured in 1958 USD. - unit: 1958 USD - short_unit: '' - cyclohexane: - title: Cyclohexane - description: Cost of cyclohexane, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - dna_sequencing: - title: DNA sequencing - description: Cost of DNA sequencing, measured in 2013 USD/human-size genome. - unit: 2013 USD/human-size genome - short_unit: '' - dram: - title: DRAM - description: Cost of DRAM, measured in 2005 USD/thousand bits. - unit: 2005 USD/thousand bits - short_unit: '' - electric_range: - title: Electric range - description: Cost of electric range, measured in 1958 USD. - unit: 1958 USD - short_unit: '' - ethanol_brazil: - title: Ethanol (Brazil) - description: Cost of ethanol (Brazil), measured in 2002 USD/GJ. - unit: 2002 USD/GJ - short_unit: '' - ethanolamine: - title: Ethanolamine - description: Cost of ethanolamine, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - ethylene: - title: Ethylene - description: Cost of ethylene, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - formaldehyde: - title: Formaldehyde - description: Cost of formaldehyde, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - free_standing_gas_range: - title: Free standing gas range - description: Cost of free standing gas range, measured in 1958 USD. - unit: 1958 USD - short_unit: '' - geothermal_electricity: - title: Geothermal electricity - description: Cost of geothermal electricity, measured in 2005 US cents/kWh. - unit: 2005 US cents/kWh - short_unit: '' - hard_disk_drive: - title: Hard disk drive - description: Cost of hard disk drive, measured in 2005 USD/megabyte. - unit: 2005 USD/megabyte - short_unit: '' - hydrofluoric_acid: - title: Hydrofluoric acid - description: Cost of hydrofluoric acid, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - isopropyl_alcohol: - title: Isopropyl alcohol - description: Cost of isopropyl alcohol, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - laser_diode: - title: Laser diode - description: Cost of laser diode, measured in Yen.
- unit: Yen - short_unit: '' - low_density_polyethylene: - title: Low-density polyethylene - description: Cost of low-density polyethylene, measured in 1958 USD/pound. - unit: 1958 USD/pound - short_unit: '' - magnesium: - title: Magnesium - description: Cost of magnesium, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - maleic_anhydride: - title: Maleic anhydride - description: Cost of maleic anhydride, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - methanol: - title: Methanol - description: Cost of methanol, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - milk_us: - title: Milk (US) - description: Cost of milk (US), measured in Heads/Mil.lbs. - unit: Heads/Mil.lbs - short_unit: '' - monochrome_television: - title: Monochrome television - description: Cost of monochrome television, measured in 1958 USD per unit. - unit: 1958 USD per unit - short_unit: '' - motor_gasoline: - title: Motor gasoline - description: Cost of motor gasoline, measured in 1958 USD/Gallon. - unit: 1958 USD/Gallon - short_unit: '' - neoprene_rubber: - title: Neoprene rubber - description: Cost of neoprene rubber, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - nuclear_electricity: - title: Nuclear electricity - description: Cost of nuclear electricity, measured in 2004 USD/Watt. - unit: 2004 USD/Watt - short_unit: '' - onshore_gas_pipeline: - title: Onshore gas pipeline - description: Cost of onshore gas pipeline, measured in dollar/mile-inch. - unit: dollar/mile-inch - short_unit: '' - paraxylene: - title: Paraxylene - description: Cost of paraxylene, measured in 1958 USD. - unit: 1958 USD - short_unit: '' - pentaerythritol: - title: Pentaerythritol - description: Cost of pentaerythritol, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - phenol: - title: Phenol - description: Cost of phenol, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - photovoltaics: - title: Photovoltaics - description: Cost of photovoltaics, measured in 2013 USD/Wp. - unit: 2013 USD/Wp - short_unit: '' - phthalic_anhydride: - title: Phthalic anhydride - description: Cost of phthalic anhydride, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - polyester_fiber: - title: Polyester fiber - description: Cost of polyester fiber, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - polyethylene_hd: - title: Polyethylene HD - description: Cost of polyethylene HD, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - polyethylene_ld: - title: Polyethylene LD - description: Cost of polyethylene LD, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - polypropylene: - title: Polypropylene - description: Cost of polypropylene, measured in 1958 USD/pound. - unit: 1958 USD/pound - short_unit: '' - polystyrene: - title: Polystyrene - description: Cost of polystyrene, measured in 1958 USD/pound. - unit: 1958 USD/pound - short_unit: '' - polyvinylchloride: - title: Polyvinylchloride - description: Cost of polyvinylchloride, measured in 1958 USD/pound. - unit: 1958 USD/pound - short_unit: '' - primary_aluminum: - title: Primary aluminum - description: Cost of primary aluminum, measured in 1958 USD/pound. - unit: 1958 USD/pound - short_unit: '' - primary_magnesium: - title: Primary magnesium - description: Cost of primary magnesium, measured in 1958 USD/pound. 
- unit: 1958 USD/pound - short_unit: '' - refined_cane_sugar: - title: Refined cane sugar - description: Cost of refined cane sugar, measured in 1958 USD. - unit: 1958 USD - short_unit: '' - sodium: - title: Sodium - description: Cost of sodium, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - sodium_chlorate: - title: Sodium chlorate - description: Cost of sodium chlorate, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - sodium_hydrosulfite: - title: Sodium hydrosulfite - description: Cost of sodium hydrosulfite, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - sorbitol: - title: Sorbitol - description: Cost of sorbitol, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - styrene: - title: Styrene - description: Cost of styrene, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - titanium_sponge: - title: Titanium sponge - description: Cost of titanium sponge, measured in 1958 USD/lbs. - unit: 1958 USD/lbs - short_unit: '' - titanium_dioxide: - title: Titanium dioxide - description: Cost of titanium dioxide, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - transistor: - title: Transistor - description: Cost of transistor, measured in 2005 USD. - unit: 2005 USD - short_unit: '' - urea: - title: Urea - description: Cost of urea, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - vinyl_acetate: - title: Vinyl acetate - description: Cost of vinyl acetate, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - vinyl_chloride: - title: Vinyl chloride - description: Cost of vinyl chloride, measured in 1966 USD/lbs. - unit: 1966 USD/lbs - short_unit: '' - wind_turbine_denmark: - title: Wind turbine (Denmark) - description: Cost of wind turbine (Denmark), measured in DKK/kW. - unit: DKK/kW - short_unit: '' diff --git a/etl/steps/archive/garden/papers/2023-01-04/farmer_lafond_2016.py b/etl/steps/archive/garden/papers/2023-01-04/farmer_lafond_2016.py deleted file mode 100644 index 44dd946692d..00000000000 --- a/etl/steps/archive/garden/papers/2023-01-04/farmer_lafond_2016.py +++ /dev/null @@ -1,108 +0,0 @@ -"""Harmonize data from Farmer & Lafond (2016) paper on the evolution of the cost of different technologies. - -""" - -from owid import catalog - -from etl.helpers import PathFinder - -# Get paths and naming conventions for current data step. -paths = PathFinder(__file__) - -# Columns to select from Meadow table, and how to rename them. 
-COLUMNS = { - "acrylicfiber": "acrylic_fiber", - "acrylonitrile": "acrylonitrile", - "aluminum": "aluminum", - "ammonia": "ammonia", - "aniline": "aniline", - "automotive__us": "automotive_us", - "beer__japan": "beer_japan", - "benzene": "benzene", - "bisphenola": "bisphenol_a", - "caprolactam": "caprolactam", - "carbonblack": "carbon_black", - "carbondisulfide": "carbon_disulfide", - "ccgt_power": "ccgt_power", - "concentrating_solar": "concentrating_solar", - "corn__us": "corn_us", - "crude_oil": "crude_oil", - "cyclohexane": "cyclohexane", - "dna_sequencing": "dna_sequencing", - "dram": "dram", - "electric_range": "electric_range", - "ethanol__brazil": "ethanol_brazil", - "ethanolamine": "ethanolamine", - "ethylene": "ethylene", - "formaldehyde": "formaldehyde", - "free_standing_gas_range": "free_standing_gas_range", - "geothermal_electricity": "geothermal_electricity", - "hard_disk_drive": "hard_disk_drive", - "hydrofluoricacid": "hydrofluoric_acid", - "isopropylalcohol": "isopropyl_alcohol", - "laser_diode": "laser_diode", - "low_density_polyethylene": "low_density_polyethylene", - "magnesium": "magnesium", - "maleicanhydride": "maleic_anhydride", - "methanol": "methanol", - "milk__us": "milk_us", - "monochrome_television": "monochrome_television", - "motor_gasoline": "motor_gasoline", - "neoprenerubber": "neoprene_rubber", - "nuclear_electricity": "nuclear_electricity", - "onshore_gas_pipeline": "onshore_gas_pipeline", - "paraxylene": "paraxylene", - "pentaerythritol": "pentaerythritol", - "phenol": "phenol", - "photovoltaics": "photovoltaics", - "phthalicanhydride": "phthalic_anhydride", - "polyesterfiber": "polyester_fiber", - "polyethylenehd": "polyethylene_hd", - "polyethyleneld": "polyethylene_ld", - "polypropylene": "polypropylene", - "polystyrene": "polystyrene", - "polyvinylchloride": "polyvinylchloride", - "primary_aluminum": "primary_aluminum", - "primary_magnesium": "primary_magnesium", - "refined_cane_sugar": "refined_cane_sugar", - "sodium": "sodium", - "sodiumchlorate": "sodium_chlorate", - "sodiumhydrosulfite": "sodium_hydrosulfite", - "sorbitol": "sorbitol", - "styrene": "styrene", - "titanium_sponge": "titanium_sponge", - "titanium_dioxide": "titanium_dioxide", - "transistor": "transistor", - "urea": "urea", - "vinylacetate": "vinyl_acetate", - "vinylchloride": "vinyl_chloride", - "wind_turbine__denmark": "wind_turbine_denmark", -} - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load dataset from Meadow. - ds_meadow: catalog.Dataset = paths.load_dependency("farmer_lafond_2016") - tb_meadow = ds_meadow["farmer_lafond_2016"] - - # - # Process data. - # - # Rename technologies conveniently (both in column names and in metadata). - tb_garden = tb_meadow.rename(columns=COLUMNS, errors="raise")[COLUMNS.values()].copy() - - # - # Save outputs. - # - # Create a new dataset with the same metadata as meadow - ds_garden = catalog.Dataset.create_empty(dest_dir, metadata=ds_meadow.metadata) - - # Create a new table with the same metadata as meadow and add it to the dataset. - ds_garden.add(tb_garden) - - # Update dataset metadata and save dataset. 
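# Illustrative sketch (not part of the deleted step): rename(columns=COLUMNS, errors="raise")
# followed by [COLUMNS.values()] doubles as a schema check. Toy column names below.

import pandas as pd

columns = {"ccgt_power": "ccgt_power", "dna_sequencing": "dna_sequencing"}
df = pd.DataFrame(columns=["ccgt_power", "dna_sequencing", "extra"])

# Known columns are renamed and selected in a fixed order; "extra" is dropped.
tb = df.rename(columns=columns, errors="raise")[list(columns.values())]

# If an expected column disappears upstream, the step fails loudly.
try:
    df.drop(columns=["ccgt_power"]).rename(columns=columns, errors="raise")
except KeyError:
    pass  # raised because "ccgt_power" is missing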
- ds_garden.update_metadata(paths.metadata_path) - ds_garden.save() diff --git a/etl/steps/archive/garden/papers/2023-01-04/nemet_2009.meta.yml b/etl/steps/archive/garden/papers/2023-01-04/nemet_2009.meta.yml deleted file mode 100644 index a044309a461..00000000000 --- a/etl/steps/archive/garden/papers/2023-01-04/nemet_2009.meta.yml +++ /dev/null @@ -1,16 +0,0 @@ -dataset: - namespace: papers - short_name: nemet_2009 - title: Cost and capacity of photovoltaic technology - licenses: - - name: Creative Commons Attribution 3.0 Unported License - url: https://pcdb.santafe.edu/graph.php?curve=158 - version: '2023-01-04' - sources: - - name: G. G. Nemet (2009) - published_by: | - Interim monitoring of cost dynamics for publicly supported energy technologies. Energy Policy 37(3): 825-835. by Nemet, G. F. (2009). - url: https://www.sciencedirect.com/science/article/abs/pii/S0301421508005910 - date_accessed: '2023-01-04' - publication_date: '2009-03-01' - publication_year: 2009 diff --git a/etl/steps/archive/garden/papers/2023-01-04/nemet_2009.py b/etl/steps/archive/garden/papers/2023-01-04/nemet_2009.py deleted file mode 100644 index 44e3a9d62c7..00000000000 --- a/etl/steps/archive/garden/papers/2023-01-04/nemet_2009.py +++ /dev/null @@ -1,33 +0,0 @@ -"""Harmonize data from Nemet (2009) paper on cost and capacity of photovoltaic energy. - -""" - -from owid import catalog - -from etl.helpers import PathFinder - -# Get paths and naming conventions for current data step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load dataset from Meadow. - ds_meadow: catalog.Dataset = paths.load_dependency("nemet_2009") - tb_meadow = ds_meadow["nemet_2009"] - - # - # Save outputs. - # - # Create a new dataset with the same metadata as meadow - ds_garden = catalog.Dataset.create_empty(dest_dir, metadata=ds_meadow.metadata) - - # Create a new table and add it to the dataset. - tb_garden = tb_meadow.copy() - ds_garden.add(tb_garden) - - # Update dataset metadata and save dataset. - ds_garden.update_metadata(paths.metadata_path) - ds_garden.save() diff --git a/etl/steps/archive/garden/rff/2022-09-14/emissions_weighted_carbon_price.countries.json b/etl/steps/archive/garden/rff/2022-09-14/emissions_weighted_carbon_price.countries.json deleted file mode 100644 index 1574c947c15..00000000000 --- a/etl/steps/archive/garden/rff/2022-09-14/emissions_weighted_carbon_price.countries.json +++ /dev/null @@ -1,204 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "Andorra": "Andorra", - "Angola": "Angola", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas, The": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo, Dem. 
Rep.": "Democratic Republic of Congo", - "Congo, Rep.": "Congo", - "Costa Rica": "Costa Rica", - "Cote d'Ivoire": "Cote d'Ivoire", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cyprus": "Cyprus", - "Czech Republic": "Czechia", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt, Arab Rep.": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Ethiopia": "Ethiopia", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "Gabon": "Gabon", - "Gambia, The": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Greece": "Greece", - "Grenada": "Grenada", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hong Kong SAR, China": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran, Islamic Rep.": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Korea, Rep.": "South Korea", - "Kosovo": "Kosovo", - "Kuwait": "Kuwait", - "Kyrgyz Republic": "Kyrgyzstan", - "Lao PDR": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Liechtenstein": "Liechtenstein", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macao SAR, China": "Macao", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mexico": "Mexico", - "Moldova": "Moldova", - "Monaco": "Monaco", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Romania": "Romania", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "Samoa": "Samoa", - "San Marino": "San Marino", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Slovak Republic": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "St. Kitts and Nevis": "Saint Kitts and Nevis", - "St. Lucia": "Saint Lucia", - "St. 
Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Swaziland": "Eswatini", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syrian Arab Republic": "Syria", - "Tajikistan": "Tajikistan", - "Tanzania": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Tuvalu": "Tuvalu", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United States": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela, RB": "Venezuela", - "Vietnam": "Vietnam", - "Western Sahara": "Western Sahara", - "World": "World", - "Yemen, Rep.": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "Federated States of Micronesia": "Micronesia (country)", - "Korea, Dem. Rep.": "North Korea", - "Taiwan, China": "Taiwan", - "Vatican City": "Vatican", - "West Bank and Gaza": "Palestine" -} diff --git a/etl/steps/archive/garden/rff/2022-09-14/emissions_weighted_carbon_price.meta.yml b/etl/steps/archive/garden/rff/2022-09-14/emissions_weighted_carbon_price.meta.yml deleted file mode 100644 index f60c3b175f8..00000000000 --- a/etl/steps/archive/garden/rff/2022-09-14/emissions_weighted_carbon_price.meta.yml +++ /dev/null @@ -1,137 +0,0 @@ -dataset: - short_name: emissions_weighted_carbon_price - title: Emissions-weighted carbon price - description: | - The World Carbon Pricing Database covers national and subnational economic mechanisms relating to carbon emissions from 1990 onwards. It was developed from several key sources: most notably, policy documents from countries and regions themselves. Secondly, from other sources such as the International Carbon Action Partnership. - - The dataset primarily focuses on economic instruments targeting carbon dioxide (CO2) emissions. In some cases these instruments also cover other greenhouse gases. However, any pricing mechanisms that target non-CO2 gases (such as methane or nitrous oxide) specifically, are not included. - - A country is considered to have a carbon tax or emissions trading system if at least one IPCC sector or gas is covered by the instrument. These instruments do not need to cover all sectors within the economy for this to apply. - - For each country, researchers calculate an emissions-weighted carbon price for the economy. To do this, they rely on two metrics: - - - Carbon prices applied at the sectoral level (e.g. electricity, or road transport) - - Each sector’s contribution to a country’s CO2 emissions (e.g. what percentage of a country’s emissions come from electricity, or road transport) - They then weight each sector’s carbon price by the relevant sector’s contribution to CO2 emissions, and aggregate these figures to get an economy-wide weighted carbon price. - - A full technical note on this methodology is provided by the authors [here](https://www.rff.org/publications/working-papers/emissions-weighted-carbon-price-sources-and-methods/). - sources: - - name: Dolphin, Pollitt and Newbery (2020). Emissions-weighted Carbon Price. - published_by: >- - Dolphin, G., Pollitt, M. and Newbery, D. 2020. The political economy of carbon pricing: a panel analysis. - Oxford Economic Papers 72(2): 472-500. 
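# Illustrative sketch (not from the dataset): a toy version of the sector weighting
# described in the dataset description above; prices and shares are invented.

# Sectoral carbon prices (US$/tCO2) and each sector's share of the country's
# CO2 emissions; sectors with no carbon price enter at a price of zero.
prices = {"electricity": 30.0, "road_transport": 50.0, "industry": 0.0}
shares = {"electricity": 0.4, "road_transport": 0.2, "industry": 0.4}

economy_wide_price = sum(prices[s] * shares[s] for s in prices)
assert round(economy_wide_price, 9) == 22.0  # 0.4 * 30 + 0.2 * 50 + 0.4 * 0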
- publication_year: 2022 - publication_date: 2022-01-18 - url: https://github.com/g-dolphin/ECP -tables: - emissions_weighted_carbon_price: - title: Emissions-weighted carbon price - variables: - co2_with_ets_as_share_of_co2: - title: CO2 emissions covered by an ETS as a share of the country's CO2 emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_ets_as_share_of_ghg: - title: CO2 emissions covered by an ETS as a share of the country's GHG emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_ets_as_share_of_world_co2: - title: CO2 emissions covered by an ETS as a share of the world's CO2 emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_ets_as_share_of_world_ghg: - title: CO2 emissions covered by an ETS as a share of the world's GHG emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_tax_as_share_of_co2: - title: CO2 emissions covered by a carbon tax as a share of the country's CO2 emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_tax_as_share_of_ghg: - title: CO2 emissions covered by a carbon tax as a share of the country's GHG emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_tax_as_share_of_world_co2: - title: CO2 emissions covered by a carbon tax as a share of the world's CO2 emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_tax_as_share_of_world_ghg: - title: CO2 emissions covered by a carbon tax as a share of the world's GHG emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_tax_or_ets_as_share_of_co2: - title: CO2 emissions covered by a carbon tax or an ETS as a share of the country's CO2 emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_tax_or_ets_as_share_of_ghg: - title: CO2 emissions covered by a carbon tax or an ETS as a share of the country's GHG emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_tax_or_ets_as_share_of_world_co2: - title: CO2 emissions covered by a carbon tax or an ETS as a share of the world's CO2 emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_tax_or_ets_as_share_of_world_ghg: - title: CO2 emissions covered by a carbon tax or an ETS as a share of the world's GHG emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - price_with_ets_weighted_by_share_of_co2: - title: Average price on emissions covered by an ETS, weighted by the share of the country's CO2 emissions - unit: 2019 US$ per tonne of carbon dioxide equivalents - short_unit: 2019 US$/ tCO2e - display: - numDecimalPlaces: 2 - price_with_ets_weighted_by_share_of_ghg: - title: Average price on emissions covered by an ETS, weighted by the share of the country's GHG emissions - unit: 2019 US$ per tonne of carbon dioxide equivalents - short_unit: 2019 US$/ tCO2e - display: - numDecimalPlaces: 2 - price_with_tax_or_ets_weighted_by_share_of_co2: - title: Average price on emissions covered by a carbon tax or an ETS, weighted by the share of the country's CO2 emissions - unit: 2019 US$ per tonne of carbon dioxide equivalents - short_unit: 2019 US$/ tCO2e - display: - numDecimalPlaces: 2 - price_with_tax_or_ets_weighted_by_share_of_ghg: - title: Average price on emissions covered by a carbon tax or an ETS, weighted by the share of the country's GHG emissions - unit: 2019 US$ per tonne of carbon dioxide equivalents - short_unit: 
2019 US$/ tCO2e - display: - numDecimalPlaces: 2 - price_with_tax_weighted_by_share_of_co2: - title: Average price on emissions covered by a carbon tax, weighted by the share of the country's CO2 emissions - unit: 2019 US$ per tonne of carbon dioxide equivalents - short_unit: 2019 US$/ tCO2e - display: - numDecimalPlaces: 2 - price_with_tax_weighted_by_share_of_ghg: - title: Average price on emissions covered by a carbon tax, weighted by the share of the country's GHG emissions - unit: 2019 US$ per tonne of carbon dioxide equivalents - short_unit: 2019 US$/ tCO2e - display: - numDecimalPlaces: 2 diff --git a/etl/steps/archive/garden/rff/2022-09-14/emissions_weighted_carbon_price.py b/etl/steps/archive/garden/rff/2022-09-14/emissions_weighted_carbon_price.py deleted file mode 100644 index 6f153c8f06e..00000000000 --- a/etl/steps/archive/garden/rff/2022-09-14/emissions_weighted_carbon_price.py +++ /dev/null @@ -1,216 +0,0 @@ -"""Combine dataset on coverage of emissions with the average prices of emissions covered by an ETS or a carbon tax. - -""" - -import pandas as pd -from owid.catalog import Dataset, Table -from owid.catalog.utils import underscore_table - -from etl.data_helpers import geo -from etl.paths import DATA_DIR, STEP_DIR - -# Details on garden dataset to be exported. -VERSION = "2022-09-14" -DATASET_NAME = "emissions_weighted_carbon_price" -TABLE_NAME = DATASET_NAME -# Path to country names file. -# NOTE: This countries file contains as many countries as the file for world_carbon_pricing, plus "World". -# Here we ignore all regions inside countries. -COUNTRIES_PATH = STEP_DIR / f"data/garden/rff/{VERSION}/{DATASET_NAME}.countries.json" -# Path to metadata file. -METADATA_PATH = STEP_DIR / f"data/garden/rff/{VERSION}/{DATASET_NAME}.meta.yml" -# Details on meadow datasets to be imported. -MEADOW_VERSION = "2022-09-14" -MEADOW_PATH_ECONOMY = DATA_DIR / f"meadow/rff/{MEADOW_VERSION}/emissions_weighted_carbon_price__economy" -MEADOW_PATH_COVERAGE = DATA_DIR / f"meadow/rff/{MEADOW_VERSION}/emissions_weighted_carbon_price__coverage" - -# Columns to keep from raw dataset and how to rename them. -COLUMNS = { - "jurisdiction": "country", - "year": "year", - # Emissions-weighted average price on emissions covered by either a carbon tax or an ETS. - # Weights: share of jurisdiction total CO2 emissions. - "ecp_all_jurco2_kusd": "price_with_tax_or_ets_weighted_by_share_of_co2", - # Emissions-weighted average price on emissions covered by either a carbon tax or an ETS. - # Weights: share of jurisdiction total GHG emissions. - "ecp_all_jurghg_kusd": "price_with_tax_or_ets_weighted_by_share_of_ghg", - # Emissions-weighted average price on emissions covered by an ETS. - # Weights: share of jurisdiction total CO2 emissions. - "ecp_ets_jurco2_kusd": "price_with_ets_weighted_by_share_of_co2", - # Emissions-weighted average price on emissions covered by an ETS. - # Weights: share of jurisdiction total GHG emissions. - "ecp_ets_jurghg_kusd": "price_with_ets_weighted_by_share_of_ghg", - # Emissions-weighted average price on emissions covered by a carbon tax. - # Weights: share of jurisdiction total CO2 emissions. - "ecp_tax_jurco2_kusd": "price_with_tax_weighted_by_share_of_co2", - # Emissions-weighted average price on emissions covered by a carbon tax. - # Weights: share of jurisdiction total GHG emissions. - "ecp_tax_jurghg_kusd": "price_with_tax_weighted_by_share_of_ghg", - # CO2 emissions covered by either a carbon tax or an ETS as a share of jurisdiction total CO2 emissions.
- "cov_all_co2_jurco2": "co2_with_tax_or_ets_as_share_of_co2", - # CO2 emissions covered by either a carbon tax or an ETS as a share of jurisdiction total GHG emissions. - "cov_all_co2_jurghg": "co2_with_tax_or_ets_as_share_of_ghg", - # CO2 emissions covered by either carbon taxes or an ETS as a share of world total CO2 emissions. - "cov_all_co2_wldco2": "co2_with_tax_or_ets_as_share_of_world_co2", - # CO2 emissions covered by either carbon taxes or an ETS as a share of world total GHG emissions. - "cov_all_co2_wldghg": "co2_with_tax_or_ets_as_share_of_world_ghg", - # CO2 emissions covered by an ETS as a share of jurisdiction total CO2 emissions. - "cov_ets_co2_jurco2": "co2_with_ets_as_share_of_co2", - # CO2 emissions covered by an ETS as a share of jurisdiction total GHG emissions. - "cov_ets_co2_jurghg": "co2_with_ets_as_share_of_ghg", - # CO2 emissions covered by an ETS as a share of world total CO2 emissions. - "cov_ets_co2_wldco2": "co2_with_ets_as_share_of_world_co2", - # CO2 emissions covered by an ETS as a share of world total GHG emissions. - "cov_ets_co2_wldghg": "co2_with_ets_as_share_of_world_ghg", - # CO2 emissions covered by a carbon tax as a share of jurisdiction total CO2 emissions. - "cov_tax_co2_jurco2": "co2_with_tax_as_share_of_co2", - # CO2 emissions covered by a carbon tax as a share of jurisdiction total GHG emissions. - "cov_tax_co2_jurghg": "co2_with_tax_as_share_of_ghg", - # CO2 emissions covered by a carbon tax as a share of world total CO2 emissions. - "cov_tax_co2_wldco2": "co2_with_tax_as_share_of_world_co2", - # CO2 emissions covered by a carbon tax as a share of world total GHG emissions. - "cov_tax_co2_wldghg": "co2_with_tax_as_share_of_world_ghg", - # # Other variables that are only relevant when considering sub-country regions (that we ignore for now): - # # Emissions-weighted average price on emissions covered by either a carbon tax or an ETS. - # # Weights: share of national jurisdiction total CO2 emissions. - # 'ecp_all_supraco2_kusd': 'price_with_tax_or_ets_weighted_by_share_of_country_co2', - # # Emissions-weighted average price on emissions covered by either a carbon tax or an ETS. - # # Weights: share of national jurisdiction total GHG emissions. - # 'ecp_all_supraghg_kusd': 'price_with_tax_or_ets_weighted_by_share_of_country_ghg', - # # Emissions-weighted average price on emissions covered by an ETS. - # # Weights: share of national jurisdiction total CO2 emissions. - # 'ecp_ets_supraco2_kusd': 'price_with_ets_weighted_by_share_of_country_co2', - # # Emissions-weighted average price on emissions covered by an ETS. - # # Weights: share of national jurisdiction total GHG emissions. - # 'ecp_ets_supraghg_kusd': 'price_with_ets_weighted_by_share_of_country_ghg', - # # Emissions-weighted average price on emissions covered by a carbon tax. - # # Weights: share of national jurisdiction total CO2 emissions. - # 'ecp_tax_supraco2_kusd': 'price_with_tax_weighted_by_share_of_country_co2', - # # Emissions-weighted average price on emissions covered by a carbon tax. - # # Weights: share of national jurisdiction total GHG emissions. - # 'ecp_tax_supraghg_kusd': 'price_with_tax_weighted_by_share_of_country_ghg', - # # CO2 emissions covered by either carbon taxes or an ETS as a share of national jurisdiction CO2 emissions. - # 'cov_all_co2_supraco2': 'co2_with_tax_or_ets_as_share_of_country_co2', - # # CO2 emissions covered by either carbon taxes or an ETS as a share of national jurisdiction GHG emissions. 
- # 'cov_all_co2_supraghg': 'co2_with_tax_or_ets_as_share_of_country_ghg', - # # CO2 emissions covered by an ETS as a share of national jurisdiction total CO2 emissions. - # 'cov_ets_co2_supraco2': 'co2_with_ets_as_share_of_country_co2', - # # CO2 emissions covered by an ETS as a share of national jurisdiction total GHG emissions. - # 'cov_ets_co2_supraghg': 'co2_with_ets_as_share_of_country_ghg', - # # CO2 emissions covered by a carbon tax as a share of national jurisdiction total CO2 emissions. - # 'cov_tax_co2_supraco2': 'co2_with_tax_as_share_of_country_co2', - # # CO2 emissions covered by a carbon tax as a share of national jurisdiction total GHG emissions. - # 'cov_tax_co2_supraghg': 'co2_with_tax_as_share_of_country_ghg', -} - - -def sanity_checks(df_economy: pd.DataFrame, df_coverage: pd.DataFrame) -> None: - """Sanity checks on the raw data from meadow. - - Parameters - ---------- - df_economy : pd.DataFrame - Raw data from meadow on prices. - df_coverage : pd.DataFrame - Raw data from meadow on coverage. - - """ - error = "Both dataframes were expected to have the same jurisdictions (although this may not be necessary)." - assert set(df_economy["jurisdiction"]) == set(df_coverage["jurisdiction"]), error - error = "Coverage should have the same (or less) years than economy (current year may be missing in coverage)." - assert set(df_coverage["year"]) <= set(df_economy["year"]), error - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read datasets from meadow. - ds_economy = Dataset(MEADOW_PATH_ECONOMY) - ds_coverage = Dataset(MEADOW_PATH_COVERAGE) - # Get tables from datasets. - tb_economy = ds_economy[ds_economy.table_names[0]] - tb_coverage = ds_coverage[ds_coverage.table_names[0]] - # Create dataframes from tables. - df_economy = pd.DataFrame(tb_economy).reset_index() - df_coverage = pd.DataFrame(tb_coverage).reset_index() - - # - # Process data. - # - # Sanity checks on raw data. - sanity_checks(df_economy=df_economy, df_coverage=df_coverage) - - # Convert all values in coverage to percentages (instead of fractions). - df_coverage.loc[:, [column for column in df_coverage.columns if column not in ["jurisdiction", "year"]]] *= 100 - - # Combine both dataframes. - df_combined = pd.merge(df_economy, df_coverage, how="outer", on=["jurisdiction", "year"]) - - # Select and rename columns. - df_combined = df_combined[list(COLUMNS)].rename(columns=COLUMNS, errors="raise") - - ##################################### - - # Temporary patch. - # For some reason, "World" has empty "co2_with_ets_as_share_of_co2" and "co2_with_tax_as_share_of_co2", - # but not empty "co2_with_tax_or_ets_as_share_of_co2"; on the other hand, it has non-empty - # "co2_with_ets_as_share_of_world_co2" and "co2_with_tax_as_share_of_world_co2", and empty - # "co2_with_tax_or_ets_as_share_of_world_co2". This seems arbitrary, and I think none should be empty (it may - # be a bug in the generation of the original dataset). - # For now, simply ensure all "co2_with_*_as_share_of_*_co2" for "World" are not empty (i.e. treat "World" as just - # another country). 
- map_missing_variables_for_world = { - "co2_with_ets_as_share_of_co2": "co2_with_ets_as_share_of_world_co2", - "co2_with_tax_as_share_of_co2": "co2_with_tax_as_share_of_world_co2", - "co2_with_tax_or_ets_as_share_of_world_co2": "co2_with_tax_or_ets_as_share_of_co2", - } - for old_variable, new_variable in map_missing_variables_for_world.items(): - empty_ets_for_world_mask = (df_combined["country"] == "World") & (df_combined[old_variable].isnull()) - df_combined.loc[empty_ets_for_world_mask, old_variable] = df_combined[empty_ets_for_world_mask][new_variable] - - ##################################### - - # Harmonize country names. - # Notes: - # * Here it would be better to have a list of excluded countries, but this is not yet implemented - # in harmonize_countries. For example, if a new country is included, it will be ignored here - # (while instead it should raise a warning). - df_combined = geo.harmonize_countries( - df=df_combined, - countries_file=str(COUNTRIES_PATH), - warn_on_unused_countries=False, - warn_on_missing_countries=False, - make_missing_countries_nan=True, - ) - - # Remove sub-regions within a country. - df_combined = df_combined.dropna(subset=["country"]).reset_index(drop=True) - - # Sanity checks. - error = "There should be no columns with only nans." - assert df_combined.columns[df_combined.isna().all()].empty, error - error = f"Country named 'World' should be included in the countries file {COUNTRIES_PATH.name}." - assert "World" in set(df_combined["country"]), error - - # Set an appropriate index and sort conveniently. - df_combined = df_combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Create main table. - tb_garden = underscore_table(Table(df_combined)) - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = Dataset.create_empty(dest_dir) - # Fetch metadata from any of the meadow steps (if any). - ds_garden.metadata = ds_economy.metadata - # Update dataset metadata using metadata yaml file. - ds_garden.metadata.update_from_yaml(METADATA_PATH, if_source_exists="replace") - # Update main table metadata using metadata yaml file. - tb_garden.update_metadata_from_yaml(METADATA_PATH, TABLE_NAME) - # Add tables to dataset. - ds_garden.add(tb_garden) - # Save dataset. 
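# Illustrative sketch (not part of the deleted step): a toy version of the
# harmonize-then-dropna trick above that strips sub-national jurisdictions;
# geo.harmonize_countries is approximated with a plain mapping.

import pandas as pd

df = pd.DataFrame({"country": ["United States", "California", "World"], "year": [2020, 2020, 2020]})
known = {"United States": "United States", "World": "World"}

# make_missing_countries_nan=True maps unknown jurisdictions to NaN...
df["country"] = df["country"].map(known)

# ...so dropping those rows removes sub-national regions such as "California".
df = df.dropna(subset=["country"]).reset_index(drop=True)
assert list(df["country"]) == ["United States", "World"]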
- ds_garden.save() diff --git a/etl/steps/archive/garden/rff/2022-09-14/shared.py b/etl/steps/archive/garden/rff/2022-09-14/shared.py deleted file mode 100644 index 7e7f4d18c5b..00000000000 --- a/etl/steps/archive/garden/rff/2022-09-14/shared.py +++ /dev/null @@ -1,3 +0,0 @@ -from pathlib import Path - -CURRENT_DIR = Path(__file__).parent diff --git a/etl/steps/archive/garden/rff/2022-09-14/sub_national_jurisdictions.json b/etl/steps/archive/garden/rff/2022-09-14/sub_national_jurisdictions.json deleted file mode 100644 index a4f5356ec3f..00000000000 --- a/etl/steps/archive/garden/rff/2022-09-14/sub_national_jurisdictions.json +++ /dev/null @@ -1,187 +0,0 @@ -{ - "Canada": [ - "Alberta", - "British Columbia", - "Manitoba", - "New Brunswick", - "Newfoundland and Labrador", - "Northwest Territories", - "Nova Scotia", - "Nunavut", - "Ontario", - "Prince Edward Island", - "Quebec", - "Saskatchewan", - "Yukon" - ], - "China": [ - "Anhui Province", - "Beijing Municipality", - "Chongqing Municipality", - "Fujian Province", - "Gansu Province", - "Guangdong Province", - "Guangxi Zhuang Autonomous Region", - "Guizhou Province", - "Hainan Province", - "Hebei Province", - "Heilongjiang Province", - "Henan Province", - "Hong Kong Special Administrative Region", - "Hubei Province", - "Hunan Province", - "Inner Mongolia Autonomous Region", - "Jiangsu Province", - "Jiangxi Province", - "Jilin Province", - "Liaoning Province", - "Macau Special Administrative Region", - "Ningxia Hui Autonomous Region", - "Qinghai Province", - "Shaanxi Province", - "Shandong Province", - "Shanghai Municipality", - "Shanxi Province", - "Shenzhen", - "Sichuan Province", - "Tianjin Municipality", - "Tibet Autonomous Region", - "Xinjiang Uyghur Autonomous Region", - "Yunnan Province", - "Zhejiang Province" - ], - "Japan": [ - "Aichi", - "Akita", - "Aomori", - "Chiba", - "Ehime", - "Fukui", - "Fukuoka", - "Fukushima", - "Gifu", - "Gunma", - "Hiroshima", - "Hokkaido", - "Hyogo", - "Ibaraki", - "Ishikawa", - "Iwate", - "Kagawa", - "Kagoshima", - "Kanagawa", - "Kochi", - "Kumamoto", - "Kyoto", - "Mie", - "Miyagi", - "Miyazaki", - "Nagano", - "Nagasaki", - "Nara", - "Niigata", - "Oita", - "Okayama", - "Okinawa", - "Osaka", - "Saga", - "Saitama", - "Shiga", - "Shimane", - "Shizuoka", - "Tochigi", - "Tokushima", - "Tokyo", - "Tottori", - "Toyama", - "Wakayama", - "Yamagata", - "Yamaguchi", - "Yamanashi" - ], - "United States": [ - "Alabama", - "Alaska", - "Arizona", - "Arkansas", - "California", - "Colorado", - "Connecticut", - "Delaware", - "Florida", - "Georgia_US", - "Hawaii", - "Idaho", - "Illinois", - "Indiana", - "Iowa", - "Kansas", - "Kentucky", - "Louisiana", - "Maine", - "Maryland", - "Massachusetts", - "Michigan", - "Minnesota", - "Mississippi", - "Missouri", - "Montana", - "Nebraska", - "Nevada", - "New Hampshire", - "New Jersey", - "New Mexico", - "New York", - "North Carolina", - "North Dakota", - "Ohio", - "Oklahoma", - "Oregon", - "Pennsylvania", - "Rhode Island", - "South Carolina", - "South Dakota", - "Tennessee", - "Texas", - "Utah", - "Vermont", - "Virginia", - "Washington", - "West Virginia", - "Wisconsin", - "Wyoming" - ], - "Mexico": [ - "Aguascalientes", - "Baja California", - "Baja California Sur", - "Campeche", - "Chiapas", - "Chihuahua", - "Coahuila de Zaragoza", - "Colima", - "Durango", - "Guanajuato", - "Guerrero", - "Hidalgo", - "Jalisco", - "Mexico State", - "Ciudad de Mexico", - "Michoacan de Ocampo", - "Morelos", - "Nayarit", - "Nuevo Leon", - "Oaxaca", - "Puebla", - "Queretaro de Arteaga", - "Quintana 
Roo", - "San Luis Potosi", - "Sinaloa", - "Sonora", - "Tabasco", - "Tamaulipas", - "Tlaxcala", - "Veracruz de Ignacio de la Llave", - "Yucatan" - ] -} diff --git a/etl/steps/archive/garden/rff/2022-09-14/world_carbon_pricing.countries.json b/etl/steps/archive/garden/rff/2022-09-14/world_carbon_pricing.countries.json deleted file mode 100644 index e11cfbeaee8..00000000000 --- a/etl/steps/archive/garden/rff/2022-09-14/world_carbon_pricing.countries.json +++ /dev/null @@ -1,203 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "Andorra": "Andorra", - "Angola": "Angola", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas, The": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo, Dem. Rep.": "Democratic Republic of Congo", - "Congo, Rep.": "Congo", - "Costa Rica": "Costa Rica", - "Cote d'Ivoire": "Cote d'Ivoire", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cyprus": "Cyprus", - "Czech Republic": "Czechia", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt, Arab Rep.": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Ethiopia": "Ethiopia", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "Gabon": "Gabon", - "Gambia, The": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Greece": "Greece", - "Grenada": "Grenada", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hong Kong SAR, China": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran, Islamic Rep.": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Korea, Rep.": "South Korea", - "Kosovo": "Kosovo", - "Kuwait": "Kuwait", - "Kyrgyz Republic": "Kyrgyzstan", - "Lao PDR": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Liechtenstein": "Liechtenstein", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macao SAR, China": "Macao", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mexico": "Mexico", - "Moldova": "Moldova", - "Monaco": "Monaco", 
- "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Romania": "Romania", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "Samoa": "Samoa", - "San Marino": "San Marino", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Slovak Republic": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "St. Kitts and Nevis": "Saint Kitts and Nevis", - "St. Lucia": "Saint Lucia", - "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Swaziland": "Eswatini", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syrian Arab Republic": "Syria", - "Tajikistan": "Tajikistan", - "Tanzania": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Tuvalu": "Tuvalu", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United States": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela, RB": "Venezuela", - "Vietnam": "Vietnam", - "Western Sahara": "Western Sahara", - "Yemen, Rep.": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "Federated States of Micronesia": "Micronesia (country)", - "Korea, Dem. Rep.": "North Korea", - "Taiwan, China": "Taiwan", - "Vatican City": "Vatican", - "West Bank and Gaza": "Palestine" -} diff --git a/etl/steps/archive/garden/rff/2022-09-14/world_carbon_pricing.meta.yml b/etl/steps/archive/garden/rff/2022-09-14/world_carbon_pricing.meta.yml deleted file mode 100644 index 1b2c2de0b35..00000000000 --- a/etl/steps/archive/garden/rff/2022-09-14/world_carbon_pricing.meta.yml +++ /dev/null @@ -1,65 +0,0 @@ -dataset: - description: | - The World Carbon Pricing Database covers national and subnational economic mechanisms relating to carbon emissions from 1990 onwards. It was developed from several key sources: most notably, policy documents from countries and regions themselves. Secondly, from other sources such as the International Carbon Action Partnership. - - The dataset primarily focuses on economic instruments targeting carbon dioxide (CO2) emissions. In some cases these instruments also cover other greenhouse gases. However, any pricing mechanisms that target non-CO2 gases (such as methane or nitrouse oxide) specifically, are not included. 
- - A country is considered to have a carbon tax or emissions trading system if at least one IPCC sector or gas is covered by the instrument. These instruments do not need to cover all sectors within the economy for this to apply. - - sources: - - - name: Dolphin and Xiahou (2022). World carbon pricing database. - published_by: "Dolphin, G., Xiahou, Q. World carbon pricing database: sources and methods. Sci Data 9, 573 (2022)." - publication_year: 2022 - publication_date: 2022-09-14 - url: https://github.com/g-dolphin/WorldCarbonPricingDatabase - -tables: - world_carbon_pricing: - title: World carbon pricing - variables: - ets: - title: Covered by emissions trading system - unit: "" - description: A binary value indicating whether the sector(-fuel) is covered by at least one emissions trading system. - ets_price: - title: ETS price - short_unit: current LCU / t CO2e - unit: Current local currency unit per ton of CO2 equivalent - description: Price of an emissions allowance in current local currency unit per ton of CO2 equivalent. - display: - numDecimalPlaces: 2 - sector_name: - title: IPCC sector name. - unit: "" - description: IPCC sector name. - tax: - title: Covered by tax instrument - unit: "" - description: A binary value indicating whether the sector(-fuel) is covered by at least one tax instrument. - tax_rate_gross: - title: Gross tax rate - short_unit: current LCU / t CO2e - unit: Current local currency unit per ton of CO2 equivalent - description: Tax rate in current local currency unit (LCU) per ton of CO2 equivalent. - display: - numDecimalPlaces: 2 - tax_rate_net: - title: Net tax rate - short_unit: current LCU / t CO2e - unit: Current local currency unit per ton of CO2 equivalent - description: Net tax rate (accounting for exemption) in current LCU per ton of CO2 equivalent. - display: - numDecimalPlaces: 2 - world_carbon_pricing_any_sector: - title: World carbon pricing for any sector - variables: - ets: - title: Covered by emissions trading system in at least one sector - unit: "" - description: This variable indicates whether at least one sector(-fuel) is covered by at least one emissions trading system at the national level, or only at the sub-national level, or whether no sector is covered. - tax: - title: Covered by tax instrument in at least one sector - unit: "" - description: This variable indicates whether at least one sector(-fuel) is covered by at least one carbon tax at the national level, or only at the sub-national level, or whether no sector is covered. - diff --git a/etl/steps/archive/garden/rff/2022-09-14/world_carbon_pricing.py b/etl/steps/archive/garden/rff/2022-09-14/world_carbon_pricing.py deleted file mode 100644 index eb958690f63..00000000000 --- a/etl/steps/archive/garden/rff/2022-09-14/world_carbon_pricing.py +++ /dev/null @@ -1,321 +0,0 @@ -from typing import Dict, List, cast - -import pandas as pd -from owid.catalog import Dataset, Table -from owid.datautils import dataframes, io -from shared import CURRENT_DIR - -from etl.data_helpers import geo -from etl.helpers import PathFinder -from etl.paths import DATA_DIR, STEP_DIR - -# Details of the input dataset. -MEADOW_DATASET_NAME = "world_carbon_pricing" -MEADOW_VERSION = "2022-09-14" -MEADOW_MAIN_DATASET_PATH = DATA_DIR / f"meadow/rff/{MEADOW_VERSION}/{MEADOW_DATASET_NAME}" -MEADOW_SUBNATIONAL_DATASET_PATH = DATA_DIR / "meadow/rff/2022-09-14/world_carbon_pricing__subnational" -# Details of the output tables.
-GARDEN_MAIN_TABLE_NAME = MEADOW_DATASET_NAME -GARDEN_VERSION = MEADOW_VERSION -GARDEN_ANY_SECTOR_TABLE_NAME = "world_carbon_pricing_any_sector" -# Get naming convention. -N = PathFinder(str(CURRENT_DIR / MEADOW_DATASET_NAME)) - -# Labels for the variables showing whether any sector is covered by an ETS or a carbon tax at the national or only -# sub-national level. -LABEL_ETS_NOT_COVERED = "No ETS" -LABEL_ETS_COVERED = "Has an ETS" -LABEL_ETS_COVERED_ONLY_SUBNATIONAL = "Has an ETS only at a sub-national level" -LABEL_TAX_NOT_COVERED = "No carbon tax" -LABEL_TAX_COVERED = "Has a carbon tax" -LABEL_TAX_COVERED_ONLY_SUBNATIONAL = "Has a carbon tax only at a sub-national level" -# If a country-year has both national and subnational coverage, mention only the national and ignore subnational. -LABEL_ETS_COVERED_NATIONAL_AND_SUBNATIONAL = "Has an ETS" -LABEL_TAX_COVERED_NATIONAL_AND_SUBNATIONAL = "Has a carbon tax" - -# Columns to keep from raw dataset and how to rename them. -COLUMNS = { - "jurisdiction": "country", - "year": "year", - "ipcc_code": "ipcc_code", - "product": "product", - "sector_name": "sector_name", - "tax": "tax", - "ets": "ets", - "tax_rate_excl_ex_clcu": "tax_rate_gross", - "tax_rate_incl_ex_clcu": "tax_rate_net", - "ets_price": "ets_price", -} - -# Columns to use as index in main table. -INDEX_COLUMNS = ["country", "year", "ipcc_code", "product"] -# Columns to use as index in table simplified to show whether there is coverage for any sector. -INDEX_COLUMNS_ANY_SECTOR = ["country", "year"] - -# Mapping of countries and the regions of the country included in the sub-national dataset. -# In the future, it would be good to load this mapping as additional data (however, the mapping is hardcoded in the -# original repository, so it's not trivial to get this mapping automatically). -COUNTRY_MEMBERS_FILE = STEP_DIR / f"data/garden/rff/{GARDEN_VERSION}/sub_national_jurisdictions.json" - - -def sanity_checks(df: pd.DataFrame) -> None: - """Sanity checks on the raw data. - - Parameters - ---------- - df : pd.DataFrame - Raw data from meadow. - - """ - column_checks = ( - df.groupby("jurisdiction") - .agg( - { - # Columns 'tax' and 'ets' must contain only 0 and/or 1. - "tax": lambda x: set(x) <= {0, 1}, - "ets": lambda x: set(x) <= {0, 1}, - } - ) - .all() - ) - # Column 'tax_id' is either nan or has one value, which is the ISO code of the country followed by "tax" - # (e.g. 'aus_tax'). However, there is at least one exception: Norway has 'nor_tax_I', so maybe the data is - # expected to have more than one 'tax_id'. - - # Similarly, 'ets_id' is either nan, or usually just one value, e.g. "eu_ets" for EU countries, or "nzl_ets", - # "mex_ets", etc. However, for the UK there are two, namely {'gbr_ets', 'eu_ets'}. - - error = f"Unexpected content in columns {column_checks[~column_checks].index.tolist()}." - assert column_checks.all(), error - - -def prepare_data(df: pd.DataFrame) -> pd.DataFrame: - """Prepare data. - - Parameters - ---------- - df : pd.DataFrame - Raw data. - - Returns - ------- - df : pd.DataFrame - Clean data. - - """ - df = df.copy() - - # Select and rename columns. - df = df[list(COLUMNS)].rename(columns=COLUMNS, errors="raise") - - # Column 'product' has many nans. Convert them into empty strings. - df["product"] = df["product"].cat.add_categories("").fillna("") - - return df - - -def get_coverage_for_any_sector(df: pd.DataFrame) -> pd.DataFrame: - """Create a dataframe showing whether a country has any sector covered by an ets/carbon tax.
- - Parameters - ---------- - df : pd.DataFrame - Original national or sub-national data, disaggregated by sector. - - Returns - ------- - df_any_sector : pd.DataFrame - Coverage for any sector. - - """ - # Create a simplified dataframe that gives, for each country and year, whether the country has any sector(-fuel) - # that is covered by at least one tax instrument. And idem for ets. - df_any_sector = ( - df.reset_index() - .groupby(["country", "year"], observed=True) - .agg({"ets": lambda x: min(x.sum(), 1), "tax": lambda x: min(x.sum(), 1)}) - .astype(int) - .reset_index() - ) - - return df_any_sector - - -def prepare_subnational_data(df_subnational: pd.DataFrame, country_members: Dict[str, List[str]]) -> pd.DataFrame: - """Create a dataframe showing whether a country has any sub-national jurisdiction for which any sector is covered by - an ets/carbon tax. - - The 'country' column of this dataframe does not need to be harmonized, since we are mapping the original - sub-national jurisdiction names to the harmonized name of the country. - - Parameters - ---------- - df_subnational : pd.DataFrame - Sub-national data, disaggregated by sector. - - Returns - ------- - pd.DataFrame - Processed sub-national data. - - """ - # Prepare subnational data. - df_subnational = prepare_data(df_subnational) - # Map subnational regions to their corresponding country. - subregions_to_country = { - subregion: country for country in list(country_members) for subregion in country_members[country] - } - df_subnational["country"] = dataframes.map_series( - series=df_subnational["country"], - mapping=subregions_to_country, - warn_on_missing_mappings=True, - warn_on_unused_mappings=True, - ) - # Get coverage of "any sector", where we only care about having at least one sector covered by carbon tax/ets. - df_subnational = get_coverage_for_any_sector(df=df_subnational) - - return df_subnational - - -def combine_national_and_subnational_data( - df_any_sector_national: pd.DataFrame, df_any_sector_subnational: pd.DataFrame -) -> pd.DataFrame: - """Combine national and sub-national data on whether countries have any sector covered by a tax instrument. - - The returned dataframe will have the following labels: - * Whether a country-year has no sector covered. - * Whether a country-year has at least one sector covered at the national level. - * Whether a country-year has at least one sector in one sub-national jurisdiction covered, but no sector covered at - the national level. - * Whether a country-year has at least one sector in both a sub-national and the national jurisdiction covered. - However, for now we disregard this option, by using the same label as for only national coverage. - - Parameters - ---------- - df_any_sector_national : pd.DataFrame - National data on whether countries have any sector covered by a tax instrument. - df_any_sector_subnational : pd.DataFrame - Sub-national data on whether countries have any sector covered by a tax instrument. - - Returns - ------- - df_any_sector : pd.DataFrame - Combined dataframe showing whether a country has at least one sector covered by a tax instrument at a national - level, or only at the sub-national level, or not at all. - - """ - # Combine national and subnational data. - df_any_sector = pd.merge( - df_any_sector_national, - df_any_sector_subnational, - on=["country", "year"], - how="left", - suffixes=("_national", "_subnational"), - ).fillna(0) - - # Create two new columns ets and tax, that are: - # * 0 if no ets/tax exists. 
- # * 1 if there is a national ets/tax and not a subnational ets/tax. - # * 2 if there is a subnational ets/tax and not a national ets/tax. - # * 3 if there are both a national and a subnational ets/tax. - df_any_sector = df_any_sector.assign( - **{ - "ets": df_any_sector["ets_national"] + 2 * df_any_sector["ets_subnational"], - "tax": df_any_sector["tax_national"] + 2 * df_any_sector["tax_subnational"], - } - )[["country", "year", "ets", "tax"]] - - # Now replace 0, 1, 2, and 3 by their corresponding labels. - ets_mapping = { - 0: LABEL_ETS_NOT_COVERED, - 1: LABEL_ETS_COVERED, - 2: LABEL_ETS_COVERED_ONLY_SUBNATIONAL, - 3: LABEL_ETS_COVERED_NATIONAL_AND_SUBNATIONAL, - } - tax_mapping = { - 0: LABEL_TAX_NOT_COVERED, - 1: LABEL_TAX_COVERED, - 2: LABEL_TAX_COVERED_ONLY_SUBNATIONAL, - 3: LABEL_TAX_COVERED_NATIONAL_AND_SUBNATIONAL, - } - df_any_sector["ets"] = dataframes.map_series(series=df_any_sector["ets"], mapping=ets_mapping) - df_any_sector["tax"] = dataframes.map_series(series=df_any_sector["tax"], mapping=tax_mapping) - - return cast(pd.DataFrame, df_any_sector) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read main dataset from meadow. - ds_meadow = Dataset(MEADOW_MAIN_DATASET_PATH) - - # Read subnational dataset from meadow. - ds_meadow_subnational = Dataset(MEADOW_SUBNATIONAL_DATASET_PATH) - - # Get main table from dataset. - tb_meadow = ds_meadow[ds_meadow.table_names[0]] - - # Get table for subnational data from dataset. - tb_meadow_subnational = ds_meadow_subnational[ds_meadow_subnational.table_names[0]] - - # Construct a dataframe from the main table. - df = pd.DataFrame(tb_meadow) - - # Construct a dataframe for subnational data. - df_subnational = pd.DataFrame(tb_meadow_subnational) - - # Load dictionary mapping sub-national jurisdictions to their countries. - country_members = io.load_json(COUNTRY_MEMBERS_FILE) - - # - # Process data. - # - # Sanity checks on raw data. - sanity_checks(df=df) - sanity_checks(df=df_subnational) - - # Prepare data. - df = prepare_data(df=df) - - # Harmonize country names. - df = geo.harmonize_countries(df=df, countries_file=str(N.country_mapping_path), warn_on_unused_countries=False) - - # Create a simplified table for "any sector" of national data. - df_any_sector_national = get_coverage_for_any_sector(df=df) - - # Create a simplified dataframe with the coverage for "any sector" of subnational data. - df_any_sector_subnational = prepare_subnational_data(df_subnational=df_subnational, country_members=country_members) - - # Combine national and subnational data. - df_any_sector = combine_national_and_subnational_data( - df_any_sector_national=df_any_sector_national, df_any_sector_subnational=df_any_sector_subnational - ) - - # Prepare output tables. - tb = ( - Table(df, short_name=GARDEN_MAIN_TABLE_NAME, underscore=True) - .set_index(INDEX_COLUMNS, verify_integrity=True) - .sort_index() - .sort_index(axis=1) - ) - tb_any_sector = ( - Table(df_any_sector, short_name=GARDEN_ANY_SECTOR_TABLE_NAME, underscore=True) - .set_index(INDEX_COLUMNS_ANY_SECTOR, verify_integrity=True) - .sort_index() - .sort_index(axis=1) - ) - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = Dataset.create_empty(dest_dir, metadata=ds_meadow.metadata) - - ds_garden.add(tb) - ds_garden.add(tb_any_sector) - ds_garden.update_metadata(N.metadata_path) - - # Save dataset. 
- ds_garden.save() diff --git a/etl/steps/archive/garden/rff/2022-10-11/emissions_weighted_carbon_price.countries.json b/etl/steps/archive/garden/rff/2022-10-11/emissions_weighted_carbon_price.countries.json deleted file mode 100644 index 1574c947c15..00000000000 --- a/etl/steps/archive/garden/rff/2022-10-11/emissions_weighted_carbon_price.countries.json +++ /dev/null @@ -1,204 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "Andorra": "Andorra", - "Angola": "Angola", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas, The": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo, Dem. Rep.": "Democratic Republic of Congo", - "Congo, Rep.": "Congo", - "Costa Rica": "Costa Rica", - "Cote d'Ivoire": "Cote d'Ivoire", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cyprus": "Cyprus", - "Czech Republic": "Czechia", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt, Arab Rep.": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Ethiopia": "Ethiopia", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "Gabon": "Gabon", - "Gambia, The": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Greece": "Greece", - "Grenada": "Grenada", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hong Kong SAR, China": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran, Islamic Rep.": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Korea, Rep.": "South Korea", - "Kosovo": "Kosovo", - "Kuwait": "Kuwait", - "Kyrgyz Republic": "Kyrgyzstan", - "Lao PDR": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Liechtenstein": "Liechtenstein", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macao SAR, China": "Macao", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mexico": "Mexico", - "Moldova": "Moldova", - "Monaco": "Monaco", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Morocco": "Morocco", - "Mozambique": 
"Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Romania": "Romania", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "Samoa": "Samoa", - "San Marino": "San Marino", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Slovak Republic": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "St. Kitts and Nevis": "Saint Kitts and Nevis", - "St. Lucia": "Saint Lucia", - "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Swaziland": "Eswatini", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syrian Arab Republic": "Syria", - "Tajikistan": "Tajikistan", - "Tanzania": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Tuvalu": "Tuvalu", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United States": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela, RB": "Venezuela", - "Vietnam": "Vietnam", - "Western Sahara": "Western Sahara", - "World": "World", - "Yemen, Rep.": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "Federated States of Micronesia": "Micronesia (country)", - "Korea, Dem. Rep.": "North Korea", - "Taiwan, China": "Taiwan", - "Vatican City": "Vatican", - "West Bank and Gaza": "Palestine" -} diff --git a/etl/steps/archive/garden/rff/2022-10-11/emissions_weighted_carbon_price.meta.yml b/etl/steps/archive/garden/rff/2022-10-11/emissions_weighted_carbon_price.meta.yml deleted file mode 100644 index f60c3b175f8..00000000000 --- a/etl/steps/archive/garden/rff/2022-10-11/emissions_weighted_carbon_price.meta.yml +++ /dev/null @@ -1,137 +0,0 @@ -dataset: - short_name: emissions_weighted_carbon_price - title: Emissions-weighted carbon price - description: | - The World Carbon Pricing Database covers national and subnational economic mechanisms relating to carbon emissions from 1990 onwards. It was developed from several key sources: most notably, policy documents from countries and regions themselves. Secondly, from other sources such as the International Carbon Action Partnership. - - The dataset primarily focuses on economic instruments targeting carbon dioxide (CO2) emissions. In some cases these instruments also cover other greenhouse gases. However, any pricing mechanisms that target non-CO2 gases (such as methane or nitrous oxide) specifically, are not included. 
- - A country is considered to have a carbon tax or emissions trading system if at least one IPCC sector or gas is covered by the instrument. These instruments do not need to cover all sectors within the economy for this to apply. - - For each country, researchers calculate an emissions-weighted carbon price for the economy. To do this, they rely on two metrics: - - - Carbon prices applied at the sectoral level (e.g. electricity, or road transport) - - Each sector’s contribution to a country’s CO2 emissions (e.g. what percentage of a country’s emissions come from electricity, or road transport) - They then weight each sector’s carbon price by the relevant sector’s contribution to CO2 emissions, and aggregate these figures to get an economy-wide weighted carbon price. - - A full technical note on this methodology is provided by the authors [here](https://www.rff.org/publications/working-papers/emissions-weighted-carbon-price-sources-and-methods/). - sources: - - name: Dolphin, Pollitt and Newbery (2020). Emissions-weighted Carbon Price. - published_by: >- - Dolphin, G., Pollitt, M. and Newbery, D. 2020. The political economy of carbon pricing: a panel analysis. - Oxford Economic Papers 72(2): 472-500. - publication_year: 2022 - publication_date: 2022-01-18 - url: https://github.com/g-dolphin/ECP -tables: - emissions_weighted_carbon_price: - title: Emissions-weighted carbon price - variables: - co2_with_ets_as_share_of_co2: - title: CO2 emissions covered by an ETS as a share of the country's CO2 emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_ets_as_share_of_ghg: - title: CO2 emissions covered by an ETS as a share of the country's GHG emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_ets_as_share_of_world_co2: - title: CO2 emissions covered by an ETS as a share of the world's CO2 emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_ets_as_share_of_world_ghg: - title: CO2 emissions covered by an ETS as a share of the world's GHG emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_tax_as_share_of_co2: - title: CO2 emissions covered by a carbon tax as a share of the country's CO2 emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_tax_as_share_of_ghg: - title: CO2 emissions covered by a carbon tax as a share of the country's GHG emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_tax_as_share_of_world_co2: - title: CO2 emissions covered by a carbon tax as a share of the world's CO2 emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_tax_as_share_of_world_ghg: - title: CO2 emissions covered by a carbon tax as a share of the world's GHG emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_tax_or_ets_as_share_of_co2: - title: CO2 emissions covered by a carbon tax or an ETS as a share of the country's CO2 emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_tax_or_ets_as_share_of_ghg: - title: CO2 emissions covered by a carbon tax or an ETS as a share of the country's GHG emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_tax_or_ets_as_share_of_world_co2: - title: CO2 emissions covered by a carbon tax or an ETS as a share of the world's CO2 emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - co2_with_tax_or_ets_as_share_of_world_ghg: - title: CO2 emissions 
covered by a carbon tax or an ETS as a share of the world's GHG emissions - unit: '%' - short_unit: '%' - display: - numDecimalPlaces: 2 - price_with_ets_weighted_by_share_of_co2: - title: Average price on emissions covered by an ETS, weighted by the share of the country's CO2 emissions - unit: 2019 US$ per tonne of carbon dioxide equivalents - short_unit: 2019 US$/ tCO2e - display: - numDecimalPlaces: 2 - price_with_ets_weighted_by_share_of_ghg: - title: Average price on emissions covered by an ETS, weighted by the share of the country's GHG emissions - unit: 2019 US$ per tonne of carbon dioxide equivalents - short_unit: 2019 US$/ tCO2e - display: - numDecimalPlaces: 2 - price_with_tax_or_ets_weighted_by_share_of_co2: - title: Average price on emissions covered by a carbon tax or an ETS, weighted by the share of the country's CO2 emissions - unit: 2019 US$ per tonne of carbon dioxide equivalents - short_unit: 2019 US$/ tCO2e - display: - numDecimalPlaces: 2 - price_with_tax_or_ets_weighted_by_share_of_ghg: - title: Average price on emissions covered by a carbon tax or an ETS, weighted by the share of the country's GHG emissions - unit: 2019 US$ per tonne of carbon dioxide equivalents - short_unit: 2019 US$/ tCO2e - display: - numDecimalPlaces: 2 - price_with_tax_weighted_by_share_of_co2: - title: Average price on emissions covered by a carbon tax, weighted by the share of the country's CO2 emissions - unit: 2019 US$ per tonne of carbon dioxide equivalents - short_unit: 2019 US$/ tCO2e - display: - numDecimalPlaces: 2 - price_with_tax_weighted_by_share_of_ghg: - title: Average price on emissions covered by a carbon tax, weighted by the share of the country's GHG emissions - unit: 2019 US$ per tonne of carbon dioxide equivalents - short_unit: 2019 US$/ tCO2e - display: - numDecimalPlaces: 2 diff --git a/etl/steps/archive/garden/rff/2022-10-11/emissions_weighted_carbon_price.py b/etl/steps/archive/garden/rff/2022-10-11/emissions_weighted_carbon_price.py deleted file mode 100644 index 6c94268afc1..00000000000 --- a/etl/steps/archive/garden/rff/2022-10-11/emissions_weighted_carbon_price.py +++ /dev/null @@ -1,198 +0,0 @@ -"""Combine dataset on coverage of emissions with the average prices of emissions covered by an ETS or a carbon tax. - -""" - -import pandas as pd -from owid.catalog import Dataset, Table -from owid.catalog.utils import underscore_table -from shared import LAST_INFORMED_YEAR, MEADOW_VERSION, VERSION - -from etl.data_helpers import geo -from etl.paths import DATA_DIR, STEP_DIR - -# Details on garden dataset to be exported. -DATASET_NAME = "emissions_weighted_carbon_price" -TABLE_NAME = DATASET_NAME -# Path to country names file. -# NOTE: This countries file contains as many countries as the file for world_carbon_pricing, plus "World". -# Here we ignore all regions inside countries. -COUNTRIES_PATH = STEP_DIR / f"data/garden/rff/{VERSION}/{DATASET_NAME}.countries.json" -# Path to metadata file. -METADATA_PATH = STEP_DIR / f"data/garden/rff/{VERSION}/{DATASET_NAME}.meta.yml" -# Details on meadow datasets to be imported. -MEADOW_PATH_ECONOMY = DATA_DIR / f"meadow/rff/{MEADOW_VERSION}/emissions_weighted_carbon_price__economy" -MEADOW_PATH_COVERAGE = DATA_DIR / f"meadow/rff/{MEADOW_VERSION}/emissions_weighted_carbon_price__coverage" - -# Columns to keep from raw dataset and how to rename them. -COLUMNS = { - "jurisdiction": "country", - "year": "year", - # Emissions-weighted average price on emissions covered by either a carbon tax or an ETS.
- # Weights: share of jurisdiction total CO2 emissions. - "ecp_all_jurco2_kusd": "price_with_tax_or_ets_weighted_by_share_of_co2", - # Emissions-weighted average price on emissions covered by either a carbon tax or an ETS. - # Weights: share of jurisdiction total GHG emissions. - "ecp_all_jurghg_kusd": "price_with_tax_or_ets_weighted_by_share_of_ghg", - # Emissions-weighted average price on emissions covered by an ETS. - # Weights: share of jurisdiction total CO2 emissions. - "ecp_ets_jurco2_kusd": "price_with_ets_weighted_by_share_of_co2", - # Emissions-weighted average price on emissions covered by an ETS. - # Weights: share of jurisdiction total GHG emissions. - "ecp_ets_jurghg_kusd": "price_with_ets_weighted_by_share_of_ghg", - # Emissions-weighted average price on emissions covered by a carbon tax. - # Weights: share of jurisdiction total CO2 emissions. - "ecp_tax_jurco2_kusd": "price_with_tax_weighted_by_share_of_co2", - # Emissions-weighted average price on emissions covered by a carbon tax. - # Weights: share of jurisdiction total GHG emissions. - "ecp_tax_jurghg_kusd": "price_with_tax_weighted_by_share_of_ghg", - # CO2 emissions covered by either a carbon tax or an ETS as a share of jurisdiction total CO2 emissions. - "cov_all_co2_jurco2": "co2_with_tax_or_ets_as_share_of_co2", - # CO2 emissions covered by either a carbon tax or an ETS as a share of jurisdiction total GHG emissions. - "cov_all_co2_jurghg": "co2_with_tax_or_ets_as_share_of_ghg", - # CO2 emissions covered by either carbon taxes or an ETS as a share of world total CO2 emissions. - "cov_all_co2_wldco2": "co2_with_tax_or_ets_as_share_of_world_co2", - # CO2 emissions covered by either carbon taxes or an ETS as a share of world total GHG emissions. - "cov_all_co2_wldghg": "co2_with_tax_or_ets_as_share_of_world_ghg", - # CO2 emissions covered by an ETS as a share of jurisdiction total CO2 emissions. - "cov_ets_co2_jurco2": "co2_with_ets_as_share_of_co2", - # CO2 emissions covered by an ETS as a share of jurisdiction total GHG emissions. - "cov_ets_co2_jurghg": "co2_with_ets_as_share_of_ghg", - # CO2 emissions covered by an ETS as a share of world total CO2 emissions. - "cov_ets_co2_wldco2": "co2_with_ets_as_share_of_world_co2", - # CO2 emissions covered by an ETS as a share of world total GHG emissions. - "cov_ets_co2_wldghg": "co2_with_ets_as_share_of_world_ghg", - # CO2 emissions covered by a carbon tax as a share of jurisdiction total CO2 emissions. - "cov_tax_co2_jurco2": "co2_with_tax_as_share_of_co2", - # CO2 emissions covered by a carbon tax as a share of jurisdiction total GHG emissions. - "cov_tax_co2_jurghg": "co2_with_tax_as_share_of_ghg", - # CO2 emissions covered by a carbon tax as a share of world total CO2 emissions. - "cov_tax_co2_wldco2": "co2_with_tax_as_share_of_world_co2", - # CO2 emissions covered by a carbon tax as a share of world total GHG emissions. - "cov_tax_co2_wldghg": "co2_with_tax_as_share_of_world_ghg", - # # Other variables that are only relevant when considering sub-country regions (that we ignore for now): - # # Emissions-weighted average price on emissions covered by either a carbon tax or an ETS. - # # Weights: share of national jurisdiction total CO2 emissions. - # 'ecp_all_supraco2_kusd': 'price_with_tax_or_ets_weighted_by_share_of_country_co2', - # # Emissions-weighted average price on emissions covered by either a carbon tax or an ETS. - # # Weights: share of national jurisdiction total GHG emissions. 
- # 'ecp_all_supraghg_kusd': 'price_with_tax_or_ets_weighted_by_share_of_country_ghg', - # # Emissions-weighted average price on emissions covered by an ETS. - # # Weights: share of national jurisdiction total CO2 emissions. - # 'ecp_ets_supraco2_kusd': 'price_with_ets_weighted_by_share_of_country_co2', - # # Emissions-weighted average price on emissions covered by an ETS. - # # Weights: share of national jurisdiction total GHG emissions. - # 'ecp_ets_supraghg_kusd': 'price_with_ets_weighted_by_share_of_country_ghg', - # # Emissions-weighted average price on emissions covered by a carbon tax. - # # Weights: share of national jurisdiction total CO2 emissions. - # 'ecp_tax_supraco2_kusd': 'price_with_tax_weighted_by_share_of_country_co2', - # # Emissions-weighted average price on emissions covered by a carbon tax. - # # Weights: share of national jurisdiction total GHG emissions. - # 'ecp_tax_supraghg_kusd': 'price_with_tax_weighted_by_share_of_country_ghg', - # # CO2 emissions covered by either carbon taxes or an ETS as a share of national jurisdiction CO2 emissions. - # 'cov_all_co2_supraco2': 'co2_with_tax_or_ets_as_share_of_country_co2', - # # CO2 emissions covered by either carbon taxes or an ETS as a share of national jurisdiction GHG emissions. - # 'cov_all_co2_supraghg': 'co2_with_tax_or_ets_as_share_of_country_ghg', - # # CO2 emissions covered by an ETS as a share of national jurisdiction total CO2 emissions. - # 'cov_ets_co2_supraco2': 'co2_with_ets_as_share_of_country_co2', - # # CO2 emissions covered by an ETS as a share of national jurisdiction total GHG emissions. - # 'cov_ets_co2_supraghg': 'co2_with_ets_as_share_of_country_ghg', - # # CO2 emissions covered by a carbon tax as a share of national jurisdiction total CO2 emissions. - # 'cov_tax_co2_supraco2': 'co2_with_tax_as_share_of_country_co2', - # # CO2 emissions covered by a carbon tax as a share of national jurisdiction total GHG emissions. - # 'cov_tax_co2_supraghg': 'co2_with_tax_as_share_of_country_ghg', -} - - -def sanity_checks(df_economy: pd.DataFrame, df_coverage: pd.DataFrame) -> None: - """Sanity checks on the raw data from meadow. - - Parameters - ---------- - df_economy : pd.DataFrame - Raw data from meadow on prices. - df_coverage : pd.DataFrame - Raw data from meadow on coverage. - - """ - error = "Both dataframes were expected to have the same jurisdictions (although this may not be necessary)." - assert set(df_economy["jurisdiction"]) == set(df_coverage["jurisdiction"]), error - error = "Coverage should have the same (or fewer) years as economy (the current year may be missing in coverage)." - assert set(df_coverage["year"]) <= set(df_economy["year"]), error - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read datasets from meadow. - ds_economy = Dataset(MEADOW_PATH_ECONOMY) - ds_coverage = Dataset(MEADOW_PATH_COVERAGE) - # Get tables from datasets. - tb_economy = ds_economy[ds_economy.table_names[0]] - tb_coverage = ds_coverage[ds_coverage.table_names[0]] - # Create dataframes from tables. - df_economy = pd.DataFrame(tb_economy).reset_index() - df_coverage = pd.DataFrame(tb_coverage).reset_index() - - # - # Process data. - # - # Sanity checks on raw data. - sanity_checks(df_economy=df_economy, df_coverage=df_coverage) - - # Convert all values in coverage to percentages (instead of fractions). - df_coverage.loc[:, [column for column in df_coverage.columns if column not in ["jurisdiction", "year"]]] *= 100 - - # Combine both dataframes.
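The `ecp_*` columns above carry the emissions-weighted carbon price described in the dataset metadata: each sector's carbon price is weighted by that sector's share of the jurisdiction's emissions, and the weighted prices are summed. A tiny worked example of that arithmetic, with invented numbers rather than values from the dataset:

```python
# Hypothetical two-sector economy, for illustration only.
sectors = {
    # sector: (carbon price in US$/tCO2, share of the country's CO2 emissions)
    "electricity": (30.0, 0.4),  # 40% of emissions priced at $30/tCO2
    "road_transport": (0.0, 0.6),  # 60% of emissions face no carbon price
}

# Emissions-weighted carbon price: sum of price_i * share_i over sectors.
ecp = sum(price * share for price, share in sectors.values())
print(ecp)  # 12.0 -> economy-wide average price of $12/tCO2
```

A high price on a small slice of emissions therefore still yields a low economy-wide figure, which is why the coverage shares (`cov_*`) and the weighted prices are reported side by side.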
- df_combined = pd.merge(df_economy, df_coverage, how="outer", on=["jurisdiction", "year"]) - - # Select and rename columns. - df_combined = df_combined[list(COLUMNS)].rename(columns=COLUMNS, errors="raise") - - # Harmonize country names. - # Notes: - # * Here it would be better to have a list of excluded countries, but this is not yet implemented - # in harmonize_countries. For example, if a new country is included, it will be ignored here - # (while instead it should raise a warning). - df_combined = geo.harmonize_countries( - df=df_combined, - countries_file=str(COUNTRIES_PATH), - warn_on_unused_countries=False, - warn_on_missing_countries=False, - make_missing_countries_nan=True, - ) - - # Remove sub-regions within a country. - df_combined = df_combined.dropna(subset=["country"]).reset_index(drop=True) - - # Given that the most recent data is incomplete, keep only data points prior to (or at) a certain year - # (given by global variables LAST_INFORMED_YEAR). - df_combined = df_combined[df_combined["year"] <= LAST_INFORMED_YEAR].reset_index(drop=True) - - # Sanity checks. - error = "There should be no columns with only nans." - assert df_combined.columns[df_combined.isna().all()].empty, error - error = f"Country named 'World' should be included in the countries file {COUNTRIES_PATH.name}." - assert "World" in set(df_combined["country"]), error - - # Set an appropriate index and sort conveniently. - df_combined = df_combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Create main table. - tb_garden = underscore_table(Table(df_combined)) - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = Dataset.create_empty(dest_dir) - # Fetch metadata from any of the meadow steps (if any). - ds_garden.metadata = ds_economy.metadata - # Update dataset metadata using metadata yaml file. - ds_garden.metadata.update_from_yaml(METADATA_PATH, if_source_exists="replace") - # Update main table metadata using metadata yaml file. - tb_garden.update_metadata_from_yaml(METADATA_PATH, TABLE_NAME) - # Add tables to dataset. - ds_garden.add(tb_garden) - # Save dataset. - ds_garden.save() diff --git a/etl/steps/archive/garden/rff/2022-10-11/shared.py b/etl/steps/archive/garden/rff/2022-10-11/shared.py deleted file mode 100644 index 244e0ffa976..00000000000 --- a/etl/steps/archive/garden/rff/2022-10-11/shared.py +++ /dev/null @@ -1,10 +0,0 @@ -from pathlib import Path - -# Given that the most recent data may be incomplete, we will keep only data points prior to (or at) the following year. -LAST_INFORMED_YEAR = 2020 -# Directory for current version. -CURRENT_DIR = Path(__file__).parent -# Version of current garden datasets to be created. -VERSION = str(CURRENT_DIR.name) -# Version of meadow datasets to be imported. 
-MEADOW_VERSION = VERSION diff --git a/etl/steps/archive/garden/rff/2022-10-11/sub_national_jurisdictions.json b/etl/steps/archive/garden/rff/2022-10-11/sub_national_jurisdictions.json deleted file mode 100644 index a4f5356ec3f..00000000000 --- a/etl/steps/archive/garden/rff/2022-10-11/sub_national_jurisdictions.json +++ /dev/null @@ -1,187 +0,0 @@ -{ - "Canada": [ - "Alberta", - "British Columbia", - "Manitoba", - "New Brunswick", - "Newfoundland and Labrador", - "Northwest Territories", - "Nova Scotia", - "Nunavut", - "Ontario", - "Prince Edward Island", - "Quebec", - "Saskatchewan", - "Yukon" - ], - "China": [ - "Anhui Province", - "Beijing Municipality", - "Chongqing Municipality", - "Fujian Province", - "Gansu Province", - "Guangdong Province", - "Guangxi Zhuang Autonomous Region", - "Guizhou Province", - "Hainan Province", - "Hebei Province", - "Heilongjiang Province", - "Henan Province", - "Hong Kong Special Administrative Region", - "Hubei Province", - "Hunan Province", - "Inner Mongolia Autonomous Region", - "Jiangsu Province", - "Jiangxi Province", - "Jilin Province", - "Liaoning Province", - "Macau Special Administrative Region", - "Ningxia Hui Autonomous Region", - "Qinghai Province", - "Shaanxi Province", - "Shandong Province", - "Shanghai Municipality", - "Shanxi Province", - "Shenzhen", - "Sichuan Province", - "Tianjin Municipality", - "Tibet Autonomous Region", - "Xinjiang Uyghur Autonomous Region", - "Yunnan Province", - "Zhejiang Province" - ], - "Japan": [ - "Aichi", - "Akita", - "Aomori", - "Chiba", - "Ehime", - "Fukui", - "Fukuoka", - "Fukushima", - "Gifu", - "Gunma", - "Hiroshima", - "Hokkaido", - "Hyogo", - "Ibaraki", - "Ishikawa", - "Iwate", - "Kagawa", - "Kagoshima", - "Kanagawa", - "Kochi", - "Kumamoto", - "Kyoto", - "Mie", - "Miyagi", - "Miyazaki", - "Nagano", - "Nagasaki", - "Nara", - "Niigata", - "Oita", - "Okayama", - "Okinawa", - "Osaka", - "Saga", - "Saitama", - "Shiga", - "Shimane", - "Shizuoka", - "Tochigi", - "Tokushima", - "Tokyo", - "Tottori", - "Toyama", - "Wakayama", - "Yamagata", - "Yamaguchi", - "Yamanashi" - ], - "United States": [ - "Alabama", - "Alaska", - "Arizona", - "Arkansas", - "California", - "Colorado", - "Connecticut", - "Delaware", - "Florida", - "Georgia_US", - "Hawaii", - "Idaho", - "Illinois", - "Indiana", - "Iowa", - "Kansas", - "Kentucky", - "Louisiana", - "Maine", - "Maryland", - "Massachusetts", - "Michigan", - "Minnesota", - "Mississippi", - "Missouri", - "Montana", - "Nebraska", - "Nevada", - "New Hampshire", - "New Jersey", - "New Mexico", - "New York", - "North Carolina", - "North Dakota", - "Ohio", - "Oklahoma", - "Oregon", - "Pennsylvania", - "Rhode Island", - "South Carolina", - "South Dakota", - "Tennessee", - "Texas", - "Utah", - "Vermont", - "Virginia", - "Washington", - "West Virginia", - "Wisconsin", - "Wyoming" - ], - "Mexico": [ - "Aguascalientes", - "Baja California", - "Baja California Sur", - "Campeche", - "Chiapas", - "Chihuahua", - "Coahuila de Zaragoza", - "Colima", - "Durango", - "Guanajuato", - "Guerrero", - "Hidalgo", - "Jalisco", - "Mexico State", - "Ciudad de Mexico", - "Michoacan de Ocampo", - "Morelos", - "Nayarit", - "Nuevo Leon", - "Oaxaca", - "Puebla", - "Queretaro de Arteaga", - "Quintana Roo", - "San Luis Potosi", - "Sinaloa", - "Sonora", - "Tabasco", - "Tamaulipas", - "Tlaxcala", - "Veracruz de Ignacio de la Llave", - "Yucatan" - ] -} diff --git a/etl/steps/archive/garden/rff/2022-10-11/world_carbon_pricing.countries.json b/etl/steps/archive/garden/rff/2022-10-11/world_carbon_pricing.countries.json 
deleted file mode 100644 index e11cfbeaee8..00000000000 --- a/etl/steps/archive/garden/rff/2022-10-11/world_carbon_pricing.countries.json +++ /dev/null @@ -1,203 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "Andorra": "Andorra", - "Angola": "Angola", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas, The": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo, Dem. Rep.": "Democratic Republic of Congo", - "Congo, Rep.": "Congo", - "Costa Rica": "Costa Rica", - "Cote d'Ivoire": "Cote d'Ivoire", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cyprus": "Cyprus", - "Czech Republic": "Czechia", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt, Arab Rep.": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Ethiopia": "Ethiopia", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "Gabon": "Gabon", - "Gambia, The": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Greece": "Greece", - "Grenada": "Grenada", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hong Kong SAR, China": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran, Islamic Rep.": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Korea, Rep.": "South Korea", - "Kosovo": "Kosovo", - "Kuwait": "Kuwait", - "Kyrgyz Republic": "Kyrgyzstan", - "Lao PDR": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Liechtenstein": "Liechtenstein", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macao SAR, China": "Macao", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mexico": "Mexico", - "Moldova": "Moldova", - "Monaco": "Monaco", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - 
"Nigeria": "Nigeria", - "North Macedonia": "North Macedonia", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Romania": "Romania", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "Samoa": "Samoa", - "San Marino": "San Marino", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Slovak Republic": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "St. Kitts and Nevis": "Saint Kitts and Nevis", - "St. Lucia": "Saint Lucia", - "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Swaziland": "Eswatini", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syrian Arab Republic": "Syria", - "Tajikistan": "Tajikistan", - "Tanzania": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Tuvalu": "Tuvalu", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United States": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela, RB": "Venezuela", - "Vietnam": "Vietnam", - "Western Sahara": "Western Sahara", - "Yemen, Rep.": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "Federated States of Micronesia": "Micronesia (country)", - "Korea, Dem. Rep.": "North Korea", - "Taiwan, China": "Taiwan", - "Vatican City": "Vatican", - "West Bank and Gaza": "Palestine" -} diff --git a/etl/steps/archive/garden/rff/2022-10-11/world_carbon_pricing.meta.yml b/etl/steps/archive/garden/rff/2022-10-11/world_carbon_pricing.meta.yml deleted file mode 100644 index 1d81bff8d09..00000000000 --- a/etl/steps/archive/garden/rff/2022-10-11/world_carbon_pricing.meta.yml +++ /dev/null @@ -1,65 +0,0 @@ -dataset: - description: | - The World Carbon Pricing Database covers national and subnational economic mechanisms relating to carbon emissions from 1990 onwards. It was developed from several key sources: most notably, policy documents from countries and regions themselves. Secondly, from other sources such as the International Carbon Action Partnership. - - The dataset primarily focuses on economic instruments targeting carbon dioxide (CO2) emissions. In some cases these instruments also cover other greenhouse gases. However, any pricing mechanisms that target non-CO2 gases (such as methane or nitrouse oxide) specifically, are not included. - - A country is considered to have a carbon tax or emissions trading system if at least one IPCC sector or gas is covered by the instrument. These instruments do not need to cover all sectors within the economy for this to apply. - - sources: - - - name: Dolphin and Xiahou (2022). World carbon pricing database. - published_by: "Dolphin, G., Xiahou, Q. 
World carbon pricing database: sources and methods. Sci Data 9, 573 (2022)." - publication_year: 2022 - publication_date: 2022-09-17 - url: https://github.com/g-dolphin/WorldCarbonPricingDatabase - -tables: - world_carbon_pricing: - title: World carbon pricing - variables: - ets: - title: Covered by emissions trading system - unit: "" - description: A binary value indicating whether the sector(-fuel) is covered by at least one emissions trading system. - ets_price: - title: ETS price - short_unit: current LCU / t CO2e - unit: Current local currency unit per ton of CO2 equivalent - description: Price of an emissions allowance in current local currency unit per ton of CO2 equivalent. - display: - numDecimalPlaces: 2 - sector_name: - title: IPCC sector name. - unit: "" - description: IPCC sector name. - tax: - title: Covered by tax instrument - unit: "" - description: A binary value indicating whether the sector(-fuel) is covered by at least one tax instrument. - tax_rate_gross: - title: Gross tax rate - short_unit: current LCU / t CO2e - unit: Current local currency unit per ton of CO2 equivalent - description: Tax rate in current local currency unit (LCU) per ton of CO2 equivalent. - display: - numDecimalPlaces: 2 - tax_rate_net: - title: Net tax rate - short_unit: current LCU / t CO2e - unit: Current local currency unit per ton of CO2 equivalent - description: Net tax rate (accounting for exemption) in current LCU per ton of CO2 equivalent. - display: - numDecimalPlaces: 2 - world_carbon_pricing_any_sector: - title: World carbon pricing for any sector - variables: - ets: - title: Covered by emissions trading system in at least one sector - unit: "" - description: This variable indicates whether at least one sector(-fuel) is covered by at least one emissions trading system at the national level, or only at the sub-national level, or whether no sector is covered. - tax: - title: Covered by tax instrument in at least one sector - unit: "" - description: This variable indicates whether at least one sector(-fuel) is covered by at least one carbon tax at the national level, or only at the sub-national level, or whether no sector is covered. - diff --git a/etl/steps/archive/garden/rff/2022-10-11/world_carbon_pricing.py b/etl/steps/archive/garden/rff/2022-10-11/world_carbon_pricing.py deleted file mode 100644 index eb0970a7b6b..00000000000 --- a/etl/steps/archive/garden/rff/2022-10-11/world_carbon_pricing.py +++ /dev/null @@ -1,332 +0,0 @@ -from typing import Dict, List, cast - -import pandas as pd -from owid.catalog import Dataset, Table -from owid.catalog.utils import underscore_table -from owid.datautils import dataframes, io -from shared import CURRENT_DIR, LAST_INFORMED_YEAR, MEADOW_VERSION, VERSION - -from etl.data_helpers import geo -from etl.helpers import PathFinder -from etl.paths import DATA_DIR, STEP_DIR - -# Details of the input dataset. -MEADOW_DATASET_NAME = "world_carbon_pricing" -MEADOW_MAIN_DATASET_PATH = DATA_DIR / f"meadow/rff/{MEADOW_VERSION}/{MEADOW_DATASET_NAME}" -MEADOW_SUBNATIONAL_DATASET_PATH = DATA_DIR / f"meadow/rff/{MEADOW_VERSION}/world_carbon_pricing__subnational" -# Details of the output tables. -GARDEN_MAIN_TABLE_NAME = MEADOW_DATASET_NAME -GARDEN_VERSION = VERSION -GARDEN_ANY_SECTOR_TABLE_NAME = "world_carbon_pricing_any_sector" -# Get naming convention. -N = PathFinder(str(CURRENT_DIR / MEADOW_DATASET_NAME)) - -# Labels for the variables showing whether any sector is covered by an ETS or a carbon tax at the national or only -# sub-national level.
-LABEL_ETS_NOT_COVERED = "No ETS" -LABEL_ETS_COVERED = "Has an ETS" -LABEL_ETS_COVERED_ONLY_SUBNATIONAL = "Has an ETS only at a sub-national level" -LABEL_TAX_NOT_COVERED = "No carbon tax" -LABEL_TAX_COVERED = "Has a carbon tax" -LABEL_TAX_COVERED_ONLY_SUBNATIONAL = "Has a carbon tax only at a sub-national level" -# If a country-year has both national and subnational coverage, mention only the national and ignore subnational. -LABEL_ETS_COVERED_NATIONAL_AND_SUBNATIONAL = "Has an ETS" -LABEL_TAX_COVERED_NATIONAL_AND_SUBNATIONAL = "Has a carbon tax" - -# Columns to keep from raw dataset and how to rename them. -COLUMNS = { - "jurisdiction": "country", - "year": "year", - "ipcc_code": "ipcc_code", - "product": "product", - "sector_name": "sector_name", - "tax": "tax", - "ets": "ets", - "tax_rate_excl_ex_clcu": "tax_rate_gross", - "tax_rate_incl_ex_clcu": "tax_rate_net", - "ets_price": "ets_price", -} - -# Columns to use as index in main table. -INDEX_COLUMNS = ["country", "year", "ipcc_code", "product"] -# Columns to use as index in table simplified to show whether there is coverage for any sector. -INDEX_COLUMNS_ANY_SECTOR = ["country", "year"] - -# Mapping of countries and the regions of the country included in the sub-national dataset. -# In the future, it would be good to load this mapping as additional data (however, the mapping is hardcoded in the -# original repository, so it's not trivial to get this mapping automatically). -COUNTRY_MEMBERS_FILE = STEP_DIR / f"data/garden/rff/{GARDEN_VERSION}/sub_national_jurisdictions.json" - - -def sanity_checks(df: pd.DataFrame) -> None: - """Sanity checks on the raw data. - - Parameters - ---------- - df : pd.DataFrame - Raw data from meadow. - - """ - column_checks = ( - df.groupby("jurisdiction") - .agg( - { - # Columns 'tax' and 'ets' must contain only 0 and/or 1. - "tax": lambda x: set(x) <= {0, 1}, - "ets": lambda x: set(x) <= {0, 1}, - } - ) - .all() - ) - # Column 'tax_id' is either nan or has one value, which is the ISO code of the country followed by "tax" - # (e.g. 'aus_tax'). However, there is at least one exception: Norway has 'nor_tax_I', so maybe the data is - # expected to have more than one 'tax_id'. - - # Similarly, 'ets_id' is either nan, or usually just one value, e.g. "eu_ets" for EU countries, or "nzl_ets", - # "mex_ets", etc. However, for the UK there are two, namely {'gbr_ets', 'eu_ets'}. - - error = f"Unexpected content in columns {column_checks[~column_checks].index.tolist()}." - assert column_checks.all(), error - - -def prepare_data(df: pd.DataFrame) -> pd.DataFrame: - """Prepare data. - - Parameters - ---------- - df : pd.DataFrame - Raw data. - - Returns - ------- - df : pd.DataFrame - Clean data. - - """ - df = df.copy() - - # Select and rename columns. - df = df[list(COLUMNS)].rename(columns=COLUMNS, errors="raise") - - # Column 'product' has many nans. Convert them into empty strings. - df["product"] = df["product"].cat.add_categories("").fillna("") - - # Given that the most recent data is incomplete, keep only data points prior to (or at) a certain year - # (given by global variable LAST_INFORMED_YEAR). - df = df[df["year"] <= LAST_INFORMED_YEAR].reset_index(drop=True) - - return df - - -def get_coverage_for_any_sector(df: pd.DataFrame) -> pd.DataFrame: - """Create a dataframe showing whether a country has any sector covered by an ets/carbon tax. - - Parameters - ---------- - df : pd.DataFrame - Original national or sub-national data, disaggregated by sector.
- - Returns - ------- - df_any_sector : pd.DataFrame - Coverage for any sector. - - """ - # Create a simplified dataframe that gives, for each country and year, whether the country has any sector(-fuel) - # that is covered by at least one tax instrument. And idem for ets. - df_any_sector = ( - df.reset_index() - .groupby(["country", "year"], observed=True) - .agg({"ets": lambda x: min(x.sum(), 1), "tax": lambda x: min(x.sum(), 1)}) - .astype(int) - .reset_index() - ) - - return df_any_sector - - -def prepare_subnational_data(df_subnational: pd.DataFrame, country_members: Dict[str, List[str]]) -> pd.DataFrame: - """Create a dataframe showing whether a country has any sub-national jurisdiction for which any sector is covered by - an ets/carbon tax. - - The 'country' column of this dataframe does not need to be harmonized, since we are mapping the original - sub-national jurisdiction names to the harmonized name of the country. - - Parameters - ---------- - df_subnational : pd.DataFrame - Sub-national data, disaggregated by sector. - - Returns - ------- - pd.DataFrame - Processed sub-national data. - - """ - # Prepare subnational data. - df_subnational = prepare_data(df_subnational) - # Map subnational regions to their corresponding country. - subregions_to_country = { - subregion: country for country in list(country_members) for subregion in country_members[country] - } - df_subnational["country"] = dataframes.map_series( - series=df_subnational["country"], - mapping=subregions_to_country, - warn_on_missing_mappings=True, - warn_on_unused_mappings=True, - ) - # Get coverage of "any sector", where we only care about having at least one sector covered by carbon tax/ets. - df_subnational = get_coverage_for_any_sector(df=df_subnational) - - return df_subnational - - -def combine_national_and_subnational_data( - df_any_sector_national: pd.DataFrame, df_any_sector_subnational: pd.DataFrame -) -> pd.DataFrame: - """Combine national and sub-national data on whether countries have any sector covered by a tax instrument. - - The returned dataframe will have the following labels: - * Whether a country-year has no sector covered. - * Whether a country-year has at least one sector covered at the national level. - * Whether a country-year has at least one sector in one sub-national jurisdiction covered, but no sector covered at - the national level. - * Whether a country-year has at least one sector in both a sub-national and the national jurisdiction covered. - However, for now we disregard this option, by using the same label as for only national coverage. - - Parameters - ---------- - df_any_sector_national : pd.DataFrame - National data on whether countries have any sector covered by a tax instrument. - df_any_sector_subnational : pd.DataFrame - Sub-national data on whether countries have any sector covered by a tax instrument. - - Returns - ------- - df_any_sector : pd.DataFrame - Combined dataframe showing whether a country has at least one sector covered by a tax instrument at a national - level, or only at the sub-national level, or not at all. - - """ - # Combine national and subnational data. - df_any_sector = pd.merge( - df_any_sector_national, - df_any_sector_subnational, - on=["country", "year"], - how="left", - suffixes=("_national", "_subnational"), - ).fillna(0) - - # Create two new columns ets and tax, that are: - # * 0 if no ets/tax exists. - # * 1 if there is a national ets/tax and not a subnational ets/tax. - # * 2 if there is a subnational ets/tax and not a national ets/tax. 
- # * 3 if there are both a national and a subnational ets/tax. - df_any_sector = df_any_sector.assign( - **{ - "ets": df_any_sector["ets_national"] + 2 * df_any_sector["ets_subnational"], - "tax": df_any_sector["tax_national"] + 2 * df_any_sector["tax_subnational"], - } - )[["country", "year", "ets", "tax"]] - - # Now replace 0, 1, 2, and 3 by their corresponding labels. - ets_mapping = { - 0: LABEL_ETS_NOT_COVERED, - 1: LABEL_ETS_COVERED, - 2: LABEL_ETS_COVERED_ONLY_SUBNATIONAL, - 3: LABEL_ETS_COVERED_NATIONAL_AND_SUBNATIONAL, - } - tax_mapping = { - 0: LABEL_TAX_NOT_COVERED, - 1: LABEL_TAX_COVERED, - 2: LABEL_TAX_COVERED_ONLY_SUBNATIONAL, - 3: LABEL_TAX_COVERED_NATIONAL_AND_SUBNATIONAL, - } - df_any_sector["ets"] = dataframes.map_series(series=df_any_sector["ets"], mapping=ets_mapping) - df_any_sector["tax"] = dataframes.map_series(series=df_any_sector["tax"], mapping=tax_mapping) - - return cast(pd.DataFrame, df_any_sector) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read main dataset from meadow. - ds_meadow = Dataset(MEADOW_MAIN_DATASET_PATH) - - # Read subnational dataset from meadow. - ds_meadow_subnational = Dataset(MEADOW_SUBNATIONAL_DATASET_PATH) - - # Get main table from dataset. - tb_meadow = ds_meadow[ds_meadow.table_names[0]] - - # Get table for subnational data from dataset. - tb_meadow_subnational = ds_meadow_subnational[ds_meadow_subnational.table_names[0]] - - # Construct a dataframe from the main table. - df = pd.DataFrame(tb_meadow) - - # Construct a dataframe for subnational data. - df_subnational = pd.DataFrame(tb_meadow_subnational) - - # Load dictionary mapping sub-national jurisdictions to their countries. - country_members = io.load_json(COUNTRY_MEMBERS_FILE) - - # - # Process data. - # - # Sanity checks on raw data. - sanity_checks(df=df) - sanity_checks(df=df_subnational) - - # Prepare data. - df = prepare_data(df=df) - - # Harmonize country names. - df = geo.harmonize_countries(df=df, countries_file=str(N.country_mapping_path), warn_on_unused_countries=False) - - # Create a simplified table for "any sector" of national data. - df_any_sector_national = get_coverage_for_any_sector(df=df) - - # Create a simplified dataframe with the coverage for "any sector" of subnational data. - df_any_sector_subnational = prepare_subnational_data(df_subnational=df_subnational, country_members=country_members) - - # Combine national and subnational data. - df_any_sector = combine_national_and_subnational_data( - df_any_sector_national=df_any_sector_national, df_any_sector_subnational=df_any_sector_subnational - ) - - # Prepare output tables. - tb = underscore_table(Table(df)).set_index(INDEX_COLUMNS, verify_integrity=True).sort_index().sort_index(axis=1) - tb_any_sector = ( - underscore_table(Table(df_any_sector)) - .set_index(INDEX_COLUMNS_ANY_SECTOR, verify_integrity=True) - .sort_index() - .sort_index(axis=1) - ) - - # - # Save outputs. - # - # Create a new garden dataset. - ds_garden = Dataset.create_empty(dest_dir) - - # Fetch metadata from meadow step (if any). - ds_garden.metadata = ds_meadow.metadata - - # Update dataset metadata using metadata yaml file. - ds_garden.metadata.update_from_yaml(N.metadata_path, if_source_exists="replace") - - # Update main table metadata using metadata yaml file. - tb.update_metadata_from_yaml(N.metadata_path, GARDEN_MAIN_TABLE_NAME) - - # Update simplified table metadata using metadata yaml file. - tb_any_sector.update_metadata_from_yaml(N.metadata_path, GARDEN_ANY_SECTOR_TABLE_NAME) - - # Add tables to dataset. 
- ds_garden.add(tb) - ds_garden.add(tb_any_sector) - - # Save dataset. - ds_garden.save() diff --git a/etl/steps/archive/garden/shift/2022-07-18/fossil_fuel_production.country_mapping.json b/etl/steps/archive/garden/shift/2022-07-18/fossil_fuel_production.country_mapping.json deleted file mode 100644 index 3c9f237b5e9..00000000000 --- a/etl/steps/archive/garden/shift/2022-07-18/fossil_fuel_production.country_mapping.json +++ /dev/null @@ -1,239 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "American Samoa": "American Samoa", - "Angola": "Angola", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia": "Bolivia", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "British Virgin Islands": "British Virgin Islands", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burma": "Myanmar", - "Burundi": "Burundi", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cape Verde": "Cape Verde", - "Cayman Islands": "Cayman Islands", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo": "Congo", - "Cook Islands": "Cook Islands", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cyprus": "Cyprus", - "Czech republic": "Czechia", - "Czechia": "Czechia", - "Czechoslovakia": "Czechoslovakia", - "Democratic Republic of the Congo": "Democratic Republic of Congo", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Ethiopia": "Ethiopia", - "Faeroe Islands": "Faroe Islands", - "Falkland Islands (Malvinas)": "Falkland Islands", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Guiana": "French Guiana", - "French Polynesia": "French Polynesia", - "Gabon": "Gabon", - "Gambia": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Gibraltar": "Gibraltar", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guam": "Guam", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hong Kong Special Administrative Region (China)": "Hong Kong", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Ivory Coast": "Cote d'Ivoire", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Kosovo": "Kosovo", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Laos": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - 
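To make the coverage logic of the deleted world_carbon_pricing step above concrete, here is a minimal, self-contained sketch of its two key moves: reducing sector-level rows to a per-country-year "any sector covered?" flag, and encoding national vs. sub-national coverage as national + 2 * subnational before mapping the codes to labels. This uses toy data, and plain pandas Series.map stands in for owid.datautils.dataframes.map_series.

import pandas as pd

# Sector-level national data: 1 means the sector(-fuel) is covered.
national = pd.DataFrame(
    {
        "country": ["Canada", "Canada", "France"],
        "year": [2020, 2020, 2020],
        "ets": [0, 1, 0],
        "tax": [1, 0, 0],
    }
)

# Reduce to one row per country-year: any covered sector -> 1 (sum clipped to 1).
any_national = (
    national.groupby(["country", "year"])
    .agg({"ets": lambda x: min(x.sum(), 1), "tax": lambda x: min(x.sum(), 1)})
    .reset_index()
)

# Sub-national coverage, already reduced the same way.
any_subnational = pd.DataFrame(
    {"country": ["Canada", "France"], "year": [2020, 2020], "ets": [1, 1], "tax": [0, 0]}
)

combined = any_national.merge(
    any_subnational, on=["country", "year"], suffixes=("_national", "_subnational")
)
# 0 = none, 1 = national only, 2 = sub-national only, 3 = both.
combined["ets"] = combined["ets_national"] + 2 * combined["ets_subnational"]
combined["ets"] = combined["ets"].map(
    {0: "No ETS", 1: "Has an ETS", 2: "Has an ETS only at a sub-national level", 3: "Has an ETS"}
)
print(combined[["country", "year", "ets"]])
# Canada (national + sub-national) -> "Has an ETS"
# France (sub-national only) -> "Has an ETS only at a sub-national level"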
"Liberia": "Liberia", - "Libya": "Libya", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Macao Special Administrative Region (China)": "Macao", - "Macedonia": "North Macedonia", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mexico": "Mexico", - "Moldova": "Moldova", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "NZ": "New Zealand", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "Netherlands Antilles": "Netherlands Antilles", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "North Korea": "North Korea", - "Northern Mariana Islands": "Northern Mariana Islands", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palestinian Territories": "Palestine", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Reunion": "Reunion", - "Romania": "Romania", - "Rwanda": "Rwanda", - "Saint Helena": "Saint Helena", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Korea": "South Korea", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Swaziland": "Eswatini", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syria": "Syria", - "Taiwan": "Taiwan", - "Tajikistan": "Tajikistan", - "Tanzania": "Tanzania", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Turks and Caicos Islands": "Turks and Caicos Islands", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United States Virgin Islands": "United States Virgin Islands", - "United States of America": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela": "Venezuela", - "Viet Nam": "Vietnam", - "Western Sahara": "Western Sahara", - "World": "World", - "Yemen": "Yemen", - "Yugoslavia": "Yugoslavia", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "Africa": "Africa (Shift)", - "Asia and Oceania": "Asia and Oceania (Shift)", - "Central and South America": "Central and South America (Shift)", - "EU28": "EU28 (Shift)", - "Eurasia": "Eurasia (Shift)", - "Europe": "Europe (Shift)", - "Inde": "Inde (Shift)", - "Middle East": "Middle East (Shift)", - "North 
America": "North America (Shift)", - "OECD": "OECD (Shift)", - "OPEC": "OPEC (Shift)", - "Persian Gulf": "Persian Gulf (Shift)", - "Russian Federation & USSR": "Russian Federation & USSR (Shift)", - "U.S. Pacific Islands": "U.S. Pacific Islands (Shift)", - "U.S. Territories": "U.S. Territories (Shift)", - "Wake Island": "Wake Island (Shift)" -} diff --git a/etl/steps/archive/garden/shift/2022-07-18/fossil_fuel_production.meta.yml b/etl/steps/archive/garden/shift/2022-07-18/fossil_fuel_production.meta.yml deleted file mode 100644 index 3adc234eaec..00000000000 --- a/etl/steps/archive/garden/shift/2022-07-18/fossil_fuel_production.meta.yml +++ /dev/null @@ -1,33 +0,0 @@ -dataset: - title: Fossil fuel production (Shift, 2022) - description: | - Fossil fuel production, produced by Our World in Data based on data from [The Shift Dataportal](https://www.theshiftdataportal.org/energy). - sources: - - name: Our World in Data based on The Shift Dataportal (2022) - published_by: The Shift Dataportal - date_accessed: 2022-07-18 - url: https://www.theshiftdataportal.org/energy -tables: - fossil_fuel_production: - variables: - coal: - title: Coal production - short_unit: TWh - unit: terawatt-hours - description: - display: - numDecimalPlaces: 0 - gas: - title: Gas production - short_unit: TWh - unit: terawatt-hours - description: - display: - numDecimalPlaces: 0 - oil: - title: Oil production - short_unit: TWh - unit: terawatt-hours - description: - display: - numDecimalPlaces: 0 diff --git a/etl/steps/archive/garden/shift/2022-07-18/fossil_fuel_production.py b/etl/steps/archive/garden/shift/2022-07-18/fossil_fuel_production.py deleted file mode 100644 index a84a1f53db9..00000000000 --- a/etl/steps/archive/garden/shift/2022-07-18/fossil_fuel_production.py +++ /dev/null @@ -1,419 +0,0 @@ -"""Garden step for Shift data on energy production from fossil fuels. - -""" - -from pathlib import Path -from typing import List, cast - -import numpy as np -import pandas as pd -from owid import catalog -from structlog import get_logger - -from etl.data_helpers import geo -from etl.helpers import create_dataset -from etl.paths import DATA_DIR - -log = get_logger() - -NAMESPACE = "shift" -DATASET_SHORT_NAME = "fossil_fuel_production" - -VERSION = Path(__file__).parent.name -COUNTRY_MAPPING_PATH = Path(__file__).parent / f"{DATASET_SHORT_NAME}.country_mapping.json" -METADATA_PATH = Path(__file__).parent / f"{DATASET_SHORT_NAME}.meta.yml" - -REGIONS_TO_ADD = [ - "North America", - "South America", - "Europe", - "European Union (27)", - "Africa", - "Asia", - "Oceania", - "Low-income countries", - "Upper-middle-income countries", - "Lower-middle-income countries", - "High-income countries", -] - -# When creating region aggregates, decide how to distribute historical regions. -# The following decisions are based on the current location of the countries that succeeded the region, and their income -# group. Continent and income group assigned corresponds to the continent and income group of the majority of the -# population in the member countries. -HISTORIC_TO_CURRENT_REGION = { - "Czechoslovakia": { - "continent": "Europe", - "income_group": "High-income countries", - "members": [ - # Europe - High-income countries. - "Czechia", - "Slovakia", - ], - }, - "Netherlands Antilles": { - "continent": "North America", - "income_group": "High-income countries", - "members": [ - # North America - High-income countries. 
- "Aruba", - "Curacao", - "Sint Maarten (Dutch part)", - ], - }, - "USSR": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "members": [ - # Europe - High-income countries. - "Lithuania", - "Estonia", - "Latvia", - # Europe - Upper-middle-income countries. - "Moldova", - "Belarus", - "Russia", - # Europe - Lower-middle-income countries. - "Ukraine", - # Asia - Upper-middle-income countries. - "Georgia", - "Armenia", - "Azerbaijan", - "Turkmenistan", - "Kazakhstan", - # Asia - Lower-middle-income countries. - "Kyrgyzstan", - "Uzbekistan", - "Tajikistan", - ], - }, - "Yugoslavia": { - "continent": "Europe", - "income_group": "Upper-middle-income countries", - "members": [ - # Europe - High-income countries. - "Croatia", - "Slovenia", - # Europe - Upper-middle-income countries. - "North Macedonia", - "Bosnia and Herzegovina", - "Serbia", - "Montenegro", - ], - }, -} - - -def load_income_groups() -> pd.DataFrame: - """Load dataset of income groups and add historical regions to it. - - Returns - ------- - income_groups : pd.DataFrame - Income groups data. - - """ - # Load the World Bank dataset of income groups. - income_groups = catalog.Dataset(DATA_DIR / "garden/wb/2021-07-01/wb_income")["wb_income_group"].reset_index() - - # Add historical regions to income groups. - for historic_region in HISTORIC_TO_CURRENT_REGION: - historic_region_income_group = HISTORIC_TO_CURRENT_REGION[historic_region]["income_group"] - if historic_region not in income_groups["country"]: - historic_region_df = pd.DataFrame( - { - "country": [historic_region], - "income_group": [historic_region_income_group], - } - ) - income_groups = pd.concat([income_groups, historic_region_df], ignore_index=True) - - return cast(pd.DataFrame, income_groups) - - -def remove_overlapping_data_between_historical_regions_and_successors( - data_region: pd.DataFrame, - index_columns: List[str], - country_column: str, - year_column: str, - ignore_zeros: bool = True, -) -> pd.DataFrame: - """Remove overlapping data between a historical region and any of its successors (if there is any overlap), to avoid - double-counting those regions when aggregating data. - - Data for historical regions (e.g. USSR) could overlap with data of the successor countries (e.g. Russia). If this - happens, remove data (on the overlapping years) of the historical country. - - Parameters - ---------- - data_region : pd.DataFrame - Data (after selecting the countries of a certain relevant region). - index_columns : list - Index columns. - country_column : str - Name of column for country names. - year_column : str - Name of column for year. - ignore_zeros : bool - True to ignore zeros when checking if historical regions overlap with their member countries; this means that, - if a historical region overlaps with a member, but the member only has zeros in the data, this will not be - considered an overlap. - - Returns - ------- - data_region : pd.DataFrame - Data after removing data with overlapping regions. - - """ - data_region = data_region.copy() - - # Select data columns. - data_columns = [column for column in data_region.columns if column not in index_columns] - # Select index columns without country column. - _index_columns = [column for column in index_columns if column != country_column] - indexes_to_drop = [] - - if ignore_zeros: - overlapping_values_to_ignore = [0] - else: - overlapping_values_to_ignore = [] - - for historical_region in HISTORIC_TO_CURRENT_REGION: - # Successors of the current historical region.
- historical_successors = HISTORIC_TO_CURRENT_REGION[historical_region]["members"] - # Unique combinations of index for which historical region has data. - historical_region_years = ( - data_region[(data_region[country_column] == historical_region)] - .replace(overlapping_values_to_ignore, np.nan) - .dropna(subset=data_columns, how="all")[_index_columns] - .dropna() - .drop_duplicates() - ) - # Unique combinations of index for which successors have data. - historical_successors_years = ( - data_region[(data_region[country_column].isin(historical_successors))] - .replace(overlapping_values_to_ignore, np.nan) - .dropna(subset=data_columns, how="all")[_index_columns] - .dropna() - .drop_duplicates() - ) - - # Find unique years where the above combinations of region and successors overlap. - overlapping_years = pd.concat([historical_region_years, historical_successors_years], ignore_index=True) - overlapping_years = overlapping_years[overlapping_years.duplicated()] - if not overlapping_years.empty: - log.warning( - f"Removing rows where historical region {historical_region} overlaps with its successors " - f"(years {sorted(set(overlapping_years[year_column]))})." - ) - # Select rows in data_region to drop. - overlapping_years[country_column] = historical_region - indexes_to_drop.extend( - pd.merge( - data_region.reset_index(), - overlapping_years, - how="inner", - on=[country_column] + _index_columns, - )["index"].tolist() - ) - - if len(indexes_to_drop) > 0: - # Remove rows of data of the historical region where its data overlaps with data from its successors. - data_region = data_region.drop(index=indexes_to_drop) - - return data_region - - -def add_region_aggregates( - data: pd.DataFrame, - index_columns: List[str], - country_column: str = "country", - year_column: str = "year", -) -> pd.DataFrame: - """Add region aggregate for all regions. - - Regions are defined above, in REGIONS_TO_ADD. - - Parameters - ---------- - data : pd.DataFrame - Data. - index_columns : list - Index columns - country_column : str - Name of country column. - year_column : str - Name of year column. - - Returns - ------- - data : pd.DataFrame - Data after adding regions. - - """ - data = data.copy() - - income_groups = load_income_groups() - aggregates = {column: "sum" for column in data.columns if column not in index_columns} - for region in REGIONS_TO_ADD: - countries_in_region = geo.list_countries_in_region(region=region, income_groups=income_groups) - data_region = data[data[country_column].isin(countries_in_region)] - - data_region = remove_overlapping_data_between_historical_regions_and_successors( - data_region=data_region, - index_columns=index_columns, - country_column=country_column, - year_column=year_column, - ) - - data_region = geo.add_region_aggregates( - df=data_region, - region=region, - country_col=country_column, - year_col=year_column, - aggregations=aggregates, - countries_in_region=countries_in_region, - countries_that_must_have_data=[], - frac_allowed_nans_per_year=None, - num_allowed_nans_per_year=None, - ) - data = pd.concat([data, data_region[data_region["country"] == region]], ignore_index=True).reset_index( - drop=True - ) - - return data - - -def split_ussr_and_russia(df: pd.DataFrame) -> pd.DataFrame: - """Split data for USSR & Russia into two separate entities (given that Shift treats them as the same entity). - - Parameters - ---------- - df : pd.DataFrame - Shift data after harmonizing country names. 
- - Returns - ------- - df : pd.DataFrame - Shift data after separating data for USSR and Russia as separate entities. - - """ - df = df.copy() - - # Name that The Shift Data Portal uses for Russia and USSR. - shift_ussr_russia_name = "Russian Federation & USSR (Shift)" - # The relevant part of the data is originally from EIA, who have the first data point for Russia in 1992. - # Therefore we use this year to split USSR and Russia. - russia_start_year = 1992 - # Filter to select rows of USSR & Russia data. - ussr_russia_filter = df["country"] == shift_ussr_russia_name - ussr_data = ( - df[ussr_russia_filter & (df["year"] < russia_start_year)] - .replace({shift_ussr_russia_name: "USSR"}) - .reset_index(drop=True) - ) - russia_data = ( - df[ussr_russia_filter & (df["year"] >= russia_start_year)] - .replace({shift_ussr_russia_name: "Russia"}) - .reset_index(drop=True) - ) - # Remove rows where Russia and USSR are combined. - df = df[~ussr_russia_filter].reset_index(drop=True) - # Combine original data (without USSR and Russia as one entity) with USSR and Russia as separate entities. - df = ( - pd.concat([df, ussr_data, russia_data], ignore_index=True) - .sort_values(["country", "year"]) - .reset_index(drop=True) - ) - - return df - - -def correct_historical_regions(data: pd.DataFrame) -> pd.DataFrame: - """Correct some issues in Shift data involving historical regions. - - Parameters - ---------- - data : pd.DataFrame - Shift data after harmonization of country names. - - Returns - ------- - data : pd.DataFrame - Shift data after doing some corrections related to historical regions. - - """ - data = data.copy() - - # For coal and oil, Czechoslovakia's data become Czechia and Slovakia in 1993. - # However, for gas, Czechia appear at an earlier date. - # We correct those rows to be part of Czechoslovakia. - data_to_add = pd.merge( - data[(data["year"] < 1980) & (data["country"] == "Czechoslovakia")] - .reset_index(drop=True) - .drop(columns=["gas"]), - data[(data["year"] < 1980) & (data["country"] == "Czechia")].reset_index(drop=True)[["year", "gas"]], - how="left", - on="year", - ) - select_rows_to_correct = (data["country"].isin(["Czechia", "Czechoslovakia"])) & (data["year"] < 1980) - data = ( - pd.concat([data[~select_rows_to_correct], data_to_add], ignore_index=True) - .sort_values(["country", "year"]) - .reset_index(drop=True) - ) - - return data - - -def run(dest_dir: str) -> None: - log.info(f"{DATASET_SHORT_NAME}.start") - # - # Load data. - # - # Load meadow dataset and get the only table inside (with the same name). - ds_meadow = catalog.Dataset(DATA_DIR / f"meadow/{NAMESPACE}/{VERSION}/{DATASET_SHORT_NAME}") - tb_meadow = ds_meadow[DATASET_SHORT_NAME] - - # Convert table into a dataframe. - df = pd.DataFrame(tb_meadow) - - # - # Process data. - # - # Harmonize country names. - log.info(f"{DATASET_SHORT_NAME}.harmonize_countries") - df = geo.harmonize_countries(df=df, countries_file=str(COUNTRY_MAPPING_PATH)) - - # Remove rows that only have nans. - df = df.dropna(subset=["coal", "oil", "gas"], how="all").reset_index(drop=True) - - # Treat USSR and Russia as separate entities. - df = split_ussr_and_russia(df=df) - - # Correct gas data where Czechia and Czechoslovakia overlap. - df = correct_historical_regions(data=df) - - # Create aggregate regions. - log.info(f"{DATASET_SHORT_NAME}.add_region_aggregates") - df = add_region_aggregates( - data=df, - index_columns=["country", "year"], - country_column="country", - year_column="year", - ) - - # Prepare output data. 
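The split_ussr_and_russia logic above hinges on a single cutoff year (1992, the first year for which EIA reports Russia separately, per the comment in the deleted step). A minimal sketch with a toy frame; the real function uses .replace to rename the entity and also re-sorts and re-indexes the result:

import pandas as pd

df = pd.DataFrame(
    {
        "country": ["Russian Federation & USSR (Shift)"] * 4,
        "year": [1990, 1991, 1992, 1993],
        "coal": [100.0, 95.0, 80.0, 78.0],
    }
)

is_combined = df["country"] == "Russian Federation & USSR (Shift)"
ussr = df[is_combined & (df["year"] < 1992)].assign(country="USSR")
russia = df[is_combined & (df["year"] >= 1992)].assign(country="Russia")
df = pd.concat([df[~is_combined], ussr, russia], ignore_index=True)
print(df)  # 1990-1991 attributed to USSR, 1992 onwards to Russia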
- df = df.set_index(["country", "year"], verify_integrity=True).sort_index() - - # Create a new table. - tb_garden = catalog.Table(df, short_name=DATASET_SHORT_NAME, underscore=True) - - # - # Save outputs. - # - # Create a new garden dataset (with the same metadata as the meadow version). - ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_meadow.metadata) - ds_garden.save() - - log.info(f"{DATASET_SHORT_NAME}.end") diff --git a/etl/steps/archive/garden/smil/2017-01-01/global_primary_energy.meta.yml b/etl/steps/archive/garden/smil/2017-01-01/global_primary_energy.meta.yml deleted file mode 100644 index 2a9139090ca..00000000000 --- a/etl/steps/archive/garden/smil/2017-01-01/global_primary_energy.meta.yml +++ /dev/null @@ -1,43 +0,0 @@ -tables: - global_primary_energy: - variables: - biofuels__twh_direct_energy: - unit: terawatt-hours - short_unit: TWh - description: Direct energy from biofuels. - coal__twh_direct_energy: - unit: terawatt-hours - short_unit: TWh - description: Direct energy from coal. - gas__twh_direct_energy: - unit: terawatt-hours - short_unit: TWh - description: Direct energy from gas. - hydropower__twh_direct_energy: - unit: terawatt-hours - short_unit: TWh - description: Direct energy from hydropower. - nuclear__twh_direct_energy: - unit: terawatt-hours - short_unit: TWh - description: Direct energy from nuclear power. - oil__twh_direct_energy: - unit: terawatt-hours - short_unit: TWh - description: Direct energy from oil. - other_renewables__twh_direct_energy: - unit: terawatt-hours - short_unit: TWh - description: Direct energy from other renewables. - solar__twh_direct_energy: - unit: terawatt-hours - short_unit: TWh - description: Direct energy from solar. - traditional_biomass__twh_direct_energy: - unit: terawatt-hours - short_unit: TWh - description: Direct energy from traditional biomass. - wind__twh_direct_energy: - unit: terawatt-hours - short_unit: TWh - description: Direct energy from wind. diff --git a/etl/steps/archive/garden/smil/2017-01-01/global_primary_energy.py b/etl/steps/archive/garden/smil/2017-01-01/global_primary_energy.py deleted file mode 100644 index 30e4578b672..00000000000 --- a/etl/steps/archive/garden/smil/2017-01-01/global_primary_energy.py +++ /dev/null @@ -1,52 +0,0 @@ -from owid.catalog import Dataset -from owid.catalog.utils import underscore_table -from structlog import get_logger - -from etl.helpers import PathFinder - -log = get_logger() - -# naming conventions -N = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - log.info("global_primary_energy.start") - - # Load dataset from meadow. - ds_meadow = N.meadow_dataset - tb_meadow = ds_meadow["global_primary_energy"] - - # Create new garden dataset. - ds_garden = Dataset.create_empty(dest_dir, ds_meadow.metadata) - - # Copy all metadata from meadow, including variable metadata. - tb_garden = underscore_table(tb_meadow) - tb_garden.metadata = tb_meadow.metadata - for col in tb_garden.columns: - tb_garden[col].metadata = tb_meadow[col].metadata - # Sort data conveniently. - tb_garden = tb_garden[sorted(tb_garden.columns)].sort_index() - - # Update metadata using yaml file. - ds_garden.metadata.sources[0].name = "Smil (2017)" - #################################################################################################################### - # Temporary solution: At the moment, 'published_by' cannot be added to walden metadata. 
- # I could add a new source to the yaml file in this step (with the appropriate 'published_by') and use - # > ds_garden.metadata.update_from_yaml(N.metadata_path) - # but this would keep the original source (without 'published_by') and add a new one (with 'published_by'). - # Therefore, for the moment the only solution I see is to manually input the 'published_by' field here. - # Alternatively, I could ignore the metadata from walden and add all the relevant metadata in this step's yaml file. - ds_garden.metadata.sources[ - 0 - ].published_by = "Vaclav Smil (2017), Energy Transitions: Global and National Perspectives, 2nd edition, Appendix A" - #################################################################################################################### - tb_garden.update_metadata_from_yaml(N.metadata_path, "global_primary_energy") - - # Add table to dataset. - ds_garden.add(tb_garden) - - # Save dataset. - ds_garden.save() - - log.info("global_primary_energy.end") diff --git a/etl/steps/archive/garden/uk_beis/2022-07-28/shared.py b/etl/steps/archive/garden/uk_beis/2022-07-28/shared.py deleted file mode 100644 index 7e7f4d18c5b..00000000000 --- a/etl/steps/archive/garden/uk_beis/2022-07-28/shared.py +++ /dev/null @@ -1,3 +0,0 @@ -from pathlib import Path - -CURRENT_DIR = Path(__file__).parent diff --git a/etl/steps/archive/garden/uk_beis/2022-07-28/uk_historical_electricity.meta.yml b/etl/steps/archive/garden/uk_beis/2022-07-28/uk_historical_electricity.meta.yml deleted file mode 100644 index 200ee388aa9..00000000000 --- a/etl/steps/archive/garden/uk_beis/2022-07-28/uk_historical_electricity.meta.yml +++ /dev/null @@ -1,64 +0,0 @@ -dataset: - namespace: uk_beis - version: 2022-07-28 - title: UK historical electricity - short_name: uk_historical_electricity - description: | - Historical UK electricity data. - Data on fuel input gives the raw energy used for electricity generation. The electricity actually generated from those inputs is lower (because of inefficiencies in generation when burning fossil fuels). - For this reason, a column of implied efficiency is included, which is obtained by dividing total electricity generation by the energy input.
- sources: - - - name: Digest of UK Energy Statistics (DUKES) - published_by: UK's Department for Business, Energy & Industrial Strategy - date_accessed: 2022-09-21 - url: https://www.gov.uk/government/statistical-data-sets/historical-electricity-data - -tables: - uk_historical_electricity: - variables: - coal: - title: Coal energy used for electricity generation - short_unit: TWh - unit: terawatt-hours - oil: - title: Oil energy used for electricity generation - short_unit: TWh - unit: terawatt-hours - gas: - title: Gas energy used for electricity generation - short_unit: TWh - unit: terawatt-hours - nuclear: - title: Nuclear energy used for electricity generation - short_unit: TWh - unit: terawatt-hours - hydro: - title: Hydropower used for electricity generation - short_unit: TWh - unit: terawatt-hours - wind_and_solar: - title: Wind and solar energy used for electricity generation - short_unit: TWh - unit: terawatt-hours - all_sources: - title: All energy used for electricity generation - short_unit: TWh - unit: terawatt-hours - other: - title: Other sources of energy used for electricity generation - short_unit: TWh - unit: terawatt-hours - electricity_generation: - title: Total electricity generation - short_unit: TWh - unit: terawatt-hours - net_imports: - title: Net electricity imports - short_unit: TWh - unit: terawatt-hours - implied_efficiency: - title: Implied efficiency - short_unit: "" - unit: "" - diff --git a/etl/steps/archive/garden/uk_beis/2022-07-28/uk_historical_electricity.py b/etl/steps/archive/garden/uk_beis/2022-07-28/uk_historical_electricity.py deleted file mode 100644 index 41b8bde1182..00000000000 --- a/etl/steps/archive/garden/uk_beis/2022-07-28/uk_historical_electricity.py +++ /dev/null @@ -1,101 +0,0 @@ -import pandas as pd -from owid import catalog -from owid.datautils import dataframes -from shared import CURRENT_DIR - -from etl.helpers import PathFinder - -DATASET_TITLE = "UK historical electricity" -DATASET_SHORT_NAME = "uk_historical_electricity" -N = PathFinder(str(CURRENT_DIR / DATASET_SHORT_NAME)) - -# Conversion factor from million tonnes of oil equivalent to terawatt-hours. -MTOE_TO_TWH = 11.63 - - -def combine_tables( - tb_fuel_input: catalog.Table, tb_supply: catalog.Table, tb_efficiency: catalog.Table -) -> catalog.Table: - """Combine tables (each one originally coming from a different sheet of the BEIS data file) and prepare output table - with metadata. - - Parameters - ---------- - tb_fuel_input : catalog.Table - Data extracted from the "Fuel input" sheet. - tb_supply : catalog.Table - Data extracted from the "Supply, availability & consump" sheet. - tb_efficiency : catalog.Table - Data (on implied efficiency) extracted from the "Generated and supplied" sheet. - - Returns - ------- - tb_combined : catalog.Table - Combined and processed table with metadata and a verified index. - - """ - tb_fuel_input = tb_fuel_input.copy() - tb_supply = tb_supply.copy() - tb_efficiency = tb_efficiency.copy() - - # Create convenient dataframes. - df_fuel_input = pd.DataFrame(tb_fuel_input) - df_supply = pd.DataFrame(tb_supply) - df_efficiency = pd.DataFrame(tb_efficiency) - - # Remove rows with duplicated year. - df_fuel_input = df_fuel_input.drop_duplicates(subset="year", keep="last").reset_index(drop=True) - df_supply = df_supply.drop_duplicates(subset="year", keep="last").reset_index(drop=True) - df_efficiency = df_efficiency.drop_duplicates(subset="year", keep="last").reset_index(drop=True) - - # Convert units of fuel input data. 
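The conversion that follows relies on MTOE_TO_TWH = 11.63, which is worth sanity-checking from first principles (1 tonne of oil equivalent is defined as 41.868 GJ, and 1 TWh is 3.6e6 GJ):

# Hypothetical check of the factor; not part of the original step.
GJ_PER_MTOE = 41.868e6  # gigajoules in one million tonnes of oil equivalent
GJ_PER_TWH = 3.6e6      # gigajoules in one terawatt-hour
print(GJ_PER_MTOE / GJ_PER_TWH)  # ~11.63, matching MTOE_TO_TWH

The loop below is then just a column-wise rescaling of every fuel-input series from Mtoe to TWh.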
- for column in df_fuel_input.set_index("year").columns: - df_fuel_input[column] *= MTOE_TO_TWH - - # Combine dataframes. - df_combined = dataframes.multi_merge(dfs=[df_fuel_input, df_supply, df_efficiency], how="outer", on="year") - - # Prepare metadata using one of the original tables. - tb_combined_metadata = tb_fuel_input.metadata - tb_combined_metadata.short_name = DATASET_SHORT_NAME - tb_combined_metadata.title = DATASET_TITLE - - # Create a new table with metadata from any of the tables. - tb_combined = catalog.Table(df_combined, metadata=tb_combined_metadata) - - # Add a country column (even if there is only one country) and set an appropriate index. - tb_combined["country"] = "United Kingdom" - tb_combined = tb_combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - return tb_combined - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Read dataset from meadow. - ds_meadow = catalog.Dataset(N.meadow_dataset.path) - # Load tables from meadow dataset. - tb_fuel_input = ds_meadow["fuel_input"] - tb_supply = ds_meadow["supply"] - tb_efficiency = ds_meadow["efficiency"] - - # - # Process data. - # - # Clean and combine tables. - tb_garden = combine_tables(tb_fuel_input=tb_fuel_input, tb_supply=tb_supply, tb_efficiency=tb_efficiency) - - # - # Save outputs. - # - # Create new dataset. - ds_garden = catalog.Dataset.create_empty(dest_dir) - ds_garden.metadata = ds_meadow.metadata - # Get metadata from yaml file. - ds_garden.metadata.update_from_yaml(N.metadata_path, if_source_exists="replace") - tb_garden.update_metadata_from_yaml(N.metadata_path, DATASET_SHORT_NAME) - - ds_garden.add(tb_garden) - ds_garden.save() diff --git a/etl/steps/archive/garden/un/2019/un_wpp.mapping.json b/etl/steps/archive/garden/un/2019/un_wpp.mapping.json deleted file mode 100644 index 8383a95feae..00000000000 --- a/etl/steps/archive/garden/un/2019/un_wpp.mapping.json +++ /dev/null @@ -1,243 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Africa": "Africa", - "Albania": "Albania", - "Algeria": "Algeria", - "American Samoa": "American Samoa", - "Andorra": "Andorra", - "Angola": "Angola", - "Anguilla": "Anguilla", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Aruba": "Aruba", - "Asia": "Asia", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bermuda": "Bermuda", - "Bhutan": "Bhutan", - "Bolivia (Plurinational State of)": "Bolivia", - "Bonaire, Sint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "British Virgin Islands": "British Virgin Islands", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cabo Verde": "Cape Verde", - "Cambodia": "Cambodia", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cayman Islands": "Cayman Islands", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Channel Islands": "Channel Islands", - "Chile": "Chile", - "China": "China", - "China, Hong Kong SAR": "Hong Kong", - "China, Macao SAR": "Macao", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo": "Congo", - "Cook Islands": "Cook Islands", - "Costa Rica": "Costa Rica", - "Croatia": 
"Croatia", - "Cuba": "Cuba", - "Cura\u00e7ao": "Curacao", - "Cyprus": "Cyprus", - "Czechia": "Czechia", - "C\u00f4te d'Ivoire": "Cote d'Ivoire", - "Democratic Republic of the Congo": "Democratic Republic of Congo", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Europe": "Europe", - "Falkland Islands (Malvinas)": "Falkland Islands", - "Faroe Islands": "Faroe Islands", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "French Guiana": "French Guiana", - "French Polynesia": "French Polynesia", - "Gabon": "Gabon", - "Gambia": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Gibraltar": "Gibraltar", - "Greece": "Greece", - "Greenland": "Greenland", - "Grenada": "Grenada", - "Guadeloupe": "Guadeloupe", - "Guam": "Guam", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran (Islamic Republic of)": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Isle of Man": "Isle of Man", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Lao People's Democratic Republic": "Laos", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Libya": "Libya", - "Liechtenstein": "Liechtenstein", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Martinique": "Martinique", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mayotte": "Mayotte", - "Melanesia": "Melanesia", - "Mexico": "Mexico", - "Monaco": "Monaco", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Montserrat": "Montserrat", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Caledonia": "New Caledonia", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "North Macedonia": "North Macedonia", - "Northern Mariana Islands": "Northern Mariana Islands", - "Norway": "Norway", - "Oceania": "Oceania", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Polynesia": "Polynesia", - "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", - "Qatar": "Qatar", - "Republic of Korea": "South Korea", - "Republic of Moldova": "Moldova", - "Romania": "Romania", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "R\u00e9union": "Reunion", - "Saint Barth\u00e9lemy": "Saint Barthelemy", - "Saint Helena": "Saint Helena", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Martin 
(French part)": "Saint Martin (French part)", - "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "San Marino": "San Marino", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syrian Arab Republic": "Syria", - "Tajikistan": "Tajikistan", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tokelau": "Tokelau", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Turks and Caicos Islands": "Turks and Caicos Islands", - "Tuvalu": "Tuvalu", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United Republic of Tanzania": "Tanzania", - "United States Virgin Islands": "United States Virgin Islands", - "United States of America": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela (Bolivarian Republic of)": "Venezuela", - "Viet Nam": "Vietnam", - "Wallis and Futuna Islands": "Wallis and Futuna", - "Western Sahara": "Western Sahara", - "World": "World", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "China (and dependencies)": "China", - "China, Taiwan Province of China": "Taiwan", - "Dem. People's Republic of Korea": "North Korea", - "Denmark (and dependencies)": "Denmark" -} diff --git a/etl/steps/archive/garden/un/2019/un_wpp.py b/etl/steps/archive/garden/un/2019/un_wpp.py deleted file mode 100644 index b874d35ff51..00000000000 --- a/etl/steps/archive/garden/un/2019/un_wpp.py +++ /dev/null @@ -1,67 +0,0 @@ -import json -from pathlib import Path -from typing import Dict, cast - -from owid.catalog import Dataset - -from etl.paths import DATA_DIR - -MAPPING_FILE = Path(__file__).with_suffix(".mapping.json") - - -def run(dest_dir: str) -> None: - harmonize_countries("meadow/un/2019/un_wpp", dest_dir) - - -def harmonize_countries(source_ds_path: str, dest_dir: str) -> None: - """ - Harmonize the country field of every table in the source dataset. 
- """ - # this is deliberately done in a generic way as a demonstration - source_ds = Dataset(DATA_DIR / source_ds_path) - mapping = load_mapping() - - ds = Dataset.create_empty(dest_dir, metadata=source_ds.metadata) - ds.metadata.short_name = "un_wpp" - ds.metadata.namespace = "un" - - for table in source_ds: - # harmonize this specific table - names = table.index.names - table = table.reset_index() - - # these tables don't need harmonization - if table.metadata.short_name in ("location_codes", "variant_codes"): - ds.add(table) - continue - - # drop locations with suffix `(and dependencies)` - table = table[~table.location.str.endswith("(and dependencies)")] - - # continents are duplicated for unknown reason - table = table.drop_duplicates() - - dimensions = [n for n in names if n != "location"] - - # harmonize countries; drop locations without a match (typically WB defined regions) - table = ( - table.assign(country=table.location.map(mapping)) - .dropna(subset=["country"]) - .drop(["location"], axis=1) - .set_index(["country"] + dimensions) - ) - - # make sure we don't have duplicate countries - assert table[table.index.duplicated()].empty - - ds.add(table) - - ds.save() - - -def load_mapping() -> Dict[str, str]: - country_mapping_file = MAPPING_FILE - with open(country_mapping_file) as istream: - mapping = json.load(istream) - - return cast(Dict[str, str], mapping) diff --git a/etl/steps/archive/garden/un/2022-07-07/un_sdg.country_exclude.json b/etl/steps/archive/garden/un/2022-07-07/un_sdg.country_exclude.json deleted file mode 100644 index 6186d0fdabc..00000000000 --- a/etl/steps/archive/garden/un/2022-07-07/un_sdg.country_exclude.json +++ /dev/null @@ -1,75 +0,0 @@ -[ - "Residual/unallocated ODA: Central Asia and Southern Asia", - "Residual/unallocated ODA: Eastern and South-eastern Asia", - "Residual/unallocated ODA: Latin America and the Caribbean", - "Residual/unallocated ODA: Oceania excl. Aus. and N. 
Zealand", - "Residual/unallocated ODA: Sub-Saharan Africa", - "Residual/unallocated ODA: Western Asia and Northern Africa", - "United States Minor Outlying Islands", - "Other non-OECD Asia", - "Other non-OECD Oceania", - "Other non-OECD Americas", - "Areas not elsewhere specified", - "Other Africa (IEA)", - "World Marine Bunkers", - "World Aviation Bunkers", - "Latin America", - "European Union (EU) Institutions", - "FAO Major Fishing Area: Pacific, Eastern Central", - "FAO Major Fishing Area: Pacific, Northeast", - "FAO Major Fishing Area: Pacific, Northwest", - "FAO Major Fishing Area: Pacific, Western Central", - "FAO Major Fishing Area: Pacific, Southwest", - "FAO Major Fishing Area: Atlantic, Northwest", - "FAO Major Fishing Area: Atlantic, Northeast", - "FAO Major Fishing Area: Indian Ocean, Eastern", - "FAO Major Fishing Area: Atlantic, Southeast", - "FAO Major Fishing Area: Indian Ocean, Western", - "FAO Major Fishing Area: Atlantic, Western Central", - "FAO Major Fishing Area: Atlantic, Eastern Central", - "FAO Major Fishing Area: Atlantic, Southwest", - "FAO Major Fishing Area: Pacific, Southeast", - "FAO Major Fishing Area: Mediterranean and Black Sea", - "Belgium and Luxembourg", - "Residual/unallocated ODA: Unspecified, developing countries", - "WTO Developing Member States", - "Central America", - "Eastern Africa", - "Middle Africa", - "Southern Africa", - "Americas", - "South America", - "Albania", - "Western Africa", - "Caribbean", - "Eastern Asia", - "Australia and New Zealand", - "Melanesia", - "Caucasus and Central Asia", - "Eastern Europe", - "Central Asia", - "Southern Asia", - "South-Eastern Asia", - "Southern Europe", - "Landlocked developing countries (LLDCs)", - "Western Asia (exc. Armenia, Azerbaijan, Cyprus, Israel and Georgia)", - "Developed regions (Europe, Cyprus, Israel, Northern America, Japan, Australia & New Zealand)", - "Eastern Asia (excluding Japan)", - "Oceania (exc. Australia and New Zealand)", - "Sub-Saharan Africa (inc. Sudan)", - "Northern Africa (exc. 
Sudan)", - "Northern Africa and Western Asia", - "Other non-specified areas in Eastern Asia", - "Southern Asia (excluding India)", - "Eastern Asia (excluding Japan and China)", - "International Centers (FAO)", - "Regional Centres (FAO)", - "ODA residual", - "European Union", - "Development Assistance Committee members (DAC)", - "Polynesia", - "Northern Europe", - "Micronesia", - "Western Asia", - "Western Europe" -] \ No newline at end of file diff --git a/etl/steps/archive/garden/un/2022-07-07/un_sdg.country_mapping.json b/etl/steps/archive/garden/un/2022-07-07/un_sdg.country_mapping.json deleted file mode 100644 index dd1d97b25d9..00000000000 --- a/etl/steps/archive/garden/un/2022-07-07/un_sdg.country_mapping.json +++ /dev/null @@ -1,284 +0,0 @@ -{ - "World": "World", - "Algeria": "Algeria", - "Angola": "Angola", - "Azerbaijan": "Azerbaijan", - "Argentina": "Argentina", - "Australia": "Australia", - "Austria": "Austria", - "Bangladesh": "Bangladesh", - "Armenia": "Armenia", - "Belgium": "Belgium", - "Bhutan": "Bhutan", - "Bolivia (Plurinational State of)": "Bolivia", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Belize": "Belize", - "Solomon Islands": "Solomon Islands", - "Bulgaria": "Bulgaria", - "Myanmar": "Myanmar", - "Burundi": "Burundi", - "Belarus": "Belarus", - "Cameroon": "Cameroon", - "Canada": "Canada", - "Cabo Verde": "Cape Verde", - "Central African Republic": "Central African Republic", - "Sri Lanka": "Sri Lanka", - "Western Asia": "Western Asia", - "Chad": "Chad", - "Chile": "Chile", - "Northern Europe": "Northern Europe", - "Western Europe": "Western Europe", - "China": "China", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo": "Congo", - "Democratic Republic of the Congo": "Democratic Republic of Congo", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cyprus": "Cyprus", - "Czechia": "Czechia", - "Benin": "Benin", - "Denmark": "Denmark", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "El Salvador": "El Salvador", - "Ethiopia": "Ethiopia", - "Estonia": "Estonia", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "Djibouti": "Djibouti", - "Gabon": "Gabon", - "Georgia": "Georgia", - "Gambia": "Gambia", - "State of Palestine": "Palestine", - "Germany": "Germany", - "Ghana": "Ghana", - "Kiribati": "Kiribati", - "Greece": "Greece", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran (Islamic Republic of)": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "C\u00f4te d'Ivoire": "Cote d'Ivoire", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Kazakhstan": "Kazakhstan", - "Jordan": "Jordan", - "Kenya": "Kenya", - "Republic of Korea": "South Korea", - "Kosovo": "Kosovo", - "Kyrgyzstan": "Kyrgyzstan", - "Lao People's Democratic Republic": "Laos", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Latvia": "Latvia", - "Liberia": "Liberia", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", - "Malta": "Malta", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mexico": "Mexico", - "Mongolia": "Mongolia", - "Republic of Moldova": "Moldova", - "Montenegro": "Montenegro", - "Morocco": "Morocco", - "Mozambique": "Mozambique", 
- "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "Vanuatu": "Vanuatu", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "Norway": "Norway", - "Micronesia (Federated States of)": "Micronesia (country)", - "Pakistan": "Pakistan", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Guinea-Bissau": "Guinea-Bissau", - "Timor-Leste": "East Timor", - "Romania": "Romania", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "Saint Lucia": "Saint Lucia", - "Sao Tome and Principe": "Sao Tome and Principe", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Slovakia": "Slovakia", - "Viet Nam": "Vietnam", - "Slovenia": "Slovenia", - "Somalia": "Somalia", - "South Africa": "South Africa", - "Zimbabwe": "Zimbabwe", - "Spain": "Spain", - "South Sudan": "South Sudan", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Eswatini": "Eswatini", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syrian Arab Republic": "Syria", - "Tajikistan": "Tajikistan", - "Thailand": "Thailand", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - "United Arab Emirates": "United Arab Emirates", - "Tunisia": "Tunisia", - "Turkey": "Turkey", - "Turkmenistan": "Turkmenistan", - "Tuvalu": "Tuvalu", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "North Macedonia": "North Macedonia", - "Egypt": "Egypt", - "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", - "United Republic of Tanzania": "Tanzania", - "United States of America": "United States", - "Burkina Faso": "Burkina Faso", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Venezuela (Bolivarian Republic of)": "Venezuela", - "Samoa": "Samoa", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Cambodia": "Cambodia", - "Afghanistan": "Afghanistan", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Palau": "Palau", - "Antigua and Barbuda": "Antigua and Barbuda", - "Bahamas": "Bahamas", - "Polynesia": "Polynesia", - "Brunei Darussalam": "Brunei", - "Cuba": "Cuba", - "Dominica": "Dominica", - "Grenada": "Grenada", - "Liechtenstein": "Liechtenstein", - "Aruba": "Aruba", - "New Zealand": "New Zealand", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Anguilla": "Anguilla", - "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "San Marino": "San Marino", - "Singapore": "Singapore", - "Sudan [former]": "Sudan [former]", - "Turks and Caicos Islands": "Turks and Caicos Islands", - "Marshall Islands": "Marshall Islands", - "Bahrain": "Bahrain", - "Bermuda": "Bermuda", - "Cook Islands": "Cook Islands", - "French Guiana": "French Guiana", - "Guadeloupe": "Guadeloupe", - "Martinique": "Martinique", - "Puerto Rico": "Puerto Rico", - "Saint Martin (French Part)": "Saint Martin (French part)", - "Saudi Arabia": "Saudi Arabia", - "United States Virgin Islands": "United States Virgin Islands", - "Andorra": "Andorra", - "Barbados": "Barbados", - "British Virgin Islands": "British Virgin Islands", - "Cayman Islands": "Cayman Islands", - "China, Hong Kong Special Administrative Region": "Hong Kong", - "Kuwait": "Kuwait", - "Libya": "Libya", - "China, Macao Special Administrative Region": "Macao", - "Cura\u00e7ao": "Curacao", - "Sint Maarten (Dutch 
part)": "Sint Maarten (Dutch part)", - "New Caledonia": "New Caledonia", - "Qatar": "Qatar", - "Isle of Man": "Isle of Man", - "Oman": "Oman", - "American Samoa": "American Samoa", - "Micronesia": "Micronesia", - "Mayotte": "Mayotte", - "Falkland Islands (Malvinas)": "Falkland Islands", - "French Polynesia": "French Polynesia", - "Gibraltar": "Gibraltar", - "Greenland": "Greenland", - "Guam": "Guam", - "Democratic People's Republic of Korea": "North Korea", - "Monaco": "Monaco", - "Montserrat": "Montserrat", - "Niue": "Niue", - "Northern Mariana Islands": "Northern Mariana Islands", - "R\u00e9union": "Reunion", - "Saint Barth\u00e9lemy": "Saint Barthlemy", - "Saint Helena": "Saint Helena", - "Tokelau": "Tokelau", - "Channel Islands": "Channel Islands", - "Wallis and Futuna Islands": "Wallis and Futuna", - "Low income economies (WB)": "Low income economies", - "Lower middle economies (WB)": "Lower middle economies", - "Low and middle income economies (WB)": "Low and middle income economies", - "Upper middle economies (WB)": "Upper middle economies", - "Holy See": "Vatican", - "\u00c5land Islands": "Aland Islands", - "Netherlands Antilles": "Netherlands Antilles", - "Serbia and Montenegro [former]": "Serbia and Montenegro [former]", - "Jersey": "Jersey", - "Western Sahara": "Western Sahara", - "British Indian Ocean Territory": "British Indian Ocean Territory", - "Christmas Island": "Christmas Island", - "Cocos (Keeling) Islands": "Cocos Islands", - "South Georgia and the South Sandwich Islands": "South Georgia and the South Sandwich Islands", - "French Southern Territories": "French Southern Territories", - "Heard Island and McDonald Islands": "Heard Island and McDonald Islands", - "Norfolk Island": "Norfolk Island", - "Pitcairn": "Pitcairn", - "Svalbard and Jan Mayen Islands": "Svalbard and Jan Mayen", - "Guernsey": "Guernsey", - "Bonaire, Sint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", - "Bouvet Island": "Bouvet Island", - "Iraq (Central Iraq)": "Iraq (Central Iraq)", - "Iraq (Kurdistan Region)": "Iraq (Kurdistan Region)", - "United Kingdom (England and Wales)": "United Kingdom (England and Wales)", - "United Kingdom (Northern Ireland)": "United Kingdom (Northern Ireland)", - "United Kingdom (Scotland)": "United Kingdom (Scotland)", - "Yugoslavia [former]": "Yugoslavia [former]", - "Türkiye": "Turkey", - "Faroe Islands": "Faroe Islands", - "Central and Southern Asia": "Central and Southern Asia (UN)", - "Eastern and South-Eastern Asia": "Eastern and South-Eastern Asia (UN)", - "Europe and Northern America": "Europe and Northern America (UN)", - "Latin America and the Caribbean": "Latin America and the Caribbean (UN)", - "Middle East and North Africa": "Middle East and North Africa (UN)", - "Northern Africa": "Northern Africa (UN)", - "Oceania": "Oceania (UN)", - "Sub-Saharan Africa": "Sub-Saharan Africa (UN)", - "Asia": "Asia (UN)", - "Africa": "Africa (UN)", - "Northern America": "Northern America (UN)", - "Europe": "Europe (UN)", - "Least Developed Countries (LDCs)": "Least Developed Countries (LDCs)", - "Developing regions": "Developing regions", - "Small island developing States (SIDS)": "Small Island Developing States (SIDS)" -} diff --git a/etl/steps/archive/garden/un/2022-07-07/un_sdg.py b/etl/steps/archive/garden/un/2022-07-07/un_sdg.py deleted file mode 100644 index 5e8a25ab984..00000000000 --- a/etl/steps/archive/garden/un/2022-07-07/un_sdg.py +++ /dev/null @@ -1,347 +0,0 @@ -import json -from pathlib import Path -from typing import Any, Dict, List, Tuple - -import 
numpy as np -import pandas as pd -from owid.catalog import Dataset, Table -from owid.catalog.utils import underscore -from owid.walden import Catalog -from structlog import get_logger - -from etl.paths import DATA_DIR - -COUNTRY_MAPPING_PATH = Path(__file__).parent / "un_sdg.country_mapping.json" - -BASE_URL = "https://unstats.un.org/sdgapi" -VERSION = Path(__file__).parent.stem -FNAME = Path(__file__).stem -NAMESPACE = "un_sdg" - -log = get_logger() - - -def run(dest_dir: str, query: str = "") -> None: - log.info("Reading in dataset from Meadow...") - ds_meadow = Dataset((DATA_DIR / f"meadow/{NAMESPACE}/{VERSION}/{FNAME}").as_posix()) - - assert len(ds_meadow.table_names) == 1, "Expected meadow dataset to have only one table, but found > 1 table names." - tb_meadow = ds_meadow[FNAME] - df = pd.DataFrame(tb_meadow) - df = create_units(df) - df = manual_clean_data(df) - - log.info("Harmonizing entity names...") - country_mapping = load_country_mapping() - excluded_countries = load_excluded_countries() - df = df.loc[~df.country.isin(excluded_countries)] - assert df["country"].notnull().all() - countries = df["country"].map(country_mapping) - if countries.isnull().any(): - missing_countries = [x for x in df["country"].drop_duplicates() if x not in country_mapping] - raise RuntimeError( - "The following raw country names have not been harmonized. " - f"Please: (a) edit {COUNTRY_MAPPING_PATH} to include these country " - "names; or (b) remove these country names from the raw table." - f"Raw country names: {missing_countries}" - ) - df["country"] = countries - assert df["country"].notnull().all() - assert df["value"].notnull().all() - assert not df.isnull().all(axis=1).any(), "Unexpected state: One or more rows contains only NaN values." - - if query: - df = df.query(query) - - log.info("Creating data tables...") - - all_tables = create_tables(df) - - all_tables = create_omms(all_tables) - - log.info("Saving data tables...") - - ds_garden = Dataset.create_empty(dest_dir) - ds_garden.metadata = ds_meadow.metadata - - for table in all_tables: - log.info( - "un_sdg.create_garden_table", - indicator=table.index[0][4], - series_code=table.index[0][5], - ) - - tb_garden = Table(table) - tb_garden.metadata = tb_meadow.metadata - short_name = tb_garden.index[0][4] + "_" + tb_garden.index[0][5] - tb_garden.metadata.short_name = underscore(short_name) - ds_garden.add(tb_garden) - - ds_garden.save() - - -def create_tables(original_df: pd.DataFrame) -> List[pd.DataFrame]: - original_df = original_df.copy(deep=False) - - dim_description = get_dimension_description() - init_dimensions = list(dim_description.keys()) - init_dimensions = list(set(init_dimensions).intersection(list(original_df.columns))) - init_dimensions = sorted(init_dimensions) - init_non_dimensions = list([c for c in original_df.columns if c not in set(init_dimensions)]) - - all_series = original_df.groupby(["indicator", "seriescode"]) - - output_tables = [] - len_dimensions = [] - for group_name, df_group in all_series: - log.info( - "un_sdg.create_dataframe.group", - indicator=group_name[0], - series=group_name[1], - ) - df_dim, dimensions = get_series_with_relevant_dimensions(df_group, init_dimensions, init_non_dimensions) - len_dimensions.append(len(dimensions)) - if len(dimensions) == 0: - # no additional dimensions - table = generate_tables_for_indicator_and_series( - dim_dict=dim_description, data_dimensions=df_dim, dimensions=dimensions - ) - table_fil = table[ - [ - "country", - "year", - "goal", - "target", - "indicator", - 
"seriescode", - "seriesdescription", - "value", - "long_unit", - "short_unit", - "source", - ] - ] - table_fil = table_fil.dropna() - table_fil.set_index( - [ - "country", - "year", - "goal", - "target", - "indicator", - "seriescode", - ], - inplace=True, - verify_integrity=True, - ) - - output_tables.append(table_fil) - - else: - # has additional dimensions - tables = generate_tables_for_indicator_and_series( - dim_dict=dim_description, data_dimensions=df_dim, dimensions=dimensions - ) - tables_fil = tables[ - [ - "country", - "year", - "goal", - "target", - "indicator", - "seriescode", - "seriesdescription", - "value", - "long_unit", - "short_unit", - "source", - ] - + dimensions - ] - tables_fil = tables_fil.dropna() - tables_fil.set_index( - [ - "country", - "year", - "goal", - "target", - "indicator", - "seriescode", - ] - + dimensions, - inplace=True, - verify_integrity=True, - ) - - output_tables.append(tables_fil) - return output_tables - - -def create_units(df: pd.DataFrame) -> pd.DataFrame: - df = df.copy(deep=False) - unit_description = get_attributes_description() - df["long_unit"] = df["units"].map(unit_description) - df["short_unit"] = create_short_unit(df["long_unit"]) - return df - - -def generate_tables_for_indicator_and_series( - dim_dict: dict[Any, Any], - data_dimensions: pd.DataFrame, - dimensions: List[str], -) -> pd.DataFrame: - if len(dimensions) == 0: - return data_dimensions - else: - for dim in dimensions: - data_dimensions[dim] = data_dimensions[dim].map(dim_dict[dim]) - - return data_dimensions - - -def get_series_with_relevant_dimensions( - data_series: pd.DataFrame, - init_dimensions: List[str], - init_non_dimensions: List[str], -) -> Tuple[pd.DataFrame, List[str]]: - """For a given indicator and series, return a tuple: - - data filtered to that indicator and series - - names of relevant dimensions - - unique values for each relevant dimension - """ - - non_null_dimensions_columns = [col for col in init_dimensions if data_series.loc[:, col].notna().any()] - dimension_names = [] - - for c in non_null_dimensions_columns: - uniques = data_series[c].unique() - if ( - len(uniques) > 1 - ): # Means that columns where the value doesn't change aren't included e.g. Nature is typically consistent across a dimension whereas Age and Sex are less likely to be. - dimension_names.append(c) - return ( - data_series.loc[ - :, - data_series.columns.intersection(init_non_dimensions + list(dimension_names)), - ], - dimension_names, - ) - - -def create_short_unit(long_unit: pd.Series) -> np.ndarray[Any, np.dtype[Any]]: - conditions = [ - (long_unit.str.contains("PERCENT")) | (long_unit.str.contains("Percentage") | (long_unit.str.contains("%"))), - (long_unit.str.contains("KG")) | (long_unit.str.contains("Kilograms")), - (long_unit.str.contains("USD")) | (long_unit.str.contains("usd")), - ] - - choices = ["%", "kg", "$"] - - short_unit = np.select(conditions, choices, default="") - return short_unit - - -def manual_clean_data(df: pd.DataFrame) -> pd.DataFrame: - """ - Some values for 15.2.1 is above 100% when this shouldn't be possible. This sets the max value to 100. 
- Returns: - pd.DataFrame with cleaned values for 15.2.1 - """ - df = df.copy(deep=False) - - df["value"] = df["value"].astype(float) - df.loc[ - (df["long_unit"] == "Percentage") & (df["value"] > 100) & (df["indicator"] == "15.2.1"), - "value", - ] = 100 - - # Clean the IHR Capacity column, duplicate labelling of some attributes which doesn't work well with the grapher - df["ihr_capacity"] = df["ihr_capacity"].replace( - [ - "IHR02", - "IHR03", - "IHR06", - "IHR07", - "IHR08", - "IHR09", - "IHR10", - "IHR11", - "IHR12", - ], - [ - "SPAR02", - "SPAR06", - "SPAR10", - "SPAR07", - "SPAR05", - "SPAR11", - "SPAR03", - "SPAR04", - "SPAR12", - ], - ) - # Dropping average marine acidity as we don't have a way to visualise it - df = df[~df["seriescode"].isin(["ER_OAW_MNACD"])] - df = df.drop(["level_0", "index"], axis=1, errors="ignore") - - return df - - -def get_attributes_description() -> Any: - walden_ds = Catalog().find_one(namespace=NAMESPACE, short_name="unit", version=VERSION) - local_file = walden_ds.ensure_downloaded() - with open(local_file) as json_file: - units = json.load(json_file) - return units - - -def get_dimension_description() -> dict[str, str]: - walden_ds = Catalog().find_one(namespace=NAMESPACE, short_name="dimension", version=VERSION) - local_file = walden_ds.ensure_downloaded() - with open(local_file) as json_file: - dims: dict[str, str] = json.load(json_file) - # underscore to match the df column names - for key in dims.copy(): - dims[underscore(key)] = dims.pop(key) - return dims - - -def load_country_mapping() -> Dict[str, str]: - with open(COUNTRY_MAPPING_PATH, "r") as f: - mapping = json.load(f) - assert isinstance(mapping, dict) - return mapping - - -def load_excluded_countries() -> List[str]: - with open(Path(__file__).parent / f"{FNAME}.country_exclude.json", "r") as f: - data = json.load(f) - assert isinstance(data, list) - return data - - -def create_omms(all_tabs: List[pd.DataFrame]) -> List[pd.DataFrame]: - new_tabs = [] - for table in all_tabs: - if table.index[0][5] in ("ER_BDY_ABT2NP", "SG_SCP_PROCN"): - table = table.copy(deep=False) - table = table.query('level_status != "No breakdown"') - - # exclude regions which contain more than one country and cannot be - # converted to a level_status for a single country - vc = table.groupby(["country", "year"]).value.sum().sort_values(ascending=False) - regions = set(vc[vc > 1].index.get_level_values(0)) - table = table[~table.index.get_level_values("country").isin(regions)] - - table.reset_index(level=["level_status"], inplace=True) # type: ignore - table["value"] = table["level_status"] - table.drop(columns=["level_status"], inplace=True) - - new_tabs.append(table) - - return new_tabs - - -if __name__ == "__main__": - # test script for a single indicator with `python etl/steps/data/meadow/un_sdg/2022-05-26/un_sdg.py` - run("/tmp/un_sdg", query="Indicator == '1.1.1'") diff --git a/etl/steps/archive/grapher/agriculture/2023-04-20/long_term_wheat_yields.py b/etl/steps/archive/grapher/agriculture/2023-04-20/long_term_wheat_yields.py deleted file mode 100644 index 885a9500289..00000000000 --- a/etl/steps/archive/grapher/agriculture/2023-04-20/long_term_wheat_yields.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Load a garden dataset and create a grapher dataset.""" - -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset, grapher_checks - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load inputs. 
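# A minimal, self-contained sketch of the harmonization guard used in the
# un_sdg garden step deleted above: raw names are mapped through the JSON
# country mapping, and any unmapped name fails loudly rather than being
# dropped silently. The mapping and data here are illustrative only, not
# taken from the real dataset.
import pandas as pd

country_mapping = {"Viet Nam": "Vietnam", "Russian Federation": "Russia"}
df = pd.DataFrame({"country": ["Viet Nam", "Russian Federation"], "value": [1, 2]})

countries = df["country"].map(country_mapping)
if countries.isnull().any():
    missing = [c for c in df["country"].drop_duplicates() if c not in country_mapping]
    raise RuntimeError(f"The following raw country names have not been harmonized: {missing}")
df["country"] = countries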
- # - # Load garden dataset. - ds_garden: Dataset = paths.load_dependency("long_term_wheat_yields") - - # For simplicity, remove sources descriptions (which are mostly irrelevant for the variable used here). - ds_garden.metadata.sources[0].description = "" - ds_garden.metadata.sources[1].description = "" - - # Read table from garden dataset. - tb_garden = ds_garden["long_term_wheat_yields"] - - # - # Process data. - # - - # - # Save outputs. - # - # Create a new grapher dataset with the same metadata as the garden dataset. - ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) - - # - # Checks. - # - grapher_checks(ds_grapher) - - # Save changes in the new grapher dataset. - ds_grapher.save() diff --git a/etl/steps/archive/grapher/agriculture/2023-04-21/uk_long_term_yields.py b/etl/steps/archive/grapher/agriculture/2023-04-21/uk_long_term_yields.py deleted file mode 100644 index b42eab49b84..00000000000 --- a/etl/steps/archive/grapher/agriculture/2023-04-21/uk_long_term_yields.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Load a garden dataset and create a grapher dataset.""" - -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset, grapher_checks - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load garden dataset. - ds_garden: Dataset = paths.load_dependency("uk_long_term_yields") - - # For simplicity, remove sources descriptions (which are mostly irrelevant for the variable used here). - ds_garden.metadata.sources[0].description = "" - ds_garden.metadata.sources[1].description = "" - ds_garden.metadata.sources[2].description = "" - - # Read table from garden dataset. - tb_garden = ds_garden["uk_long_term_yields"] - - # - # Process data. - # - - # - # Save outputs. - # - # Create a new grapher dataset with the same metadata as the garden dataset. - ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) - - # - # Checks. - # - grapher_checks(ds_grapher) - - # Save changes in the new grapher dataset. - ds_grapher.save() diff --git a/etl/steps/archive/grapher/agriculture/2023-05-26/attainable_yields.py b/etl/steps/archive/grapher/agriculture/2023-05-26/attainable_yields.py deleted file mode 100644 index 16f2090a0db..00000000000 --- a/etl/steps/archive/grapher/agriculture/2023-05-26/attainable_yields.py +++ /dev/null @@ -1,39 +0,0 @@ -"""Load a garden dataset and create a grapher dataset.""" - -from typing import cast - -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset, grapher_checks - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load garden dataset. - ds_garden = cast(Dataset, paths.load_dependency("attainable_yields")) - - # Read table from garden dataset. - tb = ds_garden["attainable_yields"] - - # - # Process data. - # - - # - # Save outputs. - # - # Create a new grapher dataset with the same metadata as the garden dataset. - ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) - - # - # Checks. - # - grapher_checks(ds_grapher) - - # Save changes in the new grapher dataset. 
- ds_grapher.save() diff --git a/etl/steps/archive/grapher/agriculture/2023-05-30/attainable_yields.py b/etl/steps/archive/grapher/agriculture/2023-05-30/attainable_yields.py deleted file mode 100644 index 16f2090a0db..00000000000 --- a/etl/steps/archive/grapher/agriculture/2023-05-30/attainable_yields.py +++ /dev/null @@ -1,39 +0,0 @@ -"""Load a garden dataset and create a grapher dataset.""" - -from typing import cast - -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset, grapher_checks - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load garden dataset. - ds_garden = cast(Dataset, paths.load_dependency("attainable_yields")) - - # Read table from garden dataset. - tb = ds_garden["attainable_yields"] - - # - # Process data. - # - - # - # Save outputs. - # - # Create a new grapher dataset with the same metadata as the garden dataset. - ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) - - # - # Checks. - # - grapher_checks(ds_grapher) - - # Save changes in the new grapher dataset. - ds_grapher.save() diff --git a/etl/steps/archive/grapher/agriculture/2023-05-30/long_term_crop_yields.py b/etl/steps/archive/grapher/agriculture/2023-05-30/long_term_crop_yields.py deleted file mode 100644 index d0218240360..00000000000 --- a/etl/steps/archive/grapher/agriculture/2023-05-30/long_term_crop_yields.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Load a garden dataset and create a grapher dataset.""" - -from typing import cast - -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset, grapher_checks - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - - # Load garden dataset. - ds_garden = cast(Dataset, paths.load_dependency("long_term_crop_yields")) - - # Read table from garden dataset. - tb = ds_garden["long_term_crop_yields"] - - # - # Save outputs. - # - # Create a new grapher dataset with the same metadata as the garden dataset. - ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) - - # To avoid a very long (and mostly irrelevant) dataset description, remove the sources descriptions. - for source in ds_grapher.metadata.sources: - source.description = None - - # - # Checks. - # - grapher_checks(ds_grapher) - - # Save changes in the new grapher dataset. - ds_grapher.save() diff --git a/etl/steps/archive/grapher/bp/2022-07-14/energy_mix.py b/etl/steps/archive/grapher/bp/2022-07-14/energy_mix.py deleted file mode 100644 index 753d6a526e3..00000000000 --- a/etl/steps/archive/grapher/bp/2022-07-14/energy_mix.py +++ /dev/null @@ -1,18 +0,0 @@ -"""Grapher step for BP's energy mix 2022 dataset. -""" - -from owid import catalog - -from etl.paths import DATA_DIR - -DATASET_PATH = DATA_DIR / "garden" / "bp" / "2022-07-14" / "energy_mix" - - -def run(dest_dir: str) -> None: - garden_dataset = catalog.Dataset(DATASET_PATH) - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - - # There is only one table in the dataset, with the same name as the dataset. 
- table = garden_dataset[garden_dataset.table_names[0]].reset_index().drop(columns=["country_code"]) - dataset.add(table) - dataset.save() diff --git a/etl/steps/archive/grapher/bp/2022-07-14/statistical_review.py b/etl/steps/archive/grapher/bp/2022-07-14/statistical_review.py deleted file mode 100644 index 079e18c53b9..00000000000 --- a/etl/steps/archive/grapher/bp/2022-07-14/statistical_review.py +++ /dev/null @@ -1,18 +0,0 @@ -"""Grapher step for BP's statistical review 2022 dataset. -""" - -from owid import catalog - -from etl.paths import DATA_DIR - -DATASET_PATH = DATA_DIR / "garden" / "bp" / "2022-07-14" / "statistical_review" - - -def run(dest_dir: str) -> None: - garden_dataset = catalog.Dataset(DATASET_PATH) - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - - # There is only one table in the dataset, with the same name as the dataset. - table = garden_dataset[garden_dataset.table_names[0]].reset_index().drop(columns=["country_code"]) - dataset.add(table) - dataset.save() diff --git a/etl/steps/archive/grapher/bp/2022-09-19/fossil_fuel_reserves_production_ratio.py b/etl/steps/archive/grapher/bp/2022-09-19/fossil_fuel_reserves_production_ratio.py deleted file mode 100644 index 5e6d535dc33..00000000000 --- a/etl/steps/archive/grapher/bp/2022-09-19/fossil_fuel_reserves_production_ratio.py +++ /dev/null @@ -1,17 +0,0 @@ -from owid import catalog -from shared import CURRENT_DIR - -from etl.helpers import PathFinder - -DATASET_SHORT_NAME = "fossil_fuel_reserves_production_ratio" -N = PathFinder(str(CURRENT_DIR / DATASET_SHORT_NAME)) - - -def run(dest_dir: str) -> None: - # Create new grapher dataset. - dataset = catalog.Dataset.create_empty(dest_dir, N.garden_dataset.metadata) - # Prepare table for grapher. - table = N.garden_dataset[DATASET_SHORT_NAME].reset_index() - # Add table and save dataset. - dataset.add(table) - dataset.save() diff --git a/etl/steps/archive/grapher/bp/2022-09-19/shared.py b/etl/steps/archive/grapher/bp/2022-09-19/shared.py deleted file mode 100644 index 7e7f4d18c5b..00000000000 --- a/etl/steps/archive/grapher/bp/2022-09-19/shared.py +++ /dev/null @@ -1,3 +0,0 @@ -from pathlib import Path - -CURRENT_DIR = Path(__file__).parent diff --git a/etl/steps/archive/grapher/bp/2022-12-28/energy_mix.py b/etl/steps/archive/grapher/bp/2022-12-28/energy_mix.py deleted file mode 100644 index def70075724..00000000000 --- a/etl/steps/archive/grapher/bp/2022-12-28/energy_mix.py +++ /dev/null @@ -1,18 +0,0 @@ -"""Grapher step for BP's energy mix 2022 dataset. -""" - -from owid import catalog - -from etl.paths import DATA_DIR - -DATASET_PATH = DATA_DIR / "garden" / "bp" / "2022-12-28" / "energy_mix" - - -def run(dest_dir: str) -> None: - garden_dataset = catalog.Dataset(DATASET_PATH) - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - - # There is only one table in the dataset, with the same name as the dataset. 
- table = garden_dataset[garden_dataset.table_names[0]].reset_index().drop(columns=["country_code"]) - dataset.add(table) - dataset.save() diff --git a/etl/steps/archive/grapher/bp/2022-12-28/fossil_fuel_reserves_production_ratio.py b/etl/steps/archive/grapher/bp/2022-12-28/fossil_fuel_reserves_production_ratio.py deleted file mode 100644 index aaaaf495cc1..00000000000 --- a/etl/steps/archive/grapher/bp/2022-12-28/fossil_fuel_reserves_production_ratio.py +++ /dev/null @@ -1,18 +0,0 @@ -from owid import catalog - -from etl.paths import DATA_DIR - -DATASET_SHORT_NAME = "fossil_fuel_reserves_production_ratio" -DATASET_PATH = DATA_DIR / "garden" / "bp" / "2022-12-28" / DATASET_SHORT_NAME - - -def run(dest_dir: str) -> None: - # Load dataset from garden. - garden_dataset = catalog.Dataset(DATASET_PATH) - # Create new grapher dataset. - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - # Prepare table for grapher. - table = garden_dataset[DATASET_SHORT_NAME].reset_index() - # Add table and save dataset. - dataset.add(table) - dataset.save() diff --git a/etl/steps/archive/grapher/bp/2022-12-28/statistical_review.py b/etl/steps/archive/grapher/bp/2022-12-28/statistical_review.py deleted file mode 100644 index c8402ec6a45..00000000000 --- a/etl/steps/archive/grapher/bp/2022-12-28/statistical_review.py +++ /dev/null @@ -1,18 +0,0 @@ -"""Grapher step for BP's statistical review 2022 dataset. -""" - -from owid import catalog - -from etl.paths import DATA_DIR - -DATASET_PATH = DATA_DIR / "garden" / "bp" / "2022-12-28" / "statistical_review" - - -def run(dest_dir: str) -> None: - garden_dataset = catalog.Dataset(DATASET_PATH) - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - - # There is only one table in the dataset, with the same name as the dataset. - table = garden_dataset["statistical_review"].reset_index().drop(columns=["country_code"]) - dataset.add(table) - dataset.save() diff --git a/etl/steps/archive/grapher/bp/2023-02-20/energy_mix.py b/etl/steps/archive/grapher/bp/2023-02-20/energy_mix.py deleted file mode 100644 index 8e8c9e9ba1d..00000000000 --- a/etl/steps/archive/grapher/bp/2023-02-20/energy_mix.py +++ /dev/null @@ -1,33 +0,0 @@ -"""Grapher step for BP's energy mix dataset. -""" - -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load garden dataset. - ds_garden: Dataset = paths.load_dependency("energy_mix") - - # Read table from garden dataset. - tb_garden = ds_garden["energy_mix"].reset_index() - - # - # Process data. - # - # Remove unnecessary columns. - tb_garden = tb_garden.drop(columns=["country_code"]) - - # - # Save outputs. - # - # Create new grapher dataset. - ds_grapher = create_dataset(dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) - ds_grapher.save() diff --git a/etl/steps/archive/grapher/cait/2022-08-10/all_ghg_emissions.py b/etl/steps/archive/grapher/cait/2022-08-10/all_ghg_emissions.py deleted file mode 100644 index 9185e2acc88..00000000000 --- a/etl/steps/archive/grapher/cait/2022-08-10/all_ghg_emissions.py +++ /dev/null @@ -1,42 +0,0 @@ -from owid import catalog - -from etl.helpers import PathFinder -from etl.paths import DATA_DIR - -from .shared import GARDEN_DATASET_VERSION, NAMESPACE - -# Name of table to load from garden dataset to convert into a grapher dataset. 
-TABLE_NAME = "greenhouse_gas_emissions_by_sector" -# Name of output grapher dataset. -GRAPHER_DATASET_TITLE = "Greenhouse gas emissions by sector (CAIT, 2022)" -# Path to garden dataset to be loaded. -DATASET_PATH = DATA_DIR / "garden" / NAMESPACE / GARDEN_DATASET_VERSION / "ghg_emissions_by_sector" - -N = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - garden_dataset = catalog.Dataset(DATASET_PATH) - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - - #################################################################################################################### - # Grapher seems to be taking the name from the dataset instead of the table. - # Given that there are different tables in the same dataset, use the table title as the dataset title. - dataset.metadata.title = GRAPHER_DATASET_TITLE - dataset.metadata.short_name = TABLE_NAME - dataset.metadata.short_name = N.short_name - #################################################################################################################### - - dataset.save() - - table = garden_dataset[TABLE_NAME].reset_index().drop(columns=["population"]) - # For convenience, change units from "million tonnes" to "tonnes" and multiply all variables by a million. - # Doing this, grapher will know when to use the word "million" and when to use "billion". - variables = [column for column in table.columns if column not in ["country", "year"]] - for column in variables: - if table[column].metadata.unit == "million tonnes": - table[column].metadata.unit = "tonnes" - table[column].metadata.short_unit = "t" - table[column].metadata.display["conversionFactor"] = 1e6 - table[column].metadata.description = table[column].metadata.description.replace("million tonnes", "tonnes") - dataset.add(table) diff --git a/etl/steps/archive/grapher/cait/2022-08-10/ch4_emissions.py b/etl/steps/archive/grapher/cait/2022-08-10/ch4_emissions.py deleted file mode 100644 index dda9779a1bf..00000000000 --- a/etl/steps/archive/grapher/cait/2022-08-10/ch4_emissions.py +++ /dev/null @@ -1,40 +0,0 @@ -from owid import catalog - -from etl.helpers import PathFinder -from etl.paths import DATA_DIR - -from .shared import GARDEN_DATASET_VERSION, NAMESPACE - -# Name of table to load from garden dataset to convert into a grapher dataset. -TABLE_NAME = "methane_emissions_by_sector" -# Name of output grapher dataset. -GRAPHER_DATASET_TITLE = "Methane emissions by sector (CAIT, 2022)" -# Path to garden dataset to be loaded. -DATASET_PATH = DATA_DIR / "garden" / NAMESPACE / GARDEN_DATASET_VERSION / "ghg_emissions_by_sector" -N = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - garden_dataset = catalog.Dataset(DATASET_PATH) - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - - #################################################################################################################### - # Grapher seems to be taking the name from the dataset instead of the table. - # Given that there are different tables in the same dataset, use the table title as the dataset title. - dataset.metadata.title = GRAPHER_DATASET_TITLE - dataset.metadata.short_name = N.short_name - #################################################################################################################### - - dataset.save() - - table = garden_dataset[TABLE_NAME].reset_index().drop(columns=["population"]) - # For convenience, change units from "million tonnes" to "tonnes" and multiply all variables by a million. 
- # Doing this, grapher will know when to use the word "million" and when to use "billion". - variables = [column for column in table.columns if column not in ["country", "year"]] - for column in variables: - if table[column].metadata.unit == "million tonnes": - table[column].metadata.unit = "tonnes" - table[column].metadata.short_unit = "t" - table[column].metadata.display["conversionFactor"] = 1e6 - table[column].metadata.description = table[column].metadata.description.replace("million tonnes", "tonnes") - dataset.add(table) diff --git a/etl/steps/archive/grapher/cait/2022-08-10/co2_emissions.py b/etl/steps/archive/grapher/cait/2022-08-10/co2_emissions.py deleted file mode 100644 index 2c9c5ea8f03..00000000000 --- a/etl/steps/archive/grapher/cait/2022-08-10/co2_emissions.py +++ /dev/null @@ -1,40 +0,0 @@ -from owid import catalog - -from etl.helpers import PathFinder -from etl.paths import DATA_DIR - -from .shared import GARDEN_DATASET_VERSION, NAMESPACE - -# Name of table to load from garden dataset to convert into a grapher dataset. -TABLE_NAME = "carbon_dioxide_emissions_by_sector" -# Name of output grapher dataset. -GRAPHER_DATASET_TITLE = "Carbon dioxide emissions by sector (CAIT, 2022)" -# Path to garden dataset to be loaded. -DATASET_PATH = DATA_DIR / "garden" / NAMESPACE / GARDEN_DATASET_VERSION / "ghg_emissions_by_sector" -N = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - garden_dataset = catalog.Dataset(DATASET_PATH) - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - - #################################################################################################################### - # Grapher seems to be taking the name from the dataset instead of the table. - # Given that there are different tables in the same dataset, use the table title as the dataset title. - dataset.metadata.title = GRAPHER_DATASET_TITLE - dataset.metadata.short_name = N.short_name - #################################################################################################################### - - dataset.save() - - table = garden_dataset[TABLE_NAME].reset_index().drop(columns=["population"]) - # For convenience, change units from "million tonnes" to "tonnes" and multiply all variables by a million. - # Doing this, grapher will know when to use the word "million" and when to use "billion". - variables = [column for column in table.columns if column not in ["country", "year"]] - for column in variables: - if table[column].metadata.unit == "million tonnes": - table[column].metadata.unit = "tonnes" - table[column].metadata.short_unit = "t" - table[column].metadata.display["conversionFactor"] = 1e6 - table[column].metadata.description = table[column].metadata.description.replace("million tonnes", "tonnes") - dataset.add(table) diff --git a/etl/steps/archive/grapher/cait/2022-08-10/n2o_emissions.py b/etl/steps/archive/grapher/cait/2022-08-10/n2o_emissions.py deleted file mode 100644 index 1c78723a7ce..00000000000 --- a/etl/steps/archive/grapher/cait/2022-08-10/n2o_emissions.py +++ /dev/null @@ -1,40 +0,0 @@ -from owid import catalog - -from etl.helpers import PathFinder -from etl.paths import DATA_DIR - -from .shared import GARDEN_DATASET_VERSION, NAMESPACE - -# Name of table to load from garden dataset to convert into a grapher dataset. -TABLE_NAME = "nitrous_oxide_emissions_by_sector" -# Name of output grapher dataset. -GRAPHER_DATASET_TITLE = "Nitrous oxide emissions by sector (CAIT, 2022)" -# Path to garden dataset to be loaded. 
-DATASET_PATH = DATA_DIR / "garden" / NAMESPACE / GARDEN_DATASET_VERSION / "ghg_emissions_by_sector" -N = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - garden_dataset = catalog.Dataset(DATASET_PATH) - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - - #################################################################################################################### - # Grapher seems to be taking the name from the dataset instead of the table. - # Given that there are different tables in the same dataset, use the table title as the dataset title. - dataset.metadata.title = GRAPHER_DATASET_TITLE - dataset.metadata.short_name = N.short_name - #################################################################################################################### - - dataset.save() - - table = garden_dataset[TABLE_NAME].reset_index().drop(columns=["population"]) - # For convenience, change units from "million tonnes" to "tonnes" and multiply all variables by a million. - # Doing this, grapher will know when to use the word "million" and when to use "billion". - variables = [column for column in table.columns if column not in ["country", "year"]] - for column in variables: - if table[column].metadata.unit == "million tonnes": - table[column].metadata.unit = "tonnes" - table[column].metadata.short_unit = "t" - table[column].metadata.display["conversionFactor"] = 1e6 - table[column].metadata.description = table[column].metadata.description.replace("million tonnes", "tonnes") - dataset.add(table) diff --git a/etl/steps/archive/grapher/cait/2022-08-10/shared.py b/etl/steps/archive/grapher/cait/2022-08-10/shared.py deleted file mode 100644 index 5ecb3f14586..00000000000 --- a/etl/steps/archive/grapher/cait/2022-08-10/shared.py +++ /dev/null @@ -1,2 +0,0 @@ -NAMESPACE = "cait" -GARDEN_DATASET_VERSION = "2022-08-10" diff --git a/etl/steps/archive/grapher/demography/2023-04-14/population_density.py b/etl/steps/archive/grapher/demography/2023-04-14/population_density.py deleted file mode 100644 index 889df338b92..00000000000 --- a/etl/steps/archive/grapher/demography/2023-04-14/population_density.py +++ /dev/null @@ -1,36 +0,0 @@ -"""Load a garden dataset and create a grapher dataset.""" - -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset, grapher_checks - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load garden dataset. - ds_garden: Dataset = paths.load_dependency("population_density") - - # Read table from garden dataset. - tb_garden = ds_garden["population_density"] - - # - # Process data. - # - - # - # Save outputs. - # - # Create a new grapher dataset with the same metadata as the garden dataset. - ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) - # - # Checks. - # - grapher_checks(ds_grapher) - - # Save changes in the new grapher dataset. 
- ds_grapher.save() diff --git a/etl/steps/archive/grapher/emdat/2022-11-24/natural_disasters.py b/etl/steps/archive/grapher/emdat/2022-11-24/natural_disasters.py deleted file mode 100644 index 373f017d794..00000000000 --- a/etl/steps/archive/grapher/emdat/2022-11-24/natural_disasters.py +++ /dev/null @@ -1,54 +0,0 @@ -from copy import deepcopy - -from owid import catalog -from shared import CURRENT_DIR, DISASTER_TYPE_RENAMING - -from etl.helpers import PathFinder - -N = PathFinder(str(CURRENT_DIR / "natural_disasters")) - - -def create_wide_tables(table: catalog.Table, is_decade: bool) -> catalog.Table: - # Create wide dataframes. - table_wide = table.reset_index().pivot(index=["country", "year"], columns="type") - - if is_decade: - variable_name_suffix = "_decadal" - variable_title_suffix = " (decadal)" - else: - variable_name_suffix = "_yearly" - variable_title_suffix = "" - - # Store metadata of original table variables. - variable_metadata = {} - for column, subcolumn in table_wide.columns: - old_metadata = deepcopy(table[column].metadata) - new_variable = f"{column}_{subcolumn}" + variable_name_suffix - new_title = f"{old_metadata.title} - {DISASTER_TYPE_RENAMING[subcolumn]}" + variable_title_suffix - old_metadata.title = new_title - variable_metadata[new_variable] = old_metadata - - # Flatten column indexes. - table_wide.columns = [f"{column}_{subcolumn}" + variable_name_suffix for column, subcolumn in table_wide.columns] - - # Assign original variables metadata to new variables in wide table. - for variable in variable_metadata: - table_wide[variable].metadata = variable_metadata[variable] - - return table_wide - - -def run(dest_dir: str) -> None: - # Load garden tables and remove unnecessary columns. - table_yearly = N.garden_dataset["natural_disasters_yearly"].drop(columns=["population", "gdp"]) - table_decade = N.garden_dataset["natural_disasters_decadal"].drop(columns=["population", "gdp"]) - - # Create wide tables. - table_yearly_wide = create_wide_tables(table=table_yearly, is_decade=False) - table_decade_wide = create_wide_tables(table=table_decade, is_decade=True) - - # Create new grapher dataset, add tables, and save dataset. - dataset = catalog.Dataset.create_empty(dest_dir, N.garden_dataset.metadata) - dataset.add(table_yearly_wide) - dataset.add(table_decade_wide) - dataset.save() diff --git a/etl/steps/archive/grapher/emdat/2022-11-24/natural_disasters_global_by_type.py b/etl/steps/archive/grapher/emdat/2022-11-24/natural_disasters_global_by_type.py deleted file mode 100644 index 681e7683936..00000000000 --- a/etl/steps/archive/grapher/emdat/2022-11-24/natural_disasters_global_by_type.py +++ /dev/null @@ -1,31 +0,0 @@ -from owid import catalog -from shared import DISASTER_TYPE_RENAMING, GARDEN_DATASET_PATH, GARDEN_VERSION_YEAR - -GRAPHER_DATASET_TITLE = f"Global natural disasters by type (EM-DAT, {GARDEN_VERSION_YEAR})" -GRAPHER_DATASET_SHORT_NAME = "natural_disasters_global_by_type" - - -def run(dest_dir: str) -> None: - # Load garden dataset. - garden_dataset = catalog.Dataset(GARDEN_DATASET_PATH) - # Load table on yearly data. - table = garden_dataset["natural_disasters_yearly"].reset_index() - - # Select data for the World and remove unnecessary columns. - table_global = ( - table[table["country"] == "World"].drop(columns=["country", "population", "gdp"]).reset_index(drop=True) - ) - # Treat column for disaster type as the new entity (so they can be selected in grapher as if they were countries). 
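# A sketch of the entity swap performed just below: grapher treats the
# "country" column as the selectable entity, so renaming the disaster "type"
# column to "country" lets each disaster type be selected as if it were a
# country. Plain pandas, with made-up values:
import pandas as pd

df_global = pd.DataFrame({"type": ["Flood", "Drought"], "year": [2000, 2000], "deaths": [10, 20]})
df_global = df_global.rename(columns={"type": "country"}).set_index(["country", "year"]).sort_index()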
- table_global = table_global.rename(columns={"type": "country"}).replace(DISASTER_TYPE_RENAMING) - - # Set an appropriate index. - table_global = table_global.set_index(["country", "year"]).sort_index() - - # Create new grapher dataset, update metadata, add table, and save dataset. - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.metadata.title = GRAPHER_DATASET_TITLE - dataset.metadata.short_name = GRAPHER_DATASET_SHORT_NAME - table_global.metadata.title = GRAPHER_DATASET_TITLE - table_global.metadata.short_name = GRAPHER_DATASET_SHORT_NAME - dataset.add(table_global) - dataset.save() diff --git a/etl/steps/archive/grapher/energy/2022-07-20/fossil_fuel_production.py b/etl/steps/archive/grapher/energy/2022-07-20/fossil_fuel_production.py deleted file mode 100644 index b2f800749a4..00000000000 --- a/etl/steps/archive/grapher/energy/2022-07-20/fossil_fuel_production.py +++ /dev/null @@ -1,18 +0,0 @@ -"""Grapher step for the fossil fuel production dataset. -""" - -from owid import catalog - -from etl.paths import DATA_DIR - -DATASET_PATH = DATA_DIR / "garden" / "energy" / "2022-07-20" / "fossil_fuel_production" - - -def run(dest_dir: str) -> None: - garden_dataset = catalog.Dataset(DATASET_PATH) - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - - # There is only one table in the dataset, with the same name as the dataset. - table = garden_dataset[garden_dataset.table_names[0]].reset_index() - dataset.add(table) - dataset.save() diff --git a/etl/steps/archive/grapher/energy/2022-07-29/primary_energy_consumption.py b/etl/steps/archive/grapher/energy/2022-07-29/primary_energy_consumption.py deleted file mode 100644 index 28b05c1212b..00000000000 --- a/etl/steps/archive/grapher/energy/2022-07-29/primary_energy_consumption.py +++ /dev/null @@ -1,22 +0,0 @@ -"""Grapher step for the primary energy consumption dataset. -""" - -from owid import catalog - -from etl.paths import DATA_DIR - -# Path to garden dataset to be loaded. -DATASET_PATH = DATA_DIR / "garden" / "energy" / "2022-07-29" / "primary_energy_consumption" - - -def run(dest_dir: str) -> None: - garden_dataset = catalog.Dataset(DATASET_PATH) - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - - # Backward compatibility - dataset.metadata.title = "Primary energy consumption (BP & EIA, 2022 archive)" - - # There is only one table in the dataset, with the same name as the dataset. - table = garden_dataset[garden_dataset.table_names[0]].reset_index().drop(columns=["gdp", "population", "source"]) - dataset.add(table) - dataset.save() diff --git a/etl/steps/archive/grapher/energy/2022-09-09/global_primary_energy.py b/etl/steps/archive/grapher/energy/2022-09-09/global_primary_energy.py deleted file mode 100644 index 6d65e12a327..00000000000 --- a/etl/steps/archive/grapher/energy/2022-09-09/global_primary_energy.py +++ /dev/null @@ -1,17 +0,0 @@ -from owid import catalog - -from etl.paths import DATA_DIR - -# Path to garden dataset to be loaded. -DATASET_PATH = DATA_DIR / "garden" / "energy" / "2022-09-09" / "global_primary_energy" - - -def run(dest_dir: str) -> None: - garden_dataset = catalog.Dataset(DATASET_PATH) - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - - # There is only one table in the dataset, with the same name as the dataset. 
- table = garden_dataset[garden_dataset.table_names[0]].reset_index().drop(columns=["data_source"]) - - dataset.add(table) - dataset.save() diff --git a/etl/steps/archive/grapher/energy/2022-12-13/electricity_mix.py b/etl/steps/archive/grapher/energy/2022-12-13/electricity_mix.py deleted file mode 100644 index e9973cb5289..00000000000 --- a/etl/steps/archive/grapher/energy/2022-12-13/electricity_mix.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Grapher step for the Electricity Mix (BP & Ember, 2022) dataset. -""" - -from copy import deepcopy - -from owid import catalog - -from etl.paths import DATA_DIR - -# Path to garden dataset to be loaded. -DATASET_PATH = DATA_DIR / "garden" / "energy" / "2022-12-13" / "electricity_mix" -TABLE_NAME = "electricity_mix" - - -def run(dest_dir: str) -> None: - garden_dataset = catalog.Dataset(DATASET_PATH) - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - - # There is only one table in the dataset, with the same name as the dataset. - table = garden_dataset[TABLE_NAME].reset_index().drop(columns=["population"]) - - # Add zero-filled variables (where missing points are filled with zeros) to avoid stacked area charts - # showing incomplete data. - generation_columns = [c for c in table.columns if "generation__twh" in c] - for column in generation_columns: - new_column = f"{column}_zero_filled" - table[new_column] = table[column].fillna(0) - table[new_column].metadata = deepcopy(table[column].metadata) - table[new_column].metadata.title += " (zero filled)" - - dataset.add(table) - dataset.save() diff --git a/etl/steps/archive/grapher/energy/2022-12-13/uk_historical_electricity.py b/etl/steps/archive/grapher/energy/2022-12-13/uk_historical_electricity.py deleted file mode 100644 index c5987d1ba9c..00000000000 --- a/etl/steps/archive/grapher/energy/2022-12-13/uk_historical_electricity.py +++ /dev/null @@ -1,17 +0,0 @@ -from owid import catalog - -from etl.helpers import PathFinder - -# Naming conventions. -N = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # Create new empty grapher dataset, using metadata from the garden dataset. - dataset = catalog.Dataset.create_empty(dest_dir, N.garden_dataset.metadata) - # Load table from garden dataset. - table = N.garden_dataset["uk_historical_electricity"].reset_index() - # Add table to new grapher dataset. - dataset.add(table) - # Save new dataset. - dataset.save() diff --git a/etl/steps/archive/grapher/energy/2022-12-28/electricity_mix.py b/etl/steps/archive/grapher/energy/2022-12-28/electricity_mix.py deleted file mode 100644 index 1681ad19b2d..00000000000 --- a/etl/steps/archive/grapher/energy/2022-12-28/electricity_mix.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Grapher step for the Electricity Mix (BP & Ember, 2022) dataset. -""" - -from copy import deepcopy - -from owid import catalog - -from etl.paths import DATA_DIR - -# Path to garden dataset to be loaded. -DATASET_PATH = DATA_DIR / "garden" / "energy" / "2022-12-28" / "electricity_mix" -TABLE_NAME = "electricity_mix" - - -def run(dest_dir: str) -> None: - garden_dataset = catalog.Dataset(DATASET_PATH) - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - - # There is only one table in the dataset, with the same name as the dataset. - table = garden_dataset[TABLE_NAME].reset_index().drop(columns=["population"]) - - # Add zero-filled variables (where missing points are filled with zeros) to avoid stacked area charts - # showing incomplete data. 
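# The zero-filling idiom used in these electricity-mix steps, in isolation:
# a "_zero_filled" twin of each generation column replaces NaN with 0 so that
# stacked area charts do not show incomplete data, while the original column
# keeps its gaps. The column name below is illustrative.
import pandas as pd

tb = pd.DataFrame({"solar_generation__twh": [1.0, None, 3.0]})
for column in [c for c in tb.columns if "generation__twh" in c]:
    tb[f"{column}_zero_filled"] = tb[column].fillna(0)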
- generation_columns = [c for c in table.columns if "generation__twh" in c] - for column in generation_columns: - new_column = f"{column}_zero_filled" - table[new_column] = table[column].fillna(0) - table[new_column].metadata = deepcopy(table[column].metadata) - table[new_column].metadata.title += " (zero filled)" - - dataset.add(table) - dataset.save() diff --git a/etl/steps/archive/grapher/energy/2022-12-28/fossil_fuel_production.py b/etl/steps/archive/grapher/energy/2022-12-28/fossil_fuel_production.py deleted file mode 100644 index ee8a8104043..00000000000 --- a/etl/steps/archive/grapher/energy/2022-12-28/fossil_fuel_production.py +++ /dev/null @@ -1,18 +0,0 @@ -"""Grapher step for the fossil fuel production dataset. -""" - -from owid import catalog - -from etl.paths import DATA_DIR - -DATASET_PATH = DATA_DIR / "garden" / "energy" / "2022-12-28" / "fossil_fuel_production" - - -def run(dest_dir: str) -> None: - garden_dataset = catalog.Dataset(DATASET_PATH) - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - - # There is only one table in the dataset, with the same name as the dataset. - table = garden_dataset[garden_dataset.table_names[0]].reset_index() - dataset.add(table) - dataset.save() diff --git a/etl/steps/archive/grapher/energy/2022-12-28/global_primary_energy.py b/etl/steps/archive/grapher/energy/2022-12-28/global_primary_energy.py deleted file mode 100644 index e9dcdcc1c0b..00000000000 --- a/etl/steps/archive/grapher/energy/2022-12-28/global_primary_energy.py +++ /dev/null @@ -1,17 +0,0 @@ -from owid import catalog - -from etl.paths import DATA_DIR - -# Path to garden dataset to be loaded. -DATASET_PATH = DATA_DIR / "garden" / "energy" / "2022-12-28" / "global_primary_energy" - - -def run(dest_dir: str) -> None: - garden_dataset = catalog.Dataset(DATASET_PATH) - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - - # There is only one table in the dataset, with the same name as the dataset. - table = garden_dataset[garden_dataset.table_names[0]].reset_index().drop(columns=["data_source"]) - - dataset.add(table) - dataset.save() diff --git a/etl/steps/archive/grapher/energy/2022-12-28/primary_energy_consumption.py b/etl/steps/archive/grapher/energy/2022-12-28/primary_energy_consumption.py deleted file mode 100644 index 8d5f388d6dd..00000000000 --- a/etl/steps/archive/grapher/energy/2022-12-28/primary_energy_consumption.py +++ /dev/null @@ -1,19 +0,0 @@ -"""Grapher step for the primary energy consumption dataset. -""" - -from owid import catalog - -from etl.paths import DATA_DIR - -# Path to garden dataset to be loaded. -DATASET_PATH = DATA_DIR / "garden" / "energy" / "2022-12-28" / "primary_energy_consumption" - - -def run(dest_dir: str) -> None: - garden_dataset = catalog.Dataset(DATASET_PATH) - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - - # There is only one table in the dataset, with the same name as the dataset. - table = garden_dataset[garden_dataset.table_names[0]].reset_index().drop(columns=["gdp", "population", "source"]) - dataset.add(table) - dataset.save() diff --git a/etl/steps/archive/grapher/energy/2022-12-28/uk_historical_electricity.py b/etl/steps/archive/grapher/energy/2022-12-28/uk_historical_electricity.py deleted file mode 100644 index c5987d1ba9c..00000000000 --- a/etl/steps/archive/grapher/energy/2022-12-28/uk_historical_electricity.py +++ /dev/null @@ -1,17 +0,0 @@ -from owid import catalog - -from etl.helpers import PathFinder - -# Naming conventions. 
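# Note on the PathFinder pattern that follows: unlike the steps above, which
# hard-code DATA_DIR paths to their garden datasets, PathFinder(__file__)
# derives the step's location (namespace, version, short name) from this
# file's path, so N.garden_dataset below resolves the matching garden
# dataset automatically.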
-N = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # Create new empty grapher dataset, using metadata from the garden dataset. - dataset = catalog.Dataset.create_empty(dest_dir, N.garden_dataset.metadata) - # Load table from garden dataset. - table = N.garden_dataset["uk_historical_electricity"].reset_index() - # Add table to new grapher dataset. - dataset.add(table) - # Save new dataset. - dataset.save() diff --git a/etl/steps/archive/grapher/energy/2023-01-04/photovoltaic_cost_and_capacity.py b/etl/steps/archive/grapher/energy/2023-01-04/photovoltaic_cost_and_capacity.py deleted file mode 100644 index 5f85c2a7480..00000000000 --- a/etl/steps/archive/grapher/energy/2023-01-04/photovoltaic_cost_and_capacity.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Load garden dataset of photovoltaic cost and capacity and create a grapher dataset. - -""" - -from owid import catalog - -from etl.helpers import PathFinder - -# Get paths and naming conventions. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # Load table from garden dataset. - ds_garden: catalog.Dataset = paths.load_dependency("photovoltaic_cost_and_capacity") - tb_garden = ds_garden["photovoltaic_cost_and_capacity"] - - # Remove unnecessary columns. - tb_garden = tb_garden.drop(columns=["cost_source", "cumulative_capacity_source"]) - - # Create a new grapher dataset. - dataset = catalog.Dataset.create_empty(dest_dir, ds_garden.metadata) - - # Add table to dataset and save dataset. - dataset.add(tb_garden) - dataset.save() diff --git a/etl/steps/archive/grapher/energy/2023-02-20/electricity_mix.py b/etl/steps/archive/grapher/energy/2023-02-20/electricity_mix.py deleted file mode 100644 index b9eeca2ef15..00000000000 --- a/etl/steps/archive/grapher/energy/2023-02-20/electricity_mix.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Grapher step for the Electricity Mix (BP & Ember) dataset. -""" - -from copy import deepcopy - -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load garden dataset. - ds_garden: Dataset = paths.load_dependency("electricity_mix") - - # Read table from garden dataset. - tb_garden = ds_garden["electricity_mix"].reset_index() - - # - # Process data. - # - # Drop unnecessary columns. - tb_garden = tb_garden.drop(columns=["population"]) - - # Add zero-filled variables (where missing points are filled with zeros) to avoid stacked area charts - # showing incomplete data. - generation_columns = [c for c in tb_garden.columns if "generation__twh" in c] - for column in generation_columns: - new_column = f"{column}_zero_filled" - tb_garden[new_column] = tb_garden[column].fillna(0) - tb_garden[new_column].metadata = deepcopy(tb_garden[column].metadata) - tb_garden[new_column].metadata.title += " (zero filled)" - - # - # Save outputs. - # - ds_grapher = create_dataset(dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) - ds_grapher.save() diff --git a/etl/steps/archive/grapher/energy/2023-02-20/fossil_fuel_production.py b/etl/steps/archive/grapher/energy/2023-02-20/fossil_fuel_production.py deleted file mode 100644 index e041806d449..00000000000 --- a/etl/steps/archive/grapher/energy/2023-02-20/fossil_fuel_production.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Grapher step for the fossil fuel production dataset. 
-""" - -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load garden dataset. - ds_garden: Dataset = paths.load_dependency("fossil_fuel_production") - - # Read table from garden dataset. - tb_garden = ds_garden["fossil_fuel_production"].reset_index() - - # - # Save outputs. - # - ds_grapher = create_dataset(dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) - ds_grapher.save() diff --git a/etl/steps/archive/grapher/energy/2023-02-20/global_primary_energy.py b/etl/steps/archive/grapher/energy/2023-02-20/global_primary_energy.py deleted file mode 100644 index 215d4f7ee3a..00000000000 --- a/etl/steps/archive/grapher/energy/2023-02-20/global_primary_energy.py +++ /dev/null @@ -1,31 +0,0 @@ -"""Grapher step for the global primary energy dataset. -""" -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load garden dataset. - ds_garden: Dataset = paths.load_dependency("global_primary_energy") - - # Read main table from dataset. - tb_garden = ds_garden["global_primary_energy"] - - # - # Process data. - # - # Drop unnecessary columns from table. - tb_garden = tb_garden.reset_index().drop(columns=["data_source"]) - - # - # Save outputs. - # - ds_grapher = create_dataset(dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) - ds_grapher.save() diff --git a/etl/steps/archive/grapher/energy/2023-02-20/primary_energy_consumption.py b/etl/steps/archive/grapher/energy/2023-02-20/primary_energy_consumption.py deleted file mode 100644 index ef1bc97315a..00000000000 --- a/etl/steps/archive/grapher/energy/2023-02-20/primary_energy_consumption.py +++ /dev/null @@ -1,31 +0,0 @@ -"""Grapher step for the primary energy consumption dataset. -""" -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load garden dataset. - ds_garden: Dataset = paths.load_dependency("primary_energy_consumption") - - # Read table from garden dataset. - tb_garden = ds_garden["primary_energy_consumption"].reset_index() - - # - # Process data. - # - # Remove unnecessary columns. - tb_garden = tb_garden.drop(columns=["gdp", "population", "source"]) - - # - # Save outputs. - # - ds_grapher = create_dataset(dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) - ds_grapher.save() diff --git a/etl/steps/archive/grapher/energy/2023-02-20/uk_historical_electricity.py b/etl/steps/archive/grapher/energy/2023-02-20/uk_historical_electricity.py deleted file mode 100644 index 3ef47dabd76..00000000000 --- a/etl/steps/archive/grapher/energy/2023-02-20/uk_historical_electricity.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Grapher step for the UK historical electricity dataset. -""" -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load garden dataset. 
- ds_garden: Dataset = paths.load_dependency("uk_historical_electricity") - - # Read table from garden dataset. - tb_garden = ds_garden["uk_historical_electricity"].reset_index() - - # - # Save outputs. - # - ds_grapher = create_dataset(dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) - ds_grapher.save() diff --git a/etl/steps/archive/grapher/energy/2023-06-01/electricity_mix.py b/etl/steps/archive/grapher/energy/2023-06-01/electricity_mix.py deleted file mode 100644 index d54dc2d6984..00000000000 --- a/etl/steps/archive/grapher/energy/2023-06-01/electricity_mix.py +++ /dev/null @@ -1,44 +0,0 @@ -"""Grapher step for the Electricity Mix (BP & Ember) dataset. -""" - -from copy import deepcopy - -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load garden dataset and read its main table. - ds_garden: Dataset = paths.load_dependency("electricity_mix") - tb_garden = ds_garden["electricity_mix"].reset_index() - - # - # Process data. - # - # Drop unnecessary columns. - tb_garden = tb_garden.drop(columns=["population"]) - - # Add zero-filled variables (where missing points are filled with zeros) to avoid stacked area charts - # showing incomplete data. - generation_columns = [c for c in tb_garden.columns if "generation__twh" in c] - for column in generation_columns: - new_column = f"{column}_zero_filled" - tb_garden[new_column] = tb_garden[column].fillna(0) - tb_garden[new_column].metadata = deepcopy(tb_garden[column].metadata) - tb_garden[new_column].metadata.title += " (zero filled)" - - # Set an appropriate index and sort conveniently. - tb_garden = tb_garden.set_index(["country", "year"], verify_integrity=True).sort_index() - - # - # Save outputs. - # - ds_grapher = create_dataset(dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) - ds_grapher.save() diff --git a/etl/steps/archive/grapher/energy/2023-06-01/fossil_fuel_production.py b/etl/steps/archive/grapher/energy/2023-06-01/fossil_fuel_production.py deleted file mode 100644 index eb3f0afcabb..00000000000 --- a/etl/steps/archive/grapher/energy/2023-06-01/fossil_fuel_production.py +++ /dev/null @@ -1,24 +0,0 @@ -"""Grapher step for the fossil fuel production dataset. -""" - -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load garden dataset and read its main table - ds_garden: Dataset = paths.load_dependency("fossil_fuel_production") - tb_garden = ds_garden["fossil_fuel_production"].reset_index() - - # - # Save outputs. - # - ds_grapher = create_dataset(dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) - ds_grapher.save() diff --git a/etl/steps/archive/grapher/energy/2023-06-01/global_primary_energy.py b/etl/steps/archive/grapher/energy/2023-06-01/global_primary_energy.py deleted file mode 100644 index a315faeb118..00000000000 --- a/etl/steps/archive/grapher/energy/2023-06-01/global_primary_energy.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Grapher step for the global primary energy dataset. -""" -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. 
-paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load garden dataset and read its main table. - ds_garden: Dataset = paths.load_dependency("global_primary_energy") - tb_garden = ds_garden["global_primary_energy"] - - # - # Process data. - # - # Drop unnecessary columns from table. - tb_garden = tb_garden.reset_index().drop(columns=["data_source"]) - - # - # Save outputs. - # - ds_grapher = create_dataset(dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) - ds_grapher.save() diff --git a/etl/steps/archive/grapher/energy/2023-06-01/primary_energy_consumption.py b/etl/steps/archive/grapher/energy/2023-06-01/primary_energy_consumption.py deleted file mode 100644 index c9bf517acd1..00000000000 --- a/etl/steps/archive/grapher/energy/2023-06-01/primary_energy_consumption.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Grapher step for the primary energy consumption dataset. -""" -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load garden dataset and read its main table. - ds_garden: Dataset = paths.load_dependency("primary_energy_consumption") - tb_garden = ds_garden["primary_energy_consumption"].reset_index() - - # - # Process data. - # - # Remove unnecessary columns. - tb = tb_garden.drop(columns=["gdp", "population", "source"]) - - # Set an appropriate index and sort conveniently. - tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index() - - # - # Save outputs. - # - ds_grapher = create_dataset(dest_dir=dest_dir, tables=[tb], default_metadata=ds_garden.metadata) - ds_grapher.save() diff --git a/etl/steps/archive/grapher/energy/2023-06-01/uk_historical_electricity.py b/etl/steps/archive/grapher/energy/2023-06-01/uk_historical_electricity.py deleted file mode 100644 index 0debdbd5072..00000000000 --- a/etl/steps/archive/grapher/energy/2023-06-01/uk_historical_electricity.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Grapher step for the UK historical electricity dataset. -""" -from copy import deepcopy - -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load garden dataset and read its main table. - ds_garden: Dataset = paths.load_dependency("uk_historical_electricity") - tb_garden = ds_garden["uk_historical_electricity"] - - # Create variable filling missing values with zeros (to allow visualization of stacked area charts in grapher). - for column in tb_garden.columns: - new_column = f"{column}_zero_filled" - tb_garden[new_column] = tb_garden[column].fillna(0) - tb_garden[new_column].metadata = deepcopy(tb_garden[column].metadata) - tb_garden[new_column].metadata.title = tb_garden[column].metadata.title + " (zero filled)" - - # - # Save outputs. 
- # - ds_grapher = create_dataset(dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) - ds_grapher.save() diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_ef.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_ef.py deleted file mode 100644 index dc4e6c0fefc..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_ef.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_ef dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_ei.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_ei.py deleted file mode 100644 index f75dd85f088..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_ei.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_ei dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_ek.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_ek.py deleted file mode 100644 index ad1035848b8..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_ek.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_ek dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_el.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_el.py deleted file mode 100644 index 5e47afab31f..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_el.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_el dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_emn.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_emn.py deleted file mode 100644 index 28f65fc16ad..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_emn.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_emn dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_ep.py 
b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_ep.py deleted file mode 100644 index d83658e74f9..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_ep.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_ep dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_esb.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_esb.py deleted file mode 100644 index f5b3ed29ebe..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_esb.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_esb dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_fa.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_fa.py deleted file mode 100644 index a6477ace678..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_fa.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_fa dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_fbsc.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_fbsc.py deleted file mode 100644 index 02d655c437b..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_fbsc.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_fbsc dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_fo.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_fo.py deleted file mode 100644 index 40bc03719da..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_fo.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_fo dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_fs.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_fs.py deleted file mode 100644 index 0d6f2d89a6d..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_fs.py +++ /dev/null @@ 
-1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_fs dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_lc.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_lc.py deleted file mode 100644 index 30869cd7071..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_lc.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_lc dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_qcl.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_qcl.py deleted file mode 100644 index 977983a9764..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_qcl.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_qcl dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_qi.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_qi.py deleted file mode 100644 index 603101d86c5..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_qi.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_qi dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_qv.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_qv.py deleted file mode 100644 index 9697bb49576..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_qv.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_qv dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_rfb.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_rfb.py deleted file mode 100644 index da3ddde7eb6..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_rfb.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_rfb dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = 
get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_rfn.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_rfn.py deleted file mode 100644 index 8479a63b074..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_rfn.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_rfn dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_rl.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_rl.py deleted file mode 100644 index 335234c5827..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_rl.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_rl dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_rp.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_rp.py deleted file mode 100644 index c9e322be721..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_rp.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_rp dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_rt.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_rt.py deleted file mode 100644 index 284a462a126..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_rt.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_rt dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_scl.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_scl.py deleted file mode 100644 index 22d7471ae72..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_scl.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_scl dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git 
a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_sdgb.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_sdgb.py deleted file mode 100644 index 28be3deafde..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_sdgb.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_sdgb dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_tcl.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_tcl.py deleted file mode 100644 index 6731fa79ae6..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_tcl.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_tcl dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_ti.py b/etl/steps/archive/grapher/faostat/2022-05-17/faostat_ti.py deleted file mode 100644 index 6d8625dbeeb..00000000000 --- a/etl/steps/archive/grapher/faostat/2022-05-17/faostat_ti.py +++ /dev/null @@ -1,11 +0,0 @@ -"""FAOSTAT grapher step for faostat_ti dataset.""" -from .shared import catalog, get_grapher_dataset_from_file_name, get_grapher_table - - -def run(dest_dir: str) -> None: - garden_dataset = get_grapher_dataset_from_file_name(__file__) - - dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata) - dataset.save() - - dataset.add(get_grapher_table(garden_dataset)) diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/additional_variables.py b/etl/steps/archive/grapher/faostat/2023-02-22/additional_variables.py deleted file mode 100644 index 9c2b939e0b8..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/additional_variables.py +++ /dev/null @@ -1,204 +0,0 @@ -"""Load a garden dataset and create a grapher dataset.""" - -import pandas as pd -from owid.catalog import Dataset, Table -from owid.catalog.utils import underscore_table - -from etl.helpers import PathFinder, create_dataset, grapher_checks - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def prepare_maize_and_wheat_in_the_context_of_the_ukraine_war(tb_maize_and_wheat: Table) -> Table: - # Prepare groupings that will be shown in a stacked discrete bar chart. - # Ukraine and Russia exports of maize and wheat. - ukraine_and_russia_exports = ( - pd.merge( - tb_maize_and_wheat[["maize_exports", "wheat_exports"]].loc["Ukraine"], - tb_maize_and_wheat[["maize_exports", "wheat_exports"]].loc["Russia"], - left_index=True, - right_index=True, - suffixes=(" Ukraine", " Russia"), - ) - .assign(**{"country": "Ukraine and Russia exports"}) - .reset_index() - ) - # EU and UK maize and wheat used for animal feed. 
- eu_and_uk_feed = ( - pd.merge( - tb_maize_and_wheat[["maize_animal_feed", "wheat_animal_feed"]].loc["European Union (27)"], - tb_maize_and_wheat[["maize_animal_feed", "wheat_animal_feed"]].loc["United Kingdom"], - left_index=True, - right_index=True, - suffixes=(" EU", " UK"), - ) - .assign(**{"country": "EU and UK animal feed"}) - .reset_index() - ) - # EU and UK maize and wheat devoted to other uses (predominantly biofuels). - eu_and_uk_biofuels = ( - pd.merge( - tb_maize_and_wheat[["maize_other_uses", "wheat_other_uses"]].loc["European Union (27)"], - tb_maize_and_wheat[["maize_other_uses", "wheat_other_uses"]].loc["United Kingdom"], - left_index=True, - right_index=True, - suffixes=(" EU", " UK"), - ) - .assign(**{"country": "EU and UK biofuels"}) - .reset_index() - ) - # US maize and wheat used for animal feed. - us_feed = ( - tb_maize_and_wheat[["maize_animal_feed", "wheat_animal_feed"]] - .loc["United States"] - .rename(columns={"maize_animal_feed": "maize_animal_feed US", "wheat_animal_feed": "wheat_animal_feed US"}) - .assign(**{"country": "US animal feed"}) - .reset_index() - ) - # US maize and wheat devoted to other uses (predominantly biofuels). - us_biofuels = ( - tb_maize_and_wheat[["maize_other_uses", "wheat_other_uses"]] - .loc["United States"] - .rename(columns={"maize_other_uses": "maize_other_uses US", "wheat_other_uses": "wheat_other_uses US"}) - .assign(**{"country": "US biofuels"}) - .reset_index() - ) - - # Combine all groupings. - combined = pd.concat( - [ukraine_and_russia_exports, eu_and_uk_feed, eu_and_uk_biofuels, us_feed, us_biofuels], ignore_index=True - ) - - # Set an appropriate index and sort conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Adapt metadata. - combined.metadata.short_name = "maize_and_wheat_in_the_context_of_the_ukraine_war" - for column in combined.columns: - title = ( - column.replace("maize_", "Maize ") - .replace("wheat_", "Wheat ") - .replace("animal_feed", "used for animal feed in") - .replace("exports", "exported by") - .replace("other_uses", "used for biofuels in") - ) - combined[column].metadata.title = title - combined[column].metadata.unit = "tonnes" - combined[column].metadata.short_unit = "t" - combined = underscore_table(combined) - - return combined - - -def prepare_fertilizer_exports_in_the_context_of_the_ukraine_war(tb_fertilizer_exports: Table) -> Table: - # Select the relevant countries for the chart. - fertilizer_exports = tb_fertilizer_exports.loc[["Ukraine", "Russia", "Belarus"]].reset_index() - - # Transpose data. - fertilizer_exports = fertilizer_exports.pivot( - index=["item", "year"], columns="country", values=["exports", "share_of_exports"] - ) - - fertilizer_exports.columns = [column[0] + " " + column[1] for column in fertilizer_exports.columns] - - # To be able to work in grapher, rename "item" column to "country". - fertilizer_exports.index.names = ["country", "year"] - - # Adapt metadata. 
- fertilizer_exports.metadata.short_name = "fertilizer_exports_in_the_context_of_the_ukraine_war" - for column in fertilizer_exports.columns: - element, country = column.split(" ") - title = element.capitalize().replace("_", " ") + " from " + country - fertilizer_exports[column].metadata.title = title - if "share" in column: - fertilizer_exports[column].metadata.unit = "%" - fertilizer_exports[column].metadata.short_unit = "%" - else: - fertilizer_exports[column].metadata.unit = "tonnes" - fertilizer_exports[column].metadata.short_unit = "t" - fertilizer_exports = underscore_table(fertilizer_exports) - - return fertilizer_exports - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load garden dataset. - ds_garden: Dataset = paths.load_dependency("additional_variables") - - # Read tables from garden dataset. - tb_arable_land_per_crop_output = ds_garden["arable_land_per_crop_output"] - tb_area_used_per_crop_type = ds_garden["area_used_per_crop_type"] - tb_sustainable_and_overexploited_fish = ds_garden["share_of_sustainable_and_overexploited_fish"] - tb_land_spared_by_increased_crop_yields = ds_garden["land_spared_by_increased_crop_yields"] - tb_food_available_for_consumption = ds_garden["food_available_for_consumption"] - tb_macronutrient_compositions = ds_garden["macronutrient_compositions"] - tb_fertilizers = ds_garden["fertilizers"] - tb_vegetable_oil_yields = ds_garden["vegetable_oil_yields"] - tb_agriculture_land_use_evolution = ds_garden["agriculture_land_use_evolution"] - tb_hypothetical_meat_consumption = ds_garden["hypothetical_meat_consumption"] - tb_cereal_allocation = ds_garden["cereal_allocation"] - tb_maize_and_wheat = ds_garden["maize_and_wheat"] - tb_fertilizer_exports = ds_garden["fertilizer_exports"] - - # - # Process data. - # - # To insert table into grapher DB, change "item" column to "country" (which will be changed back in the admin). - tb_area_used_per_crop_type = tb_area_used_per_crop_type.reset_index().rename(columns={"item": "country"}) - - # For land spared by increased crop yields, for the moment we only need global data, by crop type. - # And again, change "item" to "country" to fit grapher DB needs. - tb_land_spared_by_increased_crop_yields = tb_land_spared_by_increased_crop_yields.reset_index() - tb_land_spared_by_increased_crop_yields = ( - tb_land_spared_by_increased_crop_yields[tb_land_spared_by_increased_crop_yields["country"] == "World"] - .drop(columns=["country"]) - .rename(columns={"item": "country"}) - .set_index(["country", "year"], verify_integrity=True) - .sort_index() - ) - - # Prepare maize and wheat data in the context of the Ukraine war. - tb_maize_and_wheat_in_the_context_of_the_ukraine_war = prepare_maize_and_wheat_in_the_context_of_the_ukraine_war( - tb_maize_and_wheat=tb_maize_and_wheat - ) - - # Prepare fertilizer exports data in the context of the Ukraine war. - tb_fertilizer_exports_in_the_context_of_the_ukraine_war = ( - prepare_fertilizer_exports_in_the_context_of_the_ukraine_war(tb_fertilizer_exports=tb_fertilizer_exports) - ) - - # - # Save outputs. - # - # Create a new grapher dataset with the same metadata as the garden dataset.
- ds_grapher = create_dataset( - dest_dir, - tables=[ - tb_arable_land_per_crop_output, - tb_area_used_per_crop_type, - tb_sustainable_and_overexploited_fish, - tb_land_spared_by_increased_crop_yields, - tb_food_available_for_consumption, - tb_macronutrient_compositions, - tb_fertilizers, - tb_vegetable_oil_yields, - tb_agriculture_land_use_evolution, - tb_hypothetical_meat_consumption, - tb_cereal_allocation, - tb_maize_and_wheat_in_the_context_of_the_ukraine_war, - tb_fertilizer_exports_in_the_context_of_the_ukraine_war, - ], - default_metadata=ds_garden.metadata, - ) - - # - # Checks. - # - grapher_checks(ds_grapher) - - # Save changes in the new grapher dataset. - ds_grapher.save() diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ef.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ef.py deleted file mode 100644 index 304863c1b67..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ef.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_ef dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ei.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ei.py deleted file mode 100644 index d5ca840e309..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ei.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_ei dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ek.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ek.py deleted file mode 100644 index 1438bef60af..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ek.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_ek dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_el.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_el.py deleted file mode 100644 index dad80439d5f..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_el.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_el dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_emn.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_emn.py deleted file mode 100644 index a4ec711b24f..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_emn.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_emn dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ep.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ep.py deleted file mode 100644 index 425db60fd08..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ep.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_ep dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_esb.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_esb.py deleted file mode 100644 index 9443efd4c2f..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_esb.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_esb dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_fa.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_fa.py deleted file mode 100644 index 68c503b33fb..00000000000 --- 
a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_fa.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_fa dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_fbsc.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_fbsc.py deleted file mode 100644 index a96693ea59e..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_fbsc.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_fbsc dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_fo.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_fo.py deleted file mode 100644 index 52d47d9693b..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_fo.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_fo dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_fs.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_fs.py deleted file mode 100644 index 9ac98d46d8b..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_fs.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_fs dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_gn.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_gn.py deleted file mode 100644 index dd97adb515e..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_gn.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_gn dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ic.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ic.py deleted file mode 100644 index 3bb8b297f9b..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ic.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_ic dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_lc.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_lc.py deleted file mode 100644 index 9e55fe697eb..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_lc.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_lc dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_qcl.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_qcl.py deleted file mode 100644 index 17ea29863b0..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_qcl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_qcl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_qi.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_qi.py deleted file mode 100644 index ec1e351be6d..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_qi.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_qi dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_qv.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_qv.py deleted file mode 100644 index a8ad501a473..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_qv.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_qv dataset.""" -from .shared import run # noqa:F401 diff 
--git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_rfb.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_rfb.py deleted file mode 100644 index 9203ba8a494..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_rfb.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_rfb dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_rfn.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_rfn.py deleted file mode 100644 index 006af8bb6ce..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_rfn.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_rfn dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_rl.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_rl.py deleted file mode 100644 index 95550785095..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_rl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_rl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_rp.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_rp.py deleted file mode 100644 index b552b8f0035..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_rp.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_rp dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_rt.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_rt.py deleted file mode 100644 index 709d69ac2d5..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_rt.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_rt dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_scl.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_scl.py deleted file mode 100644 index 95725e189c9..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_scl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_scl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_sdgb.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_sdgb.py deleted file mode 100644 index 7e0187e34ba..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_sdgb.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_sdgb dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_tcl.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_tcl.py deleted file mode 100644 index 0babbf0b2e3..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_tcl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_tcl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ti.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ti.py deleted file mode 100644 index 94634f7e505..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_ti.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_ti dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_wcad.py b/etl/steps/archive/grapher/faostat/2023-02-22/faostat_wcad.py deleted file mode 100644 index 
39ee4dcf7f0..00000000000 --- a/etl/steps/archive/grapher/faostat/2023-02-22/faostat_wcad.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT grapher step for faostat_wcad dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/grapher/fasttrack/2023-01-19/food_expenditures_by_country.meta.yml b/etl/steps/archive/grapher/fasttrack/2023-01-19/food_expenditures_by_country.meta.yml deleted file mode 100644 index 05c33f19179..00000000000 --- a/etl/steps/archive/grapher/fasttrack/2023-01-19/food_expenditures_by_country.meta.yml +++ /dev/null @@ -1,59 +0,0 @@ -dataset: - namespace: fasttrack - version: '2023-01-19' - short_name: food_expenditures_by_country - title: Food expenditures by country (USDA, 2023) - description: '' - sources: - - name: USDA, Economic Research Service - published_by: United States Department of Agriculture (USDA) Economic Research - Service - url: https://www.ers.usda.gov/topics/international-markets-u-s-trade/international-consumer-and-food-industry-trends/#data -tables: - food_expenditures_by_country: - variables: - share_expenditure_food: - title: share_expenditure_food - short_unit: '%' - unit: '%' - description: The share of consumer expenditures that are spent on food. This - only includes food eaten at home, and does not include alcoholic beverages - and tobacco. - sources: - - name: USDA, Economic Research Service - published_by: United States Department of Agriculture (USDA) Economic Research - Service - url: https://www.ers.usda.gov/topics/international-markets-u-s-trade/international-consumer-and-food-industry-trends/#data - share_alcohol_tobacco: - title: share_alcohol_tobacco - short_unit: '%' - unit: '%' - description: The share of consumer expenditures that are spent on alcohol - and tobacco. - sources: - - name: USDA, Economic Research Service - published_by: United States Department of Agriculture (USDA) Economic Research - Service - url: https://www.ers.usda.gov/topics/international-markets-u-s-trade/international-consumer-and-food-industry-trends/#data - consumer_expenditure: - title: consumer_expenditure - short_unit: $ - unit: $ - description: Total consumer expenditure on goods and services per year. - sources: - - name: USDA, Economic Research Service - published_by: United States Department of Agriculture (USDA) Economic Research - Service - url: https://www.ers.usda.gov/topics/international-markets-u-s-trade/international-consumer-and-food-industry-trends/#data - expenditure_food_dollars: - title: expenditure_food_dollars - short_unit: $ - unit: $ - description: The amount of money spent on food in a given year. This only - includes food eaten at home, and does not include alcoholic beverages and - tobacco.
- sources: - - name: USDA, Economic Research Service - published_by: United States Department of Agriculture (USDA) Economic Research - Service - url: https://www.ers.usda.gov/topics/international-markets-u-s-trade/international-consumer-and-food-industry-trends/#data diff --git a/etl/steps/archive/grapher/fasttrack/2023-01-19/food_expenditures_by_country.py b/etl/steps/archive/grapher/fasttrack/2023-01-19/food_expenditures_by_country.py deleted file mode 100644 index bbd179ca287..00000000000 --- a/etl/steps/archive/grapher/fasttrack/2023-01-19/food_expenditures_by_country.py +++ /dev/null @@ -1,21 +0,0 @@ -import pandas as pd -from owid import catalog - -from etl.helpers import PathFinder -from etl.snapshot import Snapshot - -N = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # load snapshot - data = pd.read_csv(Snapshot("fasttrack/2023-01-19/food_expenditures_by_country.csv").path) - - # create empty dataset and table - ds = catalog.Dataset.create_empty(dest_dir) - tb = catalog.Table(data, short_name=N.short_name) - - # add table, update metadata from *.meta.yml and save - ds.add(tb) - ds.update_metadata(N.metadata_path) - ds.save() diff --git a/etl/steps/archive/grapher/fasttrack/2023-03-27/global_warming_contributions.meta.yml b/etl/steps/archive/grapher/fasttrack/2023-03-27/global_warming_contributions.meta.yml deleted file mode 100644 index 3220a480226..00000000000 --- a/etl/steps/archive/grapher/fasttrack/2023-03-27/global_warming_contributions.meta.yml +++ /dev/null @@ -1,839 +0,0 @@ -dataset: - namespace: fasttrack - version: '2023-03-27' - short_name: global_warming_contributions - title: Global warming contributions - description: |- - Jones et al. (2023) quantify national and regional contributions to the increase of global mean surface temperature over the last few centuries. As they detail: the "dataset describing the global warming response to national emissions CO2, CH4 and N2O from fossil and land use sources during 1851-2021. - - National CO2 emissions data are collated from the Global Carbon Project (Andrew and Peters, 2022; Friedlingstein et al., 2022). - - National CH4 and N2O emissions data are collated from PRIMAP-hist (HISTTP) (Gütschow et al., 2022). - - We construct a time series of cumulative CO2-equivalent emissions for each country, gas, and emissions source (fossil or land use). Emissions of CH4 and N2O are related to cumulative CO2-equivalent emissions using the Global Warming Potential (GWP*) approach, with best-estimates of the coefficients taken from the IPCC AR6 (Forster et al., 2021). - - Warming in response to cumulative CO2-equivalent emissions is estimated using the transient climate response to cumulative carbon emissions (TCRE) approach, with best-estimate value of TCRE taken from the IPCC AR6 (Forster et al., 2021, Canadell et al., 2021). 'Warming' is specifically the change in global mean surface temperature (GMST)." - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, Robbie - M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard A., Friedlingstein, - Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). National contributions - to climate change due to historical emissions of carbon dioxide, methane and - nitrous oxide [Data set]. In Scientific Data (2023.1).
- publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 -tables: - global_warming_contributions: - variables: - annual_fossil_co2: - title: Annual fossil CO2 emissions - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_land_co2: - title: Annual CO2 emissions from agriculture and land use - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_co2: - title: Annual CO2 emissions - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_fossil_ch4: - title: annual_fossil_ch4 - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_land_ch4: - title: annual_land_ch4 - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_fossil_n2o: - title: annual_fossil_n2o - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). 
- National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_ch4: - title: annual_ch4 - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_land_n2o: - title: annual_land_n2o - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_n2o: - title: annual_n2o - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_fossil_ch4_co2eq: - title: Annual methane emissions from fossil fuels and industry - short_unit: t - unit: tonnes - description: Methane emissions are calculated by Our World in Data based on - emissions data from Jones et al. (2023) and IPCC conversion factors. Jones - et al. (2023) give methane emissions in standard metric tonnes per year. - We have converted these emissions to carbon-dioxide equivalents over a 100-year - timescale using a conversion factor of 29.8 for fossil sources, and 27.2 - for agricultural and land use sources (as per the IPCC AR6 report). - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_ch4_co2eq: - title: Annual methane emissions - short_unit: t - unit: tonnes - description: Methane emissions are calculated by Our World in Data based on - emissions data from Jones et al. (2023) and IPCC conversion factors. Jones - et al. (2023) give methane emissions in standard metric tonnes per year. 
- We have converted these emissions to carbon-dioxide equivalents over a 100-year - timescale using a conversion factor of 29.8 for fossil sources, and 27.2 - for agricultural and land use sources (as per the IPCC AR6 report). - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_land_ch4_co2eq: - title: Annual methane emissions from agriculture and land use - short_unit: t - unit: tonnes - description: Methane emissions are calculated by Our World in Data based on - emissions data from Jones et al. (2023) and IPCC conversion factors. Jones - et al. (2023) give methane emissions in standard metric tonnes per year. - We have converted these emissions to carbon-dioxide equivalents over a 100-year - timescale using a conversion factor of 29.8 for fossil sources, and 27.2 - for agricultural and land use sources (as per the IPCC AR6 report). - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_fossil_n2o_co2eq: - title: Annual nitrous oxide emissions from fossil fuels and industry - short_unit: t - unit: tonnes - description: Nitrous oxide emissions are calculated by Our World in Data based - on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. - Jones et al. (2023) give nitrous oxide emissions in standard metric tonnes - per year. We have converted these emissions to carbon-dioxide equivalents - over a 100-year timescale using a conversion factor of 273 (as per the IPCC - AR6 report). - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_land_n2o_co2eq: - title: Annual nitrous oxide emissions from agriculture and land use - short_unit: t - unit: tonnes - description: Nitrous oxide emissions are calculated by Our World in Data based - on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. - Jones et al. (2023) give nitrous oxide emissions in standard metric tonnes - per year. We have converted these emissions to carbon-dioxide equivalents - over a 100-year timescale using a conversion factor of 273 (as per the IPCC - AR6 report). - sources: - - name: Matthew Jones et al. 
(2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_ghg_co2eq: - title: Annual greenhouse gas emissions - short_unit: t - unit: tonnes - description: Greenhouse gas emissions are calculated by Our World in Data - based on emissions data from Jones et al. (2023) and IPCC AR6 conversion - factors. Jones et al. (2023) give methane and nitrous oxide emissions in - standard metric tonnes per year. We have converted these emissions to carbon-dioxide - equivalents over a 100-year timescale using a conversion factor of 273 for - nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane - from agricultural and land use sources (as per the IPCC AR6 report). - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_fossil_co2eq: - title: Annual greenhouse gas emissions from fossil fuels and industry - short_unit: t - unit: tonnes - description: Greenhouse gas emissions are calculated by Our World in Data - based on emissions data from Jones et al. (2023) and IPCC AR6 conversion - factors. Jones et al. (2023) give methane and nitrous oxide emissions in - standard metric tonnes per year. We have converted these emissions to carbon-dioxide - equivalents over a 100-year timescale using a conversion factor of 273 for - nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane - from agricultural and land use sources (as per the IPCC AR6 report). - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_land_co2eq: - title: Annual greenhouse gas emissions from agriculture and land use - short_unit: t - unit: tonnes - description: Greenhouse gas emissions are calculated by Our World in Data - based on emissions data from Jones et al. (2023) and IPCC AR6 conversion - factors. Jones et al. (2023) give methane and nitrous oxide emissions in - standard metric tonnes per year. We have converted these emissions to carbon-dioxide - equivalents over a 100-year timescale using a conversion factor of 273 for - nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane - from agricultural and land use sources (as per the IPCC AR6 report). - sources: - - name: Matthew Jones et al. 
(2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_n2o_co2eq: - title: Annual nitrous oxide emissions - short_unit: t - unit: tonnes - description: Nitrous oxide emissions are calculated by Our World in Data based - on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. - Jones et al. (2023) give nitrous oxide emissions in standard metric tonnes - per year. We have converted these emissions to carbon-dioxide equivalents - over a 100-year timescale using a conversion factor of 273 (as per the IPCC - AR6 report). - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - share_global_n2o: - title: Share of global nitrous oxide emissions - short_unit: '%' - unit: '%' - description: Nitrous oxide emissions are calculated by Our World in Data based - on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. - Jones et al. (2023) give nitrous oxide emissions in standard metric tonnes - per year. We have converted these emissions to carbon-dioxide equivalents - over a 100-year timescale using a conversion factor of 273 (as per the IPCC - AR6 report). - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - share_global_ch4: - title: Share of global methane emissions - short_unit: '%' - unit: '%' - description: Methane emissions are calculated by Our World in Data based on - emissions data from Jones et al. (2023) and IPCC conversion factors. Jones - et al. (2023) give methane emissions in standard metric tonnes per year. - We have converted these emissions to carbon-dioxide equivalents over a 100-year - timescale using a conversion factor of 29.8 for fossil sources, and 27.2 - for agricultural and land use sources (as per the IPCC AR6 report). - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). 
- publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - share_global_ghg: - title: Share of global greenhouse gas emissions - short_unit: '%' - unit: '%' - description: Greenhouse gas emissions are calculated by Our World in Data - based on emissions data from Jones et al. (2023) and IPCC AR6 conversion - factors. Jones et al. (2023) give methane and nitrous oxide emissions in - standard metric tonnes per year. We have converted these emissions to carbon-dioxide - equivalents over a 100-year timescale using a conversion factor of 273 for - nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane - from agricultural and land use sources (as per the IPCC AR6 report). - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - cumulative_fossil_co2: - title: cumulative_fossil_co2 - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - cumulative_land_co2: - title: cumulative_land_co2 - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - cumulative_co2: - title: cumulative_co2 - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - cumulative_land_ch4: - title: cumulative_land_ch4 - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). 
- publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - cumulative_fossil_ch4: - title: cumulative_fossil_ch4 - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - cumulative_ch4: - title: cumulative_ch4 - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - cumulative_land_n2o: - title: cumulative_land_n2o - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - cumulative_fossil_n2o: - title: cumulative_fossil_n2o - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - cumulative_n2o: - title: cumulative_n2o - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - cumulative_fossil_ghg: - title: cumulative_fossil_ghg - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). 
- publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - cumulative_land_ghg: - title: cumulative_land_ghg - short_unit: t - unit: tonnes - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - cumulative_ghg: - title: Cumulative greenhouse gas emissions - short_unit: t - unit: tonnes - description: Greenhouse gas emissions are calculated by Our World in Data - based on emissions data from Jones et al. (2023) and IPCC AR6 conversion - factors. Jones et al. (2023) give methane and nitrous oxide emissions in - standard metric tonnes per year. We have converted these emissions to carbon-dioxide - equivalents over a 100-year timescale using a conversion factor of 273 for - nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane - from agricultural and land use sources (as per the IPCC AR6 report). - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - temp_fossil_co2: - title: temp_fossil_co2 - short_unit: °C - unit: °C - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - temp_land_co2: - title: temp_land_co2 - short_unit: °C - unit: °C - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - temp_co2: - title: Change in global mean surface temperature from CO2 emissions - short_unit: °C - unit: °C - description: This measures each country's contribution to global mean surface - temperature (GMST) rise from its cumulative emissions of carbon dioxide. - The warming effects of each gas are calculated based on cumulative CO2-equivalent - emissions using the Global Warming Potential (GWP*) approach. - sources: - - name: Matthew Jones et al. 
(2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - temp_fossil_ch4: - title: temp_fossil_ch4 - short_unit: °C - unit: °C - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - temp_land_ch4: - title: temp_land_ch4 - short_unit: °C - unit: °C - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - temp_ch4: - title: Change in global mean surface temperature from methane emissions - short_unit: °C - unit: °C - description: This measures each country's contribution to global mean surface - temperature (GMST) rise from its cumulative emissions of methane. The warming - effects of each gas are calculated based on cumulative CO2-equivalent emissions - using the Global Warming Potential (GWP*) approach. - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - temp_fossil_n2o: - title: temp_fossil_n2o - short_unit: °C - unit: °C - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - temp_land_n2o: - title: temp_land_n2o - short_unit: °C - unit: °C - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). 
- National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - temp_n2o: - title: Change in global mean surface temperature from nitrous oxide emissions - short_unit: °C - unit: °C - description: This measures each country's contribution to global mean surface - temperature (GMST) rise from its cumulative nitrous oxide emissions. The - warming effects of each gas are calculated based on cumulative CO2-equivalent - emissions using the Global Warming Potential (GWP*) approach. - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - temp_fossil_ghg: - title: temp_fossil_ghg - short_unit: °C - unit: °C - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - temp_land_ghg: - title: temp_land_ghg - short_unit: °C - unit: °C - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - temp_ghg: - title: Change in global mean surface temperature from greenhouse gas emissions - short_unit: °C - unit: °C - description: This measures each country's contribution to global mean surface - temperature (GMST) rise from its cumulative emissions of carbon dioxide, - methane and nitrous oxide. The warming effects of each gas are calculated - based on cumulative CO2-equivalent emissions using the Global Warming Potential - (GWP*) approach. - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). 
- publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - share_global_warming: - title: Share of contribution to global warming - short_unit: '%' - unit: '%' - description: This measures each country's contribution to global mean surface - temperature (GMST) rise from its cumulative emissions of carbon dioxide, - methane and nitrous oxide. The warming effects of each gas are calculated - based on cumulative CO2-equivalent emissions using the Global Warming Potential - (GWP*) approach. - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_ch4_per_capita: - title: Methane emissions per person - short_unit: t - unit: tonnes - description: Methane emissions per person are calculated by Our World in Data - based on emissions data from Jones et al. (2023) and population data from - HYDE and the UN World Population Prospects. Jones et al. (2023) give methane - emissions in standard metric tonnes per year. We have converted these emissions - to carbon-dioxide equivalents over a 100-year timescale using a conversion - factor of 29.8 for fossil sources, and 27.2 for agricultural and land use - sources (as per the IPCC AR6 report). - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_n2o_per_capita: - title: Nitrous oxide emissions per person - short_unit: t - unit: tonnes - description: Nitrous oxide emissions per person are calculated by Our World - in Data based on emissions data from Jones et al. (2023) and population - data from HYDE and the UN World Population Prospects. Jones et al. (2023) - give nitrous oxide emissions in standard metric tonnes per year. We have - converted these emissions to carbon-dioxide equivalents over a 100-year - timescale using a conversion factor of 273 (as per the IPCC AR6 report). - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 - annual_ghg_co2eq_per_capita: - title: Greenhouse gas emissions per person - short_unit: t - unit: tonnes - description: Greenhouse gas emissions per person are calculated by Our World - in Data based on emissions data from Jones et al. (2023) and population - data from HYDE and the UN World Population Prospects. Jones et al. 
(2023) - give methane and nitrous oxide emissions in standard metric tonnes per year. - We have converted these emissions to carbon-dioxide equivalents over a 100-year - timescale using a conversion factor of 273 for nitrous oxide, 29.8 for methane - from fossil sources, and 27.2 for methane from agricultural and land use - sources (as per the IPCC AR6 report). - sources: - - name: Matthew Jones et al. (2023) - published_by: Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, - Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard - A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). - National contributions to climate change due to historical emissions of - carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data - (2023.1). - publication_year: 2023 - url: https://doi.org/10.5281/zenodo.7636699 diff --git a/etl/steps/archive/grapher/fasttrack/2023-03-27/global_warming_contributions.py b/etl/steps/archive/grapher/fasttrack/2023-03-27/global_warming_contributions.py deleted file mode 100644 index 9cc80f82c78..00000000000 --- a/etl/steps/archive/grapher/fasttrack/2023-03-27/global_warming_contributions.py +++ /dev/null @@ -1,19 +0,0 @@ -import pandas as pd -from owid import catalog - -from etl.helpers import PathFinder, create_dataset -from etl.snapshot import Snapshot - -P = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # load snapshot - data = pd.read_csv(Snapshot("fasttrack/2023-03-27/global_warming_contributions.csv").path) - - # create empty dataframe and table - tb = catalog.Table(data, short_name=P.short_name) - - # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb]) - ds.save() diff --git a/etl/steps/archive/grapher/fasttrack/latest/monadic_strategic_nuclear_forces.meta.yml b/etl/steps/archive/grapher/fasttrack/latest/monadic_strategic_nuclear_forces.meta.yml deleted file mode 100644 index 8e134819679..00000000000 --- a/etl/steps/archive/grapher/fasttrack/latest/monadic_strategic_nuclear_forces.meta.yml +++ /dev/null @@ -1,67 +0,0 @@ -dataset: - title: Monadic Strategic Nuclear Forces Dataset – Suh - description: >- - This dataset provides information on the strategic nuclear forces by the nuclear powers, using monadic data from Suh (2022) - - - You can download the code and complete dataset, including supplementary variables, from GitHub: https://github.com/owid/notebooks/tree/main/BastianHerre/nuclear_weapons - licenses: - - {} - sources: - - name: Suh (2023) - url: https://kyungwon-suh.github.io/datasets/ - date_accessed: '2023-05-08' - publication_year: '2023' - published_by: 'Kyungwon, Suh. 2023. Nuclear balance and the initiation of nuclear crises: Does superiority matter? Journal - of Peace Research 60(2): 337–351.' -tables: - monadic_strategic_nuclear_forces: - variables: - nuclear_warheads_stockpile: - title: nuclear_warheads_stockpile - description: The number of stockpiled nuclear warheads. - unit: '' - nuclear_warheads_firststrike: - title: nuclear_warheads_firststrike - description: >- - The number of strategic nuclear warheads deliverable in first strike. - - - Warheads designed for use away from the battlefield, such as against military bases, arms industries, or infrastructure - and that could be carried by ballistic missiles, bombers, and submarines in one strike. - unit: '' - nuclwarh_firststr_yield: - title: nuclwarh_firststr_yield - description: >- - The number of megatons of strategic nuclear warheads deliverable in first strike. 
- - - Strategic nuclear warheads designed for use away from the battlefield, such as against military bases, arms industries, - or infrastructure and that could be carried by ballistic missiles, bombers, and submarines in one strike. - - - Equivalent megatons, meaning that small warheads (equal or less than one megaton) are weighted more because they - are relatively more destructive than large ones. - unit: equivalent megatons - short_unit: Mt - nuclwarh_firststr_area: - title: nuclwarh_firststr_area - description: >- - The area destroyable by strategic nuclear warheads deliverable in first strike. - - - Destruction of unprotected structures, such as buildings and factories. - - - Strategic nuclear warheads designed for use away from the battlefield, such as against military bases, arms industries, - or infrastructure and that could be carried by ballistic missiles, bombers, and submarines in one strike. - - - Destroyable area in square kilometers, calculated by multiplying the equivalent megatonnage and the approximate - area destroyed by one megaton (20 square miles), and then converting it into square kilometers. - - - Approximate area destroyed by one megaton based on: Richelson, Jeffrey. 1980. Evaluating the Strategic Balance. - American Journal of Political Science 24(4): 782. - unit: square kilometers - short_unit: km² diff --git a/etl/steps/archive/grapher/fasttrack/latest/monadic_strategic_nuclear_forces.py b/etl/steps/archive/grapher/fasttrack/latest/monadic_strategic_nuclear_forces.py deleted file mode 100644 index b69dcf34db9..00000000000 --- a/etl/steps/archive/grapher/fasttrack/latest/monadic_strategic_nuclear_forces.py +++ /dev/null @@ -1,22 +0,0 @@ -import pandas as pd -from owid import catalog - -from etl.helpers import PathFinder, create_dataset -from etl.snapshot import Snapshot - -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # load snapshot - snap = Snapshot("fasttrack/latest/monadic_strategic_nuclear_forces.csv") - - # load data - data = pd.read_csv(snap.path) - - # create empty dataframe and table - tb = catalog.Table(data, short_name=paths.short_name) - - # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) - ds.save() diff --git a/etl/steps/archive/grapher/fasttrack/latest/nuclear_warhead_inventories.meta.yml b/etl/steps/archive/grapher/fasttrack/latest/nuclear_warhead_inventories.meta.yml deleted file mode 100644 index a2c0622c465..00000000000 --- a/etl/steps/archive/grapher/fasttrack/latest/nuclear_warhead_inventories.meta.yml +++ /dev/null @@ -1,58 +0,0 @@ -dataset: - title: Nuclear warhead inventories – Federation of American Scientists - description: >- - This dataset provides information on the nuclear warhead inventories by the nuclear powers, using data from the Federation - of American Scientists, prepared by Hans M. Kristensen, Matt Korda, and Robert Norris. - - - You can download the code and complete dataset, including supplementary variables, from GitHub: https://github.com/owid/notebooks/tree/main/BastianHerre/nuclear_weapons - - Publisher source: None - licenses: - - {} - sources: - - name: Federation of American Scientists (2023) - url: https://fas.org/issues/nuclear-weapons/status-world-nuclear-forces/ - date_accessed: '2023-03-28' - publication_year: .nan - published_by: Hans M. Kristensen, Matt Korda, Eliana Reynolds, and Robert Norris. 
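For reference, the two unit conversions spelled out in the deleted metadata above reduce to a handful of constants: the CO₂-equivalent conversion for methane and nitrous oxide (factors 29.8, 27.2, and 273 from the Jones et al. variable descriptions) and the destroyable-area calculation (from the nuclwarh_firststr_area description). A minimal sketch in Python; the function names are illustrative and do not come from the original steps:

GWP100_CH4_FOSSIL = 29.8  # IPCC AR6, methane from fossil sources
GWP100_CH4_LAND = 27.2    # IPCC AR6, methane from agriculture and land use
GWP100_N2O = 273          # IPCC AR6, nitrous oxide


def to_co2eq(tonnes_ch4_fossil=0.0, tonnes_ch4_land=0.0, tonnes_n2o=0.0):
    # Convert CH4 and N2O emissions (tonnes) into tonnes of CO2-equivalents
    # over a 100-year timescale, as described in the metadata above.
    return (
        tonnes_ch4_fossil * GWP100_CH4_FOSSIL
        + tonnes_ch4_land * GWP100_CH4_LAND
        + tonnes_n2o * GWP100_N2O
    )


SQ_MILES_DESTROYED_PER_MEGATON = 20  # approximate area destroyed by one megaton (Richelson 1980)
KM2_PER_SQ_MILE = 2.589988


def destroyable_area_km2(equivalent_megatons):
    # Area in square kilometers destroyable by the given equivalent megatonnage.
    return equivalent_megatons * SQ_MILES_DESTROYED_PER_MEGATON * KM2_PER_SQ_MILE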
-tables: - nuclear_warhead_inventories: - variables: - nuclear_weapons_depl_nonstrat: - title: nuclear_weapons_depl_nonstrat - description: |- - The variable denotes the estimated number of deployed nonstrategic nuclear warheads. - - Deployed warheads are those on ballistic missiles or bomber bases. - - Nonstrategic or tactical warheads are those for use on the battlefield. - unit: '' - nuclear_weapons_depl_strat: - title: nuclear_weapons_depl_strat - description: >- - The variable denotes the estimated number of deployed strategic nuclear warheads. - - - Deployed warheads are those on ballistic missiles or bomber bases. - - - Strategic warheads are those for use away from the battlefield, such as against military bases, arms industries, - or infrastructure. - unit: '' - nuclear_weapons_inventory: - title: nuclear_weapons_inventory - description: The variable denotes the estimated number of all nuclear warheads, be they deployed strategic, deployed - nonstrategic, nondeployed, or retired. - unit: '' - nuclear_weapons_reserve_nondepl: - title: nuclear_weapons_reserve_nondepl - description: |- - The variable denotes the estimated number of nondeployed nuclear warheads. - - Nondeployed or reserve warheads are those not on ballistic missiles or bomber bases. - unit: '' - nuclear_weapons_retired: - title: nuclear_weapons_retired - description: The variable denotes the estimated number of retired nuclear warheads queued for dismantlement. - unit: '' diff --git a/etl/steps/archive/grapher/fasttrack/latest/nuclear_warhead_inventories.py b/etl/steps/archive/grapher/fasttrack/latest/nuclear_warhead_inventories.py deleted file mode 100644 index 77cb9b7b743..00000000000 --- a/etl/steps/archive/grapher/fasttrack/latest/nuclear_warhead_inventories.py +++ /dev/null @@ -1,22 +0,0 @@ -import pandas as pd -from owid import catalog - -from etl.helpers import PathFinder, create_dataset -from etl.snapshot import Snapshot - -P = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # load snapshot - snap = Snapshot("fasttrack/latest/nuclear_warhead_inventories.csv") - - # load data - data = pd.read_csv(snap.path) - - # create empty dataframe and table - tb = catalog.Table(data, short_name=P.short_name) - - # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) - ds.save() diff --git a/etl/steps/archive/grapher/fasttrack/latest/nuclear_warhead_stockpiles.meta.yml b/etl/steps/archive/grapher/fasttrack/latest/nuclear_warhead_stockpiles.meta.yml deleted file mode 100644 index 1514eb96fd1..00000000000 --- a/etl/steps/archive/grapher/fasttrack/latest/nuclear_warhead_stockpiles.meta.yml +++ /dev/null @@ -1,29 +0,0 @@ -dataset: - title: Nuclear warhead stockpiles – Federation of American Scientists - description: >- - This dataset provides information on the number of stockpiled nuclear warheads by the nuclear powers, using data from - the Federation of American Scientists, prepared by Hans M. Kristensen, Matt Korda, Eliana Reynolds, and Robert Norris. - - - You can download the code and complete dataset, including supplementary variables, from GitHub: https://github.com/owid/notebooks/tree/main/BastianHerre/nuclear_weapons - - Publisher source: None - licenses: - - {} - sources: - - name: Federation of American Scientists (2023) - url: https://fas.org/issues/nuclear-weapons/status-world-nuclear-forces/ - date_accessed: '2023-03-28' - published_by: Hans M. 
Kristensen, Matt Korda, Eliana Reynolds, and Robert Norris. -tables: - nuclear_warhead_stockpiles: - variables: - nuclear_weapons_stockpile: - title: nuclear_weapons_stockpile - description: |- - The variable denotes the estimated number of nuclear warheads in the stockpiles of the nuclear powers. - - Stockpiles include warheads assigned to military forces, but exclude retired warheads queued for dismantlement. - - Retired warheads are only included in the global total. - unit: '' diff --git a/etl/steps/archive/grapher/fasttrack/latest/nuclear_warhead_stockpiles.py b/etl/steps/archive/grapher/fasttrack/latest/nuclear_warhead_stockpiles.py deleted file mode 100644 index 145b2565262..00000000000 --- a/etl/steps/archive/grapher/fasttrack/latest/nuclear_warhead_stockpiles.py +++ /dev/null @@ -1,22 +0,0 @@ -from etl.helpers import PathFinder, create_dataset, get_metadata_path -from etl.snapshot import Snapshot - -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # load snapshot - snap = Snapshot("fasttrack/latest/nuclear_warhead_stockpiles.csv") - - # load data - tb = snap.read_csv() - - # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) - - # override metadata if necessary - meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") - if meta_path.exists(): - ds.update_metadata(meta_path) - - ds.save() diff --git a/etl/steps/archive/grapher/gcp/2022-09-29/global_carbon_budget_additional.py b/etl/steps/archive/grapher/gcp/2022-09-29/global_carbon_budget_additional.py deleted file mode 100644 index 1c3ee42558e..00000000000 --- a/etl/steps/archive/grapher/gcp/2022-09-29/global_carbon_budget_additional.py +++ /dev/null @@ -1,16 +0,0 @@ -from owid import catalog - -from etl.helpers import PathFinder - -N = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # Create new grapher dataset. - dataset = catalog.Dataset.create_empty(dest_dir, N.garden_dataset.metadata) - # Load table from garden. - table = N.garden_dataset["global_carbon_budget_additional"].reset_index() - # Add table to dataset. - dataset.add(table) - # Save dataset. - dataset.save() diff --git a/etl/steps/archive/grapher/gcp/2022-11-11/global_carbon_budget.py b/etl/steps/archive/grapher/gcp/2022-11-11/global_carbon_budget.py deleted file mode 100644 index 6ba20e73314..00000000000 --- a/etl/steps/archive/grapher/gcp/2022-11-11/global_carbon_budget.py +++ /dev/null @@ -1,67 +0,0 @@ -"""Grapher step for Global Carbon Budget dataset. - -Some auxiliary variables will be added (where nans are filled with zeros, to avoid missing data in stacked area charts). - -""" - -from copy import deepcopy - -import numpy as np -import pandas as pd -from owid import catalog - -from etl.helpers import PathFinder - -# For two stacked area charts (namely "CO₂ emissions by fuel type" and "Cumulative CO₂ emissions by source") having -# nans in the data causes the chart to show only years where all sources have data. -# To avoid this, create additional variables that have nans filled with zeros. 
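The zero-filling pattern applied by this deleted step (and repeated in the later version below) is compact enough to summarize. A minimal sketch, assuming a plain pandas DataFrame; the actual step operates on an owid.catalog Table and also deep-copies each variable's metadata, appending "(zero filled)" to the title:

import pandas as pd

# Hypothetical column with gaps; in the step itself the columns come from
# VARIABLES_TO_FILL_WITH_ZEROS, listed just below.
table = pd.DataFrame({"emissions_total": [10.0, None, 12.0]})

# Twin column with NaNs replaced by zeros, so stacked area charts do not
# drop years in which any single source lacks data.
table["emissions_total_zero_filled"] = table["emissions_total"].fillna(0)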
-VARIABLES_TO_FILL_WITH_ZEROS = [ - "emissions_total", - "emissions_from_cement", - "emissions_from_coal", - "emissions_from_flaring", - "emissions_from_gas", - "emissions_from_land_use_change", - "emissions_from_oil", - "emissions_from_other_industry", - "cumulative_emissions_total", - "cumulative_emissions_from_cement", - "cumulative_emissions_from_coal", - "cumulative_emissions_from_flaring", - "cumulative_emissions_from_gas", - "cumulative_emissions_from_land_use_change", - "cumulative_emissions_from_oil", - "cumulative_emissions_from_other_industry", -] - -N = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # Create a new Grapher dataset. - dataset = catalog.Dataset.create_empty(dest_dir, N.garden_dataset.metadata) - - # Load table from Garden dataset. - table = N.garden_dataset["global_carbon_budget"] - - # Ensure all countries span all years (from 1750 to the latest observation), even if many of those rows are empty. - # This will increase the size of the dataset, but we do this so that stacked area charts span the maximum possible - # range of years. - countries = table.reset_index()["country"].unique() - years = np.arange(table.reset_index()["year"].min(), table.reset_index()["year"].max() + 1, dtype=int) - table = table.reindex(pd.MultiIndex.from_product([countries, years], names=["country", "year"])) - - # Create additional variables in the table that have nans filled with zeros (for two specific stacked area charts). - for variable in VARIABLES_TO_FILL_WITH_ZEROS: - new_variable_name = variable + "_zero_filled" - table[new_variable_name] = table[variable].fillna(0) - table[new_variable_name].metadata = deepcopy(table[variable].metadata) - table[new_variable_name].metadata.title = table[variable].metadata.title + " (zero filled)" - table[new_variable_name].metadata.description = ( - table[variable].metadata.description + " Missing data has been filled with zeros for the purposes of data " - "visualization." - ) - - # Add table to Grapher dataset and save dataset. - dataset.add(table) - dataset.save() diff --git a/etl/steps/archive/grapher/gcp/2023-04-28/global_carbon_budget.py b/etl/steps/archive/grapher/gcp/2023-04-28/global_carbon_budget.py deleted file mode 100644 index 78d07f6e953..00000000000 --- a/etl/steps/archive/grapher/gcp/2023-04-28/global_carbon_budget.py +++ /dev/null @@ -1,81 +0,0 @@ -"""Load a garden dataset and create a grapher dataset. - -Some auxiliary variables will be added (where nans are filled with zeros, to avoid missing data in stacked area charts). - -""" -from copy import deepcopy - -import numpy as np -import pandas as pd -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset, grapher_checks - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# For two stacked area charts (namely "CO₂ emissions by fuel type" and "Cumulative CO₂ emissions by source") having -# nans in the data causes the chart to show only years where all sources have data. -# To avoid this, create additional variables that have nans filled with zeros. 
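Besides zero-filling, both versions of this step pad the table so that every country spans the full range of years, leaving NaNs where data is missing (the reindex calls in the surrounding run() functions). The core of that operation, sketched with made-up data:

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {"country": ["France", "France", "Japan"], "year": [1750, 1752, 1751], "emissions_total": [1.0, 2.0, 3.0]}
).set_index(["country", "year"])

# Build the full country-by-year grid and reindex onto it, so stacked area
# charts can span the maximum possible range of years.
countries = df.reset_index()["country"].unique()
years = np.arange(df.reset_index()["year"].min(), df.reset_index()["year"].max() + 1, dtype=int)
df = df.reindex(pd.MultiIndex.from_product([countries, years], names=["country", "year"]))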
-VARIABLES_TO_FILL_WITH_ZEROS = [ - "emissions_total", - "emissions_from_cement", - "emissions_from_coal", - "emissions_from_flaring", - "emissions_from_gas", - "emissions_from_land_use_change", - "emissions_from_oil", - "emissions_from_other_industry", - "cumulative_emissions_total", - "cumulative_emissions_from_cement", - "cumulative_emissions_from_coal", - "cumulative_emissions_from_flaring", - "cumulative_emissions_from_gas", - "cumulative_emissions_from_land_use_change", - "cumulative_emissions_from_oil", - "cumulative_emissions_from_other_industry", -] - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load garden dataset. - ds_garden: Dataset = paths.load_dependency("global_carbon_budget") - - # Read table from garden dataset. - tb_garden = ds_garden["global_carbon_budget"] - - # - # Process data. - # - # Ensure all countries span all years (from 1750 to the latest observation), even if many of those rows are empty. - # This will increase the size of the dataset, but we do this so that stacked area charts span the maximum possible - # range of years. - countries = tb_garden.reset_index()["country"].unique() - years = np.arange(tb_garden.reset_index()["year"].min(), tb_garden.reset_index()["year"].max() + 1, dtype=int) - tb_garden = tb_garden.reindex(pd.MultiIndex.from_product([countries, years], names=["country", "year"])) - - # Create additional variables in the table that have nans filled with zeros (for two specific stacked area charts). - for variable in VARIABLES_TO_FILL_WITH_ZEROS: - new_variable_name = variable + "_zero_filled" - tb_garden[new_variable_name] = tb_garden[variable].fillna(0) - tb_garden[new_variable_name].metadata = deepcopy(tb_garden[variable].metadata) - tb_garden[new_variable_name].metadata.title = tb_garden[variable].metadata.title + " (zero filled)" - tb_garden[new_variable_name].metadata.description = ( - tb_garden[variable].metadata.description + " Missing data has been filled with zeros for the purposes of " - "data visualization." - ) - - # - # Save outputs. - # - # Create a new grapher dataset with the same metadata as the garden dataset. - ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) - - # Sanity checks. - grapher_checks(ds_grapher) - - # Save changes in the new grapher dataset. - ds_grapher.save() diff --git a/etl/steps/archive/grapher/gcp/2023-07-10/global_carbon_budget.meta.yml b/etl/steps/archive/grapher/gcp/2023-07-10/global_carbon_budget.meta.yml deleted file mode 100644 index fb5f8fc7a75..00000000000 --- a/etl/steps/archive/grapher/gcp/2023-07-10/global_carbon_budget.meta.yml +++ /dev/null @@ -1,107 +0,0 @@ -dataset: - update_period_days: 365 - sources: [] - -definitions: - common: - sources: [] - origins: - - producer: Global Carbon Project - title: Global Carbon Budget - description: | - The Global Carbon Budget 2022 has over 105 contributors from 80 organizations and 18 countries. It was founded by the Global Carbon Project international science team to track the trends in global carbon emissions and sinks and is a key measure of progress towards the goals of the Paris Agreement. It's widely recognized as the most comprehensive report of its kind. The 2022 report was published at COP27 in Egypt on Friday 11th November. - citation_full: | - Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Gregor, L., Hauck, J., Le Quéré, C., Luijkx, I. T., Olsen, A., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. 
B., Alin, S. R., Alkama, R., Arneth, A., Arora, V. K., Bates, N. R., Becker, M., Bellouin, N., Bittig, H. C., Bopp, L., Chevallier, F., Chini, L. P., Cronin, M., Evans, W., Falk, S., Feely, R. A., Gasser, T., Gehlen, M., Gkritzalis, T., Gloege, L., Grassi, G., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jain, A. K., Jersild, A., Kadono, K., Kato, E., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Landschützer, P., Lefèvre, N., Lindsay, K., Liu, J., Liu, Z., Marland, G., Mayot, N., McGrath, M. J., Metzl, N., Monacci, N. M., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K., Ono, T., Palmer, P. I., Pan, N., Pierrot, D., Pocock, K., Poulter, B., Resplandy, L., Robertson, E., Rödenbeck, C., Rodriguez, C., Rosan, T. M., - Schwinger, J., Séférian, R., Shutler, J. D., Skjelvan, I., Steinhoff, T., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tanhua, T., Tans, P. P., Tian, X., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., Walker, A. P., Wanninkhof, R., Whitehead, C., Willstrand Wranne, A., Wright, R., Yuan, W., Yue, C., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2022, Earth Syst. Sci. Data, 14, 4811-4900, https://doi.org/10.5194/essd-14-4811-2022, - 2022. - url_main: https://globalcarbonbudget.org/ - url_download: https://zenodo.org/record/7215364/files/GCB2022v27_MtCO2_flat.csv - date_accessed: '2023-04-28' - date_published: '2022-11-11' - license: - name: CC BY 4.0 - url: https://zenodo.org/record/7215364 - -tables: - global_carbon_budget: - variables: - consumption_emissions_per_capita: - unit: tonnes per person - short_unit: t/person - title: Per capita consumption-based CO₂ emissions - description_short: | - Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes per person. - description_processing: | - Consumption-based CO₂ emissions have been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664. - display: - shortUnit: t - numDecimalPlaces: 0 - description_key: - - Consumption-based emissions attribute the emissions generated in the production of goods and services according to where they were _consumed_, rather than where they were _produced_. - - "The data is calculated by adjusting 'production-based' emissions (emissions produced domestically) for trade: Consumption-based emissions equals production-based emissions, _minus_ emissions embedded in exports, _plus_ emissions embedded in imports." - - If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. If its consumption-based emissions are lower, then it is a net exporter. - - Per capita emissions represent the emissions of an average person in a country or region - they are total emissions divided by population. - - Consumption-based emissions are not available for all countries because not all countries have sufficient, high-quality trade data. But those without complete data are a small fraction (3%) of the global total. - - This data measures Carbon dioxide (CO₂) emissions from fossil fuels and industry and does not include emissions from land use change, deforestation, soils, or vegetation. - - Emissions from international aviation and shipping are not included in any country or region's emissions. They are only included in the global total emissions. 
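The trade adjustment and the carbon-to-CO₂ conversion described in this metadata are simple arithmetic. A minimal sketch; the variable names are illustrative:

CARBON_TO_CO2 = 3.664  # molecular-weight ratio of CO2 to carbon (44.01 / 12.011)


def consumption_based_emissions(production, embedded_in_exports, embedded_in_imports):
    # Consumption-based emissions: production-based emissions, minus emissions
    # embedded in exports, plus emissions embedded in imports.
    return production - embedded_in_exports + embedded_in_imports


# Converting tonnes of carbon, as reported in the original data, into tonnes of CO2:
tonnes_co2 = 1000.0 * CARBON_TO_CO2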
- presentation: - title_public: Per capita consumption-based CO₂ emissions - attribution_short: Global Carbon Project - topic_tags: - - CO2 & Greenhouse Gas Emissions - - Climate Change - - Energy - faqs: - - fragment_id: emissions-from-aviation-and-shipping - gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw - - fragment_id: missing-consumption-based-emissions - gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw - grapher_config: - title: Per capita consumption-based CO₂ emissions - subtitle: >- - [Consumption-based emissions](#dod:consumptionbasedemissions) are national - emissions that have been adjusted for trade. It's production-based emissions - minus emissions embedded in exports, plus emissions embedded in imports. - hideAnnotationFieldsInTitle: - time: true - entity: true - changeInPrefix: true - minTime: 1990 - hideRelativeToggle: false - hasMapTab: true - tab: map - originUrl: https://ourworldindata.org/co2-and-greenhouse-gas-emissions - yAxis: - min: 0 - max: 0 - colorScale: - binningStrategy: equalInterval - map: - colorScale: - baseColorScheme: Reds - binningStrategy: manual - # TODO: these intervals are not well chosen according to our map bracket guidelines - customNumericValues: - - 1 - - 2.5 - - 5 - - 7.5 - - 10 - - 15 - - 20 - - 50 - customNumericColors: - - null - - null - selectedEntityNames: - - United States - - United Kingdom - - European Union (27) - - China - - India - - Australia - - Brazil - - South Africa - relatedQuestions: - - url: https://ourworldindata.org/grapher/consumption-co2-per-capita#faqs - text: FAQs on this data diff --git a/etl/steps/archive/grapher/gcp/2023-07-10/global_carbon_budget.py b/etl/steps/archive/grapher/gcp/2023-07-10/global_carbon_budget.py deleted file mode 100644 index 8779c9c89df..00000000000 --- a/etl/steps/archive/grapher/gcp/2023-07-10/global_carbon_budget.py +++ /dev/null @@ -1,50 +0,0 @@ -"""Load a garden dataset and create a grapher dataset. - -Some auxiliary variables will be added (where nans are filled with zeros, to avoid missing data in stacked area charts). - -""" - -import numpy as np -import pandas as pd -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset, grapher_checks - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load garden dataset and read its main table. - ds_garden: Dataset = paths.load_dependency("global_carbon_budget") - tb_garden = ds_garden["global_carbon_budget"] - - # - # Process data. - # - # Ensure all countries span all years (from 1750 to the latest observation), even if many of those rows are empty. - # This will increase the size of the dataset, but we do this so that stacked area charts span the maximum possible - # range of years. - countries = tb_garden.reset_index()["country"].unique() - years = np.arange(tb_garden.reset_index()["year"].min(), tb_garden.reset_index()["year"].max() + 1, dtype=int) - tb_garden = tb_garden.reindex(pd.MultiIndex.from_product([countries, years], names=["country", "year"])) - - # - # Save outputs. - # - # Create a new grapher dataset with the same metadata as the garden dataset. - ds_grapher = create_dataset( - dest_dir, - tables=[tb_garden], - default_metadata=ds_garden.metadata, - check_variables_metadata=True, - ) - - # Sanity checks. - grapher_checks(ds_grapher) - - # Save changes in the new grapher dataset. 
- ds_grapher.save() diff --git a/etl/steps/archive/grapher/health/2022-12-28/deaths_karlinsky.py b/etl/steps/archive/grapher/health/2022-12-28/deaths_karlinsky.py deleted file mode 100644 index 6eb2ee887d5..00000000000 --- a/etl/steps/archive/grapher/health/2022-12-28/deaths_karlinsky.py +++ /dev/null @@ -1,21 +0,0 @@ -from owid import catalog - -from etl.helpers import PathFinder - -N = PathFinder(__file__) - - -TABLE_NAME = "deaths" - - -def run(dest_dir: str) -> None: - # get dataset from garden - dataset = catalog.Dataset.create_empty(dest_dir, N.garden_dataset.metadata) - - # get table from garden - table = N.garden_dataset[TABLE_NAME] - - # add table - dataset.add(table) - - dataset.save() diff --git a/etl/steps/archive/grapher/homicide/2023-01-27/unodc.py b/etl/steps/archive/grapher/homicide/2023-01-27/unodc.py deleted file mode 100644 index c6776324461..00000000000 --- a/etl/steps/archive/grapher/homicide/2023-01-27/unodc.py +++ /dev/null @@ -1,17 +0,0 @@ -from owid import catalog - -from etl.helpers import PathFinder - -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - dataset = catalog.Dataset.create_empty(dest_dir, paths.meadow_dataset.metadata) - - table_names = paths.garden_dataset.table_names - # if your data is in long format, you can use `grapher_helpers.long_to_wide_tables` - # to get into wide format - for table_name in table_names: - table = paths.garden_dataset[table_name] - dataset.add(table) - dataset.save() diff --git a/etl/steps/archive/grapher/ihme_gbd/2023-03-29/gbd_drug_disorders.py b/etl/steps/archive/grapher/ihme_gbd/2023-03-29/gbd_drug_disorders.py deleted file mode 100644 index 18f3b8e4bac..00000000000 --- a/etl/steps/archive/grapher/ihme_gbd/2023-03-29/gbd_drug_disorders.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Load a garden dataset and create a grapher dataset.""" - -from owid.catalog import Dataset - -from etl.helpers import PathFinder, create_dataset, grapher_checks - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load garden dataset. - ds_garden: Dataset = paths.load_dependency("gbd_drug_disorders") - - # Read table from garden dataset. - tb_garden = ds_garden["gbd_drug_disorders"] - - # - # Process data. - # - - # - # Save outputs. - # - # Create a new grapher dataset with the same metadata as the garden dataset. - ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) - - # - # Checks. - # - grapher_checks(ds_garden) - - # Save changes in the new grapher dataset. - ds_grapher.save() diff --git a/etl/steps/archive/grapher/irena/2022-10-20/renewable_electricity_capacity_by_technology.py b/etl/steps/archive/grapher/irena/2022-10-20/renewable_electricity_capacity_by_technology.py deleted file mode 100644 index 2d7336ba65e..00000000000 --- a/etl/steps/archive/grapher/irena/2022-10-20/renewable_electricity_capacity_by_technology.py +++ /dev/null @@ -1,41 +0,0 @@ -from owid import catalog - -from etl.paths import DATA_DIR - -# Details for output dataset. -DATASET_NAME = "renewable_electricity_capacity_by_technology" -DATASET_TITLE = "Renewable electricity capacity by technology" -# Details for input dataset. -GARDEN_DATASET_PATH = DATA_DIR / "garden/irena/2022-10-20/renewable_electricity_capacity" - - -def run(dest_dir: str) -> None: - # Load dataset from Garden. - garden_dataset = catalog.Dataset(GARDEN_DATASET_PATH) - # Load main table from dataset. 
-    table = garden_dataset[garden_dataset.table_names[0]]
-
-    # Get the human-readable names of the technologies from the variable metadata.
-    rename_technologies = {variable: table[variable].metadata.title for variable in table.columns}
-
-    # Simplify table to consider only the World.
-    # Here we use "country" to refer to a technology.
-    # This is a workaround, so that grapher will let us select technologies as it does with countries.
-    table = table.loc["World"].reset_index().melt(id_vars="year", var_name="country", value_name="capacity")
-
-    # Rename technologies conveniently.
-    table = table.replace(rename_technologies)
-
-    # Set appropriate metadata.
-    table["capacity"].metadata.title = "Capacity"
-    table["capacity"].metadata.unit = "Megawatts"
-    table["capacity"].metadata.short_unit = "MW"
-    table["capacity"].metadata.display = {"numDecimalPlaces": 0}
-
-    # Create new dataset.
-    dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata)
-    dataset.metadata.title = DATASET_TITLE
-    dataset.metadata.short_name = DATASET_NAME
-
-    dataset.add(table)
-    dataset.save()
diff --git a/etl/steps/archive/grapher/irena/2022-10-20/renewable_power_generation_costs.py b/etl/steps/archive/grapher/irena/2022-10-20/renewable_power_generation_costs.py
deleted file mode 100644
index 4e47d662bbb..00000000000
--- a/etl/steps/archive/grapher/irena/2022-10-20/renewable_power_generation_costs.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from owid import catalog
-
-from etl.helpers import PathFinder
-
-# Get naming conventions.
-N = PathFinder(__file__)
-
-
-def run(dest_dir: str) -> None:
-    # Create new grapher dataset.
-    dataset = catalog.Dataset.create_empty(dest_dir, N.garden_dataset.metadata)
-    # Get main table from Garden dataset.
-    table = N.garden_dataset["renewable_power_generation_costs"]
-    # Add table to new Grapher dataset and save dataset.
-    dataset.add(table)
-    dataset.save()
diff --git a/etl/steps/archive/grapher/irena/2023-01-04/renewable_electricity_capacity_by_technology.py b/etl/steps/archive/grapher/irena/2023-01-04/renewable_electricity_capacity_by_technology.py
deleted file mode 100644
index 6f264a43718..00000000000
--- a/etl/steps/archive/grapher/irena/2023-01-04/renewable_electricity_capacity_by_technology.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from owid import catalog
-
-from etl.helpers import PathFinder
-
-# Get paths and naming conventions for current step.
-paths = PathFinder(__file__)
-
-# Details for output dataset.
-DATASET_TITLE = "Renewable electricity capacity by technology (IRENA, 2023)"
-
-
-def run(dest_dir: str) -> None:
-    # Load dataset from Garden.
-    ds_garden: catalog.Dataset = paths.load_dependency("renewable_electricity_capacity")
-    # Load main table from dataset.
-    table = ds_garden["renewable_electricity_capacity"]
-
-    # Get the human-readable names of the technologies from the variable metadata.
-    rename_technologies = {variable: table[variable].metadata.title for variable in table.columns}
-
-    # Simplify table to consider only the World.
-    # Here we use "country" to refer to a technology.
-    # This is a workaround, so that grapher will let us select technologies as it does with countries.
-    table = table.loc["World"].reset_index().melt(id_vars="year", var_name="country", value_name="capacity")
-
-    # Rename technologies conveniently.
-    table = table.replace(rename_technologies)
-
-    # Set appropriate metadata.
-    table["country"].metadata.unit = None
-    table["country"].metadata.short_unit = None
-    table["capacity"].metadata.title = "Capacity"
-    table["capacity"].metadata.unit = "megawatts"
-    table["capacity"].metadata.short_unit = "MW"
-    table["capacity"].metadata.display = {"numDecimalPlaces": 0}
-
-    # Create new dataset.
-    dataset = catalog.Dataset.create_empty(dest_dir, ds_garden.metadata)
-    dataset.metadata.title = DATASET_TITLE
-    dataset.metadata.short_name = paths.short_name
-
-    dataset.add(table)
-    dataset.save()
diff --git a/etl/steps/archive/grapher/irena/2023-01-04/renewable_power_generation_costs.py b/etl/steps/archive/grapher/irena/2023-01-04/renewable_power_generation_costs.py
deleted file mode 100644
index d3812dac69a..00000000000
--- a/etl/steps/archive/grapher/irena/2023-01-04/renewable_power_generation_costs.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from owid import catalog
-
-from etl.helpers import PathFinder
-
-# Get paths and naming conventions for current step.
-paths = PathFinder(__file__)
-
-
-def run(dest_dir: str) -> None:
-    # Load Garden dataset.
-    ds_garden: catalog.Dataset = paths.load_dependency("renewable_power_generation_costs")
-    # Get main table from Garden dataset.
-    table = ds_garden["renewable_power_generation_costs"]
-
-    # Create new grapher dataset.
-    dataset = catalog.Dataset.create_empty(dest_dir, ds_garden.metadata)
-
-    # Add table to new Grapher dataset and save dataset.
-    dataset.add(table)
-    dataset.save()
diff --git a/etl/steps/archive/grapher/met_office_hadley_centre/2023-01-02/near_surface_temperature.py b/etl/steps/archive/grapher/met_office_hadley_centre/2023-01-02/near_surface_temperature.py
deleted file mode 100644
index 0b765bc6b6f..00000000000
--- a/etl/steps/archive/grapher/met_office_hadley_centre/2023-01-02/near_surface_temperature.py
+++ /dev/null
@@ -1,24 +0,0 @@
-"""Load garden dataset of near surface temperature by Met Office Hadley Centre, and create a grapher dataset.
-
-"""
-
-from owid import catalog
-
-from etl.helpers import PathFinder
-
-N = PathFinder(__file__)
-
-
-def run(dest_dir: str) -> None:
-    # Load table from garden dataset.
-    table = N.garden_dataset["near_surface_temperature"].reset_index()
-
-    # For compatibility with grapher, change the name of "region" column to "country".
-    table = table.rename(columns={"region": "country"})
-
-    # Create new grapher dataset.
-    dataset = catalog.Dataset.create_empty(dest_dir, N.garden_dataset.metadata)
-    dataset.metadata.short_name = N.short_name
-    # Add table to dataset and save dataset.
-    dataset.add(table)
-    dataset.save()
diff --git a/etl/steps/archive/grapher/met_office_hadley_centre/2023-01-17/near_surface_temperature.py b/etl/steps/archive/grapher/met_office_hadley_centre/2023-01-17/near_surface_temperature.py
deleted file mode 100644
index 9ea4a3ddf50..00000000000
--- a/etl/steps/archive/grapher/met_office_hadley_centre/2023-01-17/near_surface_temperature.py
+++ /dev/null
@@ -1,38 +0,0 @@
-"""Load a garden dataset and create a grapher dataset."""
-
-from owid.catalog import Dataset
-
-from etl.helpers import PathFinder
-
-# Get paths and naming conventions for current step.
-paths = PathFinder(__file__)
-
-
-def run(dest_dir: str) -> None:
-    #
-    # Load inputs.
-    #
-    # Load garden dataset.
-    ds_garden: Dataset = paths.load_dependency("near_surface_temperature")
-
-    # Read table from garden dataset.
-    tb_garden = ds_garden["near_surface_temperature"].reset_index()
-
-    # For compatibility with grapher, change the name of "region" column to "country".
-    tb_garden = tb_garden.rename(columns={"region": "country"})
-
-    #
-    # Process data.
-    #
-
-    #
-    # Save outputs.
-    #
-    # Create a new grapher dataset with the same metadata as the garden dataset.
-    ds_grapher = Dataset.create_empty(dest_dir, ds_garden.metadata)
-
-    # Add table of processed data to the new dataset.
-    ds_grapher.add(tb_garden)
-
-    # Save changes in the new grapher dataset.
-    ds_grapher.save()
diff --git a/etl/steps/archive/grapher/papers/2023-01-04/farmer_lafond_2016.py b/etl/steps/archive/grapher/papers/2023-01-04/farmer_lafond_2016.py
deleted file mode 100644
index c84b00ccd13..00000000000
--- a/etl/steps/archive/grapher/papers/2023-01-04/farmer_lafond_2016.py
+++ /dev/null
@@ -1,48 +0,0 @@
-"""Load garden dataset for Farmer & Lafond (2016) data and create a grapher dataset.
-
-"""
-
-from owid import catalog
-
-from etl.helpers import PathFinder
-
-# Get paths and naming conventions for current data step.
-paths = PathFinder(__file__)
-
-
-def run(dest_dir: str) -> None:
-    #
-    # Load data.
-    #
-    ds_garden: catalog.Dataset = paths.load_dependency("farmer_lafond_2016")
-    tb_garden = ds_garden["farmer_lafond_2016"]
-
-    #
-    # Process data.
-    #
-    # Replace snake-case names by the original technology names.
-    tb_garden.columns = [tb_garden[column].metadata.title for column in tb_garden.columns]
-
-    # For better visualization, divide the costs of DNA sequencing by 1000, as done in the original paper by Farmer & Lafond (2016).
-    tb_garden["DNA sequencing"] /= 1000
-
-    # Convert table to long format, and rename column so that it can be treated as a country in grapher.
-    # This way, we can select technologies as we usually do with countries.
-    tb_garden = (
-        tb_garden.reset_index()
-        .melt(id_vars="year", var_name="country", value_name="cost")
-        .dropna()
-        .reset_index(drop=True)
-    )
-    tb_garden["cost"].metadata.title = "Technology cost"
-    tb_garden["cost"].metadata.unit = "various units"
-
-    #
-    # Save outputs.
-    #
-    # Create a new grapher dataset.
-    dataset = catalog.Dataset.create_empty(dest_dir, ds_garden.metadata)
-
-    # Add table to dataset and save dataset.
-    dataset.add(tb_garden)
-    dataset.save()
diff --git a/etl/steps/archive/grapher/rff/2022-09-14/emissions_weighted_carbon_price.py b/etl/steps/archive/grapher/rff/2022-09-14/emissions_weighted_carbon_price.py
deleted file mode 100644
index e115927212f..00000000000
--- a/etl/steps/archive/grapher/rff/2022-09-14/emissions_weighted_carbon_price.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from owid import catalog
-
-from etl.helpers import PathFinder
-
-N = PathFinder(__file__)
-
-
-def run(dest_dir: str) -> None:
-    # Create new grapher dataset with the metadata from the garden dataset.
-    dataset = catalog.Dataset.create_empty(dest_dir, N.garden_dataset.metadata)
-    dataset.metadata.short_name = N.short_name
-    # Load table from garden dataset.
-    table = N.garden_dataset["emissions_weighted_carbon_price"].reset_index()
-    # Add table to new grapher dataset.
-    dataset.add(table)
-    # Save dataset.
-    dataset.save()
diff --git a/etl/steps/archive/grapher/rff/2022-09-14/world_carbon_pricing_any_sector.py b/etl/steps/archive/grapher/rff/2022-09-14/world_carbon_pricing_any_sector.py
deleted file mode 100644
index d3fab683f41..00000000000
--- a/etl/steps/archive/grapher/rff/2022-09-14/world_carbon_pricing_any_sector.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from owid import catalog
-
-from etl.helpers import PathFinder
-from etl.paths import DATA_DIR, STEP_DIR
-
-N = PathFinder(__file__)
-
-# Details of input garden dataset and table.
-GARDEN_DATASET_NAME = "world_carbon_pricing"
-GARDEN_DATASET_PATH = DATA_DIR / "garden" / "rff" / "2022-09-14" / GARDEN_DATASET_NAME
-GARDEN_TABLE_NAME = "world_carbon_pricing_any_sector"
-# Details of output grapher dataset and table.
-GRAPHER_DATASET_TITLE = "World carbon pricing for any sector"
-GRAPHER_DATASET_NAME = GARDEN_TABLE_NAME
-GRAPHER_METADATA_PATH = STEP_DIR / "grapher" / "rff" / "2022-09-14" / f"{GRAPHER_DATASET_NAME}.meta.yml"
-
-
-def run(dest_dir: str) -> None:
-    #
-    # Load data.
-    #
-    # Read dataset from garden.
-    ds_garden = catalog.Dataset(GARDEN_DATASET_PATH)
-    ds_garden.metadata.short_name = N.short_name
-    # Get table from dataset.
-    tb_garden = ds_garden[GARDEN_TABLE_NAME]
-
-    #
-    # Save outputs.
-    #
-    # Prepare metadata for new grapher dataset.
-    grapher_metadata = ds_garden.metadata
-    grapher_metadata.title = GRAPHER_DATASET_TITLE
-    # Create new grapher dataset.
-    ds_grapher = catalog.Dataset.create_empty(dest_dir, grapher_metadata)
-    # Add table to new dataset.
-    ds_grapher.add(tb_garden)
-    # Save dataset.
-    ds_grapher.save()
diff --git a/etl/steps/archive/grapher/rff/2022-10-11/emissions_weighted_carbon_price.py b/etl/steps/archive/grapher/rff/2022-10-11/emissions_weighted_carbon_price.py
deleted file mode 100644
index 99c1aa8aa43..00000000000
--- a/etl/steps/archive/grapher/rff/2022-10-11/emissions_weighted_carbon_price.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from owid import catalog
-from shared import CURRENT_DIR
-
-from etl.helpers import PathFinder
-
-GARDEN_DATASET_NAME = "emissions_weighted_carbon_price"
-GRAPHER_DATASET_TITLE = "Emissions-weighted carbon price (2022)"
-N = PathFinder(str(CURRENT_DIR / GARDEN_DATASET_NAME))
-
-
-def run(dest_dir: str) -> None:
-    # Create new grapher dataset with the metadata from the garden dataset.
-    dataset = catalog.Dataset.create_empty(dest_dir, N.garden_dataset.metadata)
-    dataset.metadata.title = GRAPHER_DATASET_TITLE
-    dataset.metadata.short_name = GARDEN_DATASET_NAME
-
-    # Load table from garden dataset.
-    table = N.garden_dataset[GARDEN_DATASET_NAME].reset_index()
-    # Add table to new grapher dataset.
-    dataset.add(table)
-    # Save dataset.
-    dataset.save()
diff --git a/etl/steps/archive/grapher/rff/2022-10-11/shared.py b/etl/steps/archive/grapher/rff/2022-10-11/shared.py
deleted file mode 100644
index 13c7cf02081..00000000000
--- a/etl/steps/archive/grapher/rff/2022-10-11/shared.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from pathlib import Path
-
-CURRENT_DIR = Path(__file__).parent
-# Version of current grapher datasets to be created.
-VERSION = str(CURRENT_DIR.name)
-# Version of garden datasets to be imported.
-GARDEN_VERSION = VERSION
diff --git a/etl/steps/archive/grapher/rff/2022-10-11/world_carbon_pricing_any_sector.py b/etl/steps/archive/grapher/rff/2022-10-11/world_carbon_pricing_any_sector.py
deleted file mode 100644
index 2bfeea51860..00000000000
--- a/etl/steps/archive/grapher/rff/2022-10-11/world_carbon_pricing_any_sector.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from owid import catalog
-from shared import GARDEN_VERSION, VERSION
-
-from etl.helpers import PathFinder
-from etl.paths import DATA_DIR, STEP_DIR
-
-N = PathFinder(__file__)
-
-# Details of input garden dataset and table.
-GARDEN_DATASET_NAME = "world_carbon_pricing"
-GARDEN_DATASET_PATH = DATA_DIR / "garden" / "rff" / GARDEN_VERSION / GARDEN_DATASET_NAME
-GARDEN_TABLE_NAME = "world_carbon_pricing_any_sector"
-# Details of output grapher dataset and table.
-GRAPHER_DATASET_TITLE = "World carbon pricing for any sector (2022)"
-GRAPHER_DATASET_NAME = GARDEN_TABLE_NAME
-GRAPHER_METADATA_PATH = STEP_DIR / "grapher" / "rff" / VERSION / f"{GRAPHER_DATASET_NAME}.meta.yml"
-
-
-def run(dest_dir: str) -> None:
-    #
-    # Load data.
-    #
-    # Read dataset from garden.
-    ds_garden = catalog.Dataset(GARDEN_DATASET_PATH)
-    # Get table from dataset.
-    tb_garden = ds_garden[GARDEN_TABLE_NAME]
-
-    #
-    # Save outputs.
-    #
-    # Prepare metadata for new grapher dataset.
-    grapher_metadata = ds_garden.metadata
-    grapher_metadata.title = GRAPHER_DATASET_TITLE
-    # Create new grapher dataset.
-    ds_grapher = catalog.Dataset.create_empty(dest_dir, grapher_metadata)
-    ds_grapher.metadata.short_name = N.short_name
-    # Add table to new dataset.
-    ds_grapher.add(tb_garden)
-    # Save dataset.
-    ds_grapher.save()
diff --git a/etl/steps/archive/grapher/un/2022-07-07/un_sdg.py b/etl/steps/archive/grapher/un/2022-07-07/un_sdg.py
deleted file mode 100644
index 4f6f8decf9b..00000000000
--- a/etl/steps/archive/grapher/un/2022-07-07/un_sdg.py
+++ /dev/null
@@ -1,202 +0,0 @@
-import json
-import os
-from functools import cache
-from pathlib import Path
-from typing import Any, Dict, cast
-
-import pandas as pd
-import requests
-from owid import catalog
-from owid.catalog import Dataset, Source, Table, VariableMeta
-from owid.catalog.utils import underscore
-from owid.walden import Catalog
-from owid.walden import Dataset as WaldenDataset
-from structlog import get_logger
-
-from etl import grapher_helpers as gh
-from etl.paths import DATA_DIR
-
-log = get_logger()
-
-CURRENT_DIR = Path(__file__).parent
-
-BASE_URL = "https://unstats.un.org/sdgapi"
-VERSION = Path(__file__).parent.stem
-FNAME = Path(__file__).stem
-NAMESPACE = Path(__file__).parent.parent.stem
-
-
-def run(dest_dir: str) -> None:
-    garden_dataset = catalog.Dataset(DATA_DIR / f"garden/{NAMESPACE}/{VERSION}/{FNAME}")
-    dataset = catalog.Dataset.create_empty(dest_dir, garden_dataset.metadata)
-    dataset.save()
-
-    # add tables to dataset
-    clean_source_map = load_clean_source_mapping()
-    # NOTE: we renamed namespace from un_sdg to un, but we still use old walden
-    walden_ds = Catalog().find_one(namespace="un_sdg", short_name=FNAME, version=VERSION)
-    ds_garden = Dataset((DATA_DIR / f"garden/{NAMESPACE}/{VERSION}/{FNAME}").as_posix())
-    sdg_tables = ds_garden.table_names
-    for var in sdg_tables:
-        var_df = create_dataframe_with_variable_name(ds_garden, var)
-        var_df["source"] = clean_source_name(var_df["source"], clean_source_map)
-
-        var_gr = var_df.groupby("variable_name")
-
-        for var_name, df_var in var_gr:
-            df_tab = add_metadata_and_prepare_for_grapher(df_var, walden_ds)
-
-            # NOTE: long format is quite inefficient, we're creating a table for every variable
-            # converting it to wide format would be too sparse, but we could move dimensions from
-            # variable names to proper dimensions
-            # currently we generate ~10000 files with total size 73MB (grapher step runs in 692s
-            # and both reindex and publishing is fast, so this is not a real bottleneck besides
-            # polluting `grapher` channel in our catalog)
-            # see https://github.com/owid/etl/issues/447
-            for wide_table in gh.long_to_wide_tables(df_tab):
-                # table is generated for every column, use it as a table name
-                # shorten it to under 255 characters as this is the limit for file name
-                wide_table.metadata.short_name = wide_table.columns[0][:245]
-                dataset.add(wide_table)
-
-
-def clean_source_name(raw_source: pd.Series, clean_source_map: Dict[str, str]) -> str:
-    if
len(raw_source.drop_duplicates()) > 1: - clean_source = "Data from multiple sources compiled by the UN" - else: - source_name = raw_source.drop_duplicates().iloc[0] - assert source_name in clean_source_map, f"{source_name} not in un_sdg.sources.json - please add" - clean_source = clean_source_map[source_name] - - return clean_source - - -def add_metadata_and_prepare_for_grapher(df_gr: pd.DataFrame, walden_ds: WaldenDataset) -> Table: - indicator = df_gr["variable_name"].iloc[0].split("-")[0].strip() - source_url = get_metadata_link(indicator) - log.info( - "Getting the metadata url...", - url=source_url, - indicator=indicator, - var_name=df_gr["variable_name"].iloc[0], - ) - source = Source( - name=df_gr["source"].iloc[0], - url=walden_ds.metadata["url"], - source_data_url=walden_ds.metadata.get("source_data_url"), - owid_data_url=walden_ds.metadata["owid_data_url"], - date_accessed=walden_ds.metadata["date_accessed"], - publication_date=walden_ds.metadata["publication_date"], - publication_year=walden_ds.metadata["publication_year"], - published_by=walden_ds.metadata["name"], - ) - - df_gr["meta"] = VariableMeta( - title=df_gr["variable_name_meta"].iloc[0], - description=df_gr["seriesdescription"].iloc[0] + "\n\nFurther information available at: %s" % (source_url), - sources=[source], - unit=df_gr["long_unit"].iloc[0], - short_unit=df_gr["short_unit"].iloc[0], - additional_info=None, - ) - # Taking only the first 255 characters of the var name as this is the limit (there is at least one that is too long) - df_gr["variable"] = underscore(df_gr["variable_name"].iloc[0][0:254]) - - df_gr = df_gr[["country", "year", "value", "variable", "meta"]].copy() - # convert integer values to int but round float to 2 decimal places, string remain as string - df_gr["value"] = df_gr["value"].apply(value_convert) - df_gr = df_gr.set_index(["year", "country"]) - - return Table(df_gr) - - -def create_dataframe_with_variable_name(dataset: Dataset, tab: str) -> pd.DataFrame: - cols_keep = [ - "country", - "year", - "seriescode", - "seriesdescription", - "variable_name", - "variable_name_meta", - "value", - "source", - "long_unit", - "short_unit", - ] - - tab_df = pd.DataFrame(dataset[tab]).reset_index() - cols_meta = ["indicator", "seriesdescription", "seriescode"] - cols = ["indicator", "seriescode"] - if tab_df.shape[1] > 11: - col_list = sorted(tab_df.columns.to_list()) - drop_cols = [ - "country", - "year", - "goal", - "target", - "indicator", - "seriescode", - "seriesdescription", - "value", - "source", - "long_unit", - "short_unit", - "variable_name", - ] - dim_cols = [x for x in col_list if x not in drop_cols] - cols_meta_dim = cols_meta + dim_cols - cols_dim = cols + dim_cols - tab_df["variable_name_meta"] = tab_df[cols_meta_dim].agg(" - ".join, axis=1) - tab_df["variable_name"] = tab_df[cols_dim].agg(" - ".join, axis=1) - tab_df = tab_df[cols_keep] - tab_df["seriescode"] = tab_df["seriescode"].str.lower() - else: - tab_df["variable_name_meta"] = tab_df[cols_meta].agg(" - ".join, axis=1) - tab_df["variable_name"] = tab_df[cols].agg(" - ".join, axis=1) - tab_df = tab_df[cols_keep] - tab_df["seriescode"] = tab_df["seriescode"].str.lower() - - return tab_df - - -def load_clean_source_mapping() -> Dict[str, str]: - with open(CURRENT_DIR / "un_sdg.sources.json", "r") as f: - sources = json.load(f) - return cast(Dict[str, str], sources) - - -@cache -def get_metadata_link(indicator: str) -> str: - url = os.path.join("https://unstats.un.org/sdgs/metadata/files/", "Metadata-%s.pdf") % "-".join( - [part.rjust(2, 
"0") for part in indicator.split(".")] - ) - r = requests.head(url) - ctype = r.headers["Content-Type"] - if ctype == "application/pdf": - url_out = url - elif ctype == "text/html": - url_a = os.path.join("https://unstats.un.org/sdgs/metadata/files/", "Metadata-%sa.pdf") % "-".join( - [part.rjust(2, "0") for part in indicator.split(".")] - ) - url_b = os.path.join("https://unstats.un.org/sdgs/metadata/files/", "Metadata-%sb.pdf") % "-".join( - [part.rjust(2, "0") for part in indicator.split(".")] - ) - url_out = url_a + " and " + url_b - url_check = requests.head(url_a) - ctype_a = url_check.headers["Content-Type"] - assert ctype_a == "application/pdf", url_a + "does not link to a pdf" - else: - raise NotImplementedError() - - return url_out - - -def value_convert(value: Any) -> Any: - if isinstance(value, float) or isinstance(value, int): - if int(value) == value: - return int(value) - if float(value) == value: - value = round(value, 2) - return value - else: - return value diff --git a/etl/steps/archive/grapher/un/2022-07-07/un_sdg.sources.json b/etl/steps/archive/grapher/un/2022-07-07/un_sdg.sources.json deleted file mode 100644 index 4dc59d01070..00000000000 --- a/etl/steps/archive/grapher/un/2022-07-07/un_sdg.sources.json +++ /dev/null @@ -1,293 +0,0 @@ -{ - "ILO estimates based on information collected directly from national governments.": "International Labour Organisation", - "10YFP / WESR": "UN Statistics Division", - "10YFP / WESR ": "UN Statistics Division", - "2020 Partner Report on Support to Statistics (PRESS)": "UN Statistics Division", - "All Surveys and studies that are population-based, representative at the national or sub-national level, conducted between 2000 and 2018 and measured IPV using act-specific questions.": "Data from multiple sources compiled by the UN", - "Based on serosurveys at city, subnational or national level. World Health Organisation (WHO).": "World Health Organization", - "Based on the data from the Global Health Workforce Statistics database, Global Health Observatory, WHO .; Global Health Workforce Statistics database, Global Health Observatory, WHO . Available at http://www.who.int/hrh/statistics/hwfstats/en/. For the specific sources and metadata by country, refer to database directly.": "World Health Organization", - "Based on the data from the Global Health Workforce Statistics database, Global Health Observatory, WHO .; Global Health Workforce Statistics database, Global Health Observatory, WHO . Available at http://www.who.int/hrh/statistics/hwfstats/en/. For the specific sources and metadata by country, refer to database directly.": "World Health Organization", - "Biennial Reports by Annex I Parties to the Convention": "Data from multiple sources compiled by the UN", - "BirdLife International and IUCN (2021), based on global estimates of the extinction risk (IUCN Red List categories) of all mammals, birds, amphibians, corals and cycads, derived from local and national data, disaggregated to the national scale and weighted by the proportion of each species's distribution in the country or region.": "Birdlife International and International Union for Conservation of Nature", - "BirdLife International, IUCN and UNEP-WCMC (2020). 
Based on spatial overlap between polygons for Key Biodiversity Areas from the World Database of Key Biodiveristy Areas (www.keybiodiversityareas.org) and polygons for protected areas from the World Database on Protected Areas and (where available) for Other Effective area-based Conservation Measures and from the World Database on OECMs (www.protectedplanet.net)": "BirdLife International, International Union for Conservation of Nature and United Nations Environment Programme", - "CHM / WESR": "Convention on Biological Diversity and World Environment Situation Room", - "CHM / WESR ": "Convention on Biological Diversity and World Environment Situation Room", - "Country reports on the implementation of the UNESCO 1974 Recommendation concerning Education for International Understanding, Co-operation and Peace and Education relating to Human Rights and Fundamental Freedoms": "Data from multiple sources compiled by the UN", - "DAC Statistics database, 2021, The Organisation for Economic Co-operation and Development (OECD) and IRENA Public Finance Database, 2021, The international Renewable Energy Agency (IRENA)": "OECD and International Renewable Energy Agency", - "DAD-IS 3.2.2021 http://www.fao.org/dad-is/": "Food and Agriculture Organization of the United Nations", - "DAD-IS 7.9.2021 http://www.fao.org/dad-is/": "Food and Agriculture Organization of the United Nations", - "DAD-IS 9.7.2021 http://www.fao.org/dad-is/": "Food and Agriculture Organization of the United Nations", - "Data are derived from country reporting on 13 binary questions on questionnaire, collected through the Global Partneship for Effective Development Cooperation": "Data from multiple sources compiled by the UN", - "Data based on the World Telecommunication/ICT Indicators Database, 2020, International Telecommunication Union (ITU)": "International Telecommunication Union", - "Data from multiple sources compiled by United Nations": "Data from multiple sources compiled by the UN", - "Data gathered as a result of a survey": "Data from multiple sources compiled by the UN", - "Data gathered from the latest version of NBSAP": "National Biodiversity Strategies and Action Plans", - "Data gathered through a literature search (primary sources ECOLEX, FAOLEX) and consultation with country experts; Data gathered from the latest version of NBSAP": "Data from multiple sources compiled by the UN", - "Debt service is the sum of principle repayments and interest actually paid in currency, goods, or services. This series differs from the standard debt to exports series. It covers only long-term public and publicly guaranteed debt and repayments (repurchases and charges) to the IMF. 
Exports of goods and services include primary income, but do not include workers' remittances.": "International Monetary Fund (IMF)", - "Demographic and Social Statistics Branch, United Nations Statistics Division (UNSD), Department of Economic and Social Affairs, United Nations": "UN Department of Economic and Social Affairs", - "Demographic and Social Statistics Branch, United Nations Statistics Division (UNSD), Department of Economic and Social Affairs, United Nations.": "UN Department of Economic and Social Affairs", - "Division for Ocean Affairs and the Law of the Sea, Office of Legal Affairs, United Nations Secretariat": "UN Office of Legal Affairs", - "Environment Live": "UN Office of Legal Affairs", - "Environment Live / Global Material Flows Database": "UN Office of Legal Affairs", - "FAO": "Food and Agriculture Organization of the United Nations", - "FAO questionnaire on the implementation of the Code of Conduct for Responsible Fisheries - Country self-reporting": "Food and Agriculture Organization of the United Nations", - "FAO questionnaire on the implementation of the Code of Conduct for Responsible Fisheries - Country self-reporting ": "Food and Agriculture Organization of the United Nations", - "FAO, Global Forest Resources Assessment": "Food and Agriculture Organization of the United Nations", - "FAO, Global Forest Resources Assessment and FAOSTAT": "Food and Agriculture Organization of the United Nations", - "FAO, Statistics Division": "Food and Agriculture Organization of the United Nations", - "FAOSTAT Data/ Consumer Price Indices: http://www.fao.org/faostat/en/#data/CP.": "Food and Agriculture Organization of the United Nations", - "Food and Agriculture Organisation of the United Nations (FAO), with data collected through the Gallup World Poll.; National Entity": "Food and Agriculture Organization of the United Nations", - "Food and Agriculture Organisation of United Nations (FAO)": "Food and Agriculture Organization of the United Nations", - "Food and Agriculture Organization of the United Nations (FAO)": "Food and Agriculture Organization of the United Nations", - "Food and Agriculture Organization of the United Nations (FAO); National Entity": "Food and Agriculture Organization of the United Nations", - "Food Price Monitoring and Analysis (FPMA) online database: https://fpma.apps.fao.org/giews/food-prices/tool/public/#/dataset/domestic.": "Food and Agriculture Organization of the United Nations", - "Food Waste Index Report 2021 / WESR": "UN Environment Programme", - "Global Financial Inclusion Database, World Bank.": "World Bank", - "Global Health Estimates 2019: Deaths by Cause, Age, Sex, by Country and by Region, 2000-2019. Geneva, World Health Organization, 2020": "World Health Organization", - "Global Health Observatory (GHO), World Health Organisation (WHO)": "World Health Organization", - "Global Health Observatory (GHO), World Health Organisation (WHO).": "World Health Organization", - "Global Health Workforce Statistics database, Global Health Observatory, WHO . Available at http://www.who.int/hrh/statistics/hwfstats/en/. For the specific sources and metadata by country, refer to database directly.": "World Health Organization", - "Global Health Workforce Statistics database, Global Health Observatory, WHO . Available at http://www.who.int/hrh/statistics/hwfstats/en/. 
For the specific sources and metadata by country, refer to database directly.": "World Health Organization", - "Global Model for Monitoring Marine Litter": "Centre for Ocean-Atmospheric Prediction Studies", - "Global Surface Water Explorer extraction for UN Environment": "UN Environment Programme", - "GLOTIP Database": "UN Office on Drugs and Crime", - "https://absch.cbd.int/countries": "Convention on Biological Diversity", - "https://investmentpolicy.unctad.org/international-investment-agreements#": "UN Conference on Trade and Development", - "https://www.cbd.int/abs/nagoya-protocol/signatories/default.shtml, Access and Benefit-sharing Clearing-house https://absch.cbd.int/, http://www.fao.org/plant-treaty/countries/membership/en/, The Online Reporting System on Compliance of the International Treaty on PGRFA http://faoitpgrfa.ort-production.linode.unep-wcmc.org and Easy-SMTA\u00a0https://mls.planttreaty.org ,": "Data from multiple sources compiled by the UN", - "https://www.cbd.int/abs/nagoya-protocol/signatories/default.shtml, Access and Benefit-sharing Clearing-house https://absch.cbd.int/, http://www.fao.org/plant-treaty/countries/membership/en/, The Online Reporting System on Compliance of the International Treaty on PGRFA http://faoitpgrfa.ort-production.linode.unep-wcmc.org and Easy-SMTA https://mls.planttreaty.org ,": "Data from multiple sources compiled by the UN", - "ICAO; the International Transport Forum at the OECD (ITF-OECD)": "International Civil Aviation Organisation and Organization for Economic Co-operation and Development via United Nations Global SDG Database", - "IEA (2020), CO2 Emissions from fuel combustion. https://www.iea.org/statistics": "International Energy Agency via United Nations Global SDG Database", - "IEA fossil fuel subsidies database, OECD.Stat- Inventory of Support Measures for Fossil Fuels and IMF Energy Subsidies Template (pre-tax calculations)": "International Energy Agency, Organisation for Economic Co-operation and Development and International Monetary Fund via United Nations Global SDG Database", - "ILO calculations based on information from ILO textual sources.": "International Labour Organization (ILO)", - " ILO estimates based on information collected directly from national governments. ": "International Labour Organization (ILO)", - "ILO estimates based on country data compled through the ILO Social Security Inquiry (SSI); ILO estimates based on country data compled through the ILO Social Security Inquiry (SSI). Based on information from federal programmes only.": "International Labour Organization (ILO)", - "ILO estimates based on country data compled through the ILO Social Security Inquiry (SSI); ILO estimates based on country data compled through the ILO Social Security Inquiry (SSI). Based on information from federal programmes only.": "International Labour Organization (ILO)", - "ILO modelled estimates, November 2019, available in ILOSTAT (https://ilostat.ilo.org/). For the specific sources by country and the estimation methodology refer to ILOSTAT directly.": "International Labour Organization (ILO)", - "ILO modelled estimates, November 2020, available in ILOSTAT (https://ilostat.ilo.org/). 
For the specific sources by country and the estimation methodology refer to ILOSTAT directly.": "International Labour Organization (ILO)", - "IMF": "International Monetary Fund (IMF)", - "International Monetary Fund, Financial Soundness Indicators.": "International Monetary Fund (IMF)", - "International Monetary Fund, International Financial Statistics and data files.": "International Monetary Fund (IMF)", - "International Monetary Fund, Balance of Payments Statistics Yearbook and data files.": "International Monetary Fund (IMF)", - "International Monetary Fund, Balance of Payments Statistics Yearbook and data files, and World Bank and OECD GDP estimates.": "International Monetary Fund (IMF)", - "International Monetary Fund, Government Finance Statistics Yearbook and data files, and World Bank and OECD GDP estimates.": "International Monetary Fund (IMF), World Bank and Organisation for Economic Co-operation and Development", - "International Monetary Fund, International Financial Statistics and Balance of Payments databases, World Bank, International Debt Statistics, and World Bank and OECD GDP estimates.": "International Monetary Fund (IMF), World Bank and Organisation for Economic Co-operation and Development", - "Inter-Parliamentary Union (IPU), Parline Database": "Inter-Parliamentary Union via United Nations Global SDG Database", - "Intergovernmental Oceanographic Commission (IOC) of UNESCO": "UNESCO", - "International Trade Centre (ITC), United Nations Conference on Trade and Development (UNCTAD) and World Trade Organisation (WTO).": "UN Conference on Trade and Development and World Trade Organisation", - "International Union for Conservation of Nature (IUCN).": "International Union for Conservation of Nature", - "IOM Missing Migrant Project": "International Organization for Migration", - "ITC/UNCTAD/WTO database": "UN Conference on Trade and Development and World Trade Organisation", - "ITPGRFA Secretariat": "International Treaty on Plant Genetic Resources for Food and Agriculture Secretariat", - "ITPGRFA Secretariat ": "International Treaty on Plant Genetic Resources for Food and Agriculture Secretariat", - "IWRM Data Portal, UNEP": "UN Environment Programme", - "Key indicators of the UN-Water Global Analysis and Assessment of Sanitation and Drinking-Water (GLAAS). For the specific sources by country, refer to http://www.who.int/water_sanitation_health/monitoring/investments/glaas/en/.": "UN Water", - "NUP 2020 Status": "United Nations Human Settlement Programme", - "OECD and UNDP.": "OECD and United Nations Development Programme", - "Official country reported data (UNSD and OECD national accounts). FAO Yearbook of Fishery Statistics and Review of the State of World Marine Fishery Resources.; Official country reported data (UNSD and OECD national accounts). FAO Yearbook of Fishery Statistics. 
SDG 14.4.1": "Food and Agriculture Organization of the United Nations", - "Organisation for Economic Co-operation and Development (OECD) and United Nations Development Programme (UNDP).": "OECD and UN Development Programme", - "PARIS21 SDG Survey": "PARIS21 SDG Survey via United Nations Global SDG Database", - "Plastic Pollution (MLW, MDMAP, ICC)": "Data from multiple sources compiled by the UN", - "PPI DB": "World Bank", - "Protected Planet: The World Database on Protected Areas (WDPA) [On-line], December, 2018, International Union for Conservation of Nature (IUCN) and United Nations Environment Programme's World Conservation Monitoring Centre (UNEP-WCMC).": "United Nations Statistics Division", - "Public Expenditure and Financial Accountability\u00a0(PEFA). Ministry of Finance (MoF).": "Public Expenditure and Financial Accountability program via United Nations Global SDG Database", - "Public Expenditure and Financial Accountability (PEFA). Ministry of Finance (MoF).": "Public Expenditure and Financial Accountability program via United Nations Global SDG Database", - "Questionnaire on the implementation of the Code of Conduct for Responsible Fisheries - Country self-reporting": "Data from multiple sources compiled by the UN", - "Questionnaire on the implementation of the Code of Conduct for Responsible Fisheries - Country self-reporting ": "Data from multiple sources compiled by the UN", - "Ramsar National Report COP14": "Ramsar National Report COP14", - "Remittance Prices Worldwide database, World Bank.": "World Bank", - "Renewable electricity generating capacity from IRENA's electricity capacity database. Population data from the United Nations World Population Prospects.": "International Renewable Energy Agency and United Nations World Population Prospects", - "Sendai Framework Monitoring System as provided by designated national focal points (2021)": "UN Office for Disaster Risk Reduction", - "United Nations Office for Disaster Risk Reduction (2022)": "UN Office for Disaster Risk Reduction", - "Source: Joint Child Malnutrition Estimates (2021 Edition), United Nations Children's Fund (UNICEF), World Health Organisation (WHO) and the World Bank Group.": "UNICEF, World Health Organization and World Bank", - "The Office of the United Nations High Commissioner for Human Rights (OHCHR) in collaboration with Global Alliance of National Human Rights Institutions (GANHRI).": "United Nations Statistics Division", - "Source: The Office of the United Nations High Commissioner for Human Rights (OHCHR) in collaboration with Global Alliance of National Human Rights Institutions (GANHRI).": "United Nations Statistics Division", - "Source: Values aggregated by custodian agencies; ; Source: Relevant country authority; ": "Data from multiple sources compiled by the UN", - "UNCTAD Review of Maritime Transport, 2020 and UNCTADstat ( https://unctadstat.unctad.org/wds/TableViewer/tableView.aspx).": "UN Conference on Trade and Development", - "The figures include only direct conflict-related deaths of civilians recorded by the United Nations. Office of the High Commissioner for Human Rights (OHCHR)": "Office of the High Commissioner for Human Rights", - "The figures include only direct conflict-related deaths of civilians recorded by the United Nations. Office of the High Commissioner for Human Rights (OHCHR) ": "Office of the High Commissioner for Human Rights", - "The figures included here concern only verified cases of killings of human rights defenders, journalists and trade unionists. 
OHCHR does not yet collect data on the number of cases of kidnapping, arbitrary detention or torture of human rights defenders, j": "Office of the High Commissioner for Human Rights", - "The figures included here concern only verified cases of enforced disappearance of human rights defenders, journalists and trade unionists. OHCHR does not yet collect data on the number of cases of kidnapping, arbitrary detention or torture of human rights defenders, journalists and trade unionists.": "Office of the High Commissioner for Human Rights", - "The figures included here concern only verified cases of enforced disappearance of human rights defenders, journalists and trade unionists. OHCHR does not yet collect data on the number of cases of kidnapping, arbitrary detention or torture of human right": "Office of the High Commissioner for Human Rights", - "the International Transport Forum at the OECD (ITF-OECD); ICAO": "International Civil Aviation Organisation and Organization for Economic Co-operation and Development via United Nations Global SDG Database", - "The figures included here concern only verified cases of killings of human rights defenders, journalists and trade unionists. OHCHR does not yet collect data on the number of cases of kidnapping, arbitrary detention or torture of human rights defenders, journalists and trade unionists.": "Office of the High Commissioner for Human Rights", - "The Organisation for Economic Co-operation and Development (OECD).": "Organisation for Economic Co-operation and Development", - "The source of non-seasonally adjusted Gross Domestic Product (GDP) data in national currency, at current prices, is the International Finance Statistics quarterly database of the IMF, annualized by the World Bank, unless otherwise specified.": "International Monetary Fund (IMF) and World Bank", - "Tracking universal health coverage: 2019 Global Monitoring Report. Geneva, WHO 2019. http://www.who.int/healthinfo/universal_health_coverage/report/2019/en/": "World Health Organization", - "Trends in maternal mortality: 2000 to 2017: estimates by WHO, UNICEF, UNFPA, World Bank Group and the United Nations Population Division. Geneva: World Health Organization; 2019": "World Health Organization, UNICEF, United Nations Population Fund, World Bank and UN Population Division", - "Trends in maternal mortality: 2000 to 2017: estimates by WHO, UNICEF, UNFPA, World Bank Group and the United Nations Population Division. Geneva: World Health Organization; 2019 ": "World Health Organization, UNICEF, United Nations Population Fund, World Bank and UN Population Division", - "UN Women": "UN Women", - "UNAIDS": "Joint UN Programme on HIV and AIDS", - "UNCTAD Global AI Data Source": "UN Conference on Trade and Development", - "UNCTAD Global AI Data Source ": "UN Conference on Trade and Development", - "UNCTAD, FDI/MNE database (www.unctad.org/fdistatistics).": "UN Conference on Trade and Development", - "Source: UNCTAD Review of Maritime Transport, 2020 and UNCTADstat ( https://unctadstat.unctad.org/wds/TableViewer/tableView.aspx).": "UN Conference on Trade and Development", - "UNCTADstat. Data compiled by UNCTAD and WTO.": "UN Conference on Trade and Development and World Trade Organization", - "UNESCO Institute for Statistics. Data extracted on 13 April 2021.": "UNESCO", - "UNESCO Institute for Statistics. 
Data extracted on 13 April 2021.": "UNESCO", - "UNESCO World Trends in Freedom of Expression and Media Development (raw research records); Global Right to Information Rating (Access Info & Center for Law and Democracy) http://www.rti-rating.org/by-section/; Freedominfo; Article19": "Data from multiple sources compiled by the UN", - "UNESCO World Trends in Freedom of Expression and Media Development (raw research records); Global Right to Information Rating (Access Info & Center for Law and Democracy) http://www.rti-rating.org/by-section/; Freedominfo; Article19": "Data from multiple sources compiled by the UN", - "UNFCCC": "UN Framework Convention on Climate Change", - "UNFCCC based on adaptation communications - https://unfccc.int/topics/adaptation-and-resilience/workstreams/adaptation-communications": "UN Framework Convention on Climate Change", - "UNFCCC based on biennial update reports from non-Annex I Parties - https://unfccc.int/BURs": "UN Framework Convention on Climate Change", - "UNFCCC based on fourth biennial reports from Annex I Parties - https://unfccc.int/BRs": "UN Framework Convention on Climate Change", - "UNFCCC based on national adaptation plans - https://www4.unfccc.int/sites/NAPC/News/Pages/national_adaptation_plans.aspx": "UN Framework Convention on Climate Change", - "UNFCCC based on national communications from non-Annex I Parties - https://unfccc.int/non-annex-I-NCs": "UN Framework Convention on Climate Change", - "UNFCCC based on nationally determined contributions - https://www4.unfccc.int/sites/NDCStaging/Pages/All.aspx": "UN Framework Convention on Climate Change", - "UNFCCC based on seventh national communications from Annex I Parties - https://unfccc.int/NC7": "UN Framework Convention on Climate Change", - "UNHCR based on National data on refugee populations, available at UNHCR refugee statistics (https://www.unhcr.org/refugee-statistics/), and National population estimates, available in the World Population Prospects (https://population.un.org/wpp/).": "UN High Commissioner for Refugees", - "UNICEF and ILO calculations": "UNICEF and International Labour Organization (ILO)", - "UNIDO estimates based on the CIP 2020 database. Available at https://stat.unido.org; UNIDO CIP 2020 Database. Available at https://stat.unido.org": "UN Industrial Development Organization", - "UNIDO MVA 2021 Database. Available at https://stat.unido.org": "UN Industrial Development Organization", - "UNIDO MVA 2021 Database. Available at https://stat.unido.org. IEA (2020), CO2 Emissions from Fuel Combustion. https://www.iea.org/statistics": "UN Industrial Development Organization and International Energy Agency", - "United Nation Inquiry among Governments on Population and Development (the \u201cInquiry\u201d), available at https://esa.un.org/PopPolicy/Inquiry.aspx. 
The Inquiry was sent to 197 countries, including all 193 Member States, two Observer States (the Holy See and the State of Palestine) and two non-member States (Cook Islands and Niue) of the United Nations.": "UN Department of Economic and Social Affairs", - "United Nations Human Settlements Programme (UN-HABITAT)": "UN Human Settlements Programme", - "United Nations Human Settlements Programme (UN-Habitat)": "UN Human Settlements Programme", - "United Nations Inter-agency Group for Child Mortality Estimation (UN IGME), 2020.": "UN Inter-agency Group for Child Mortality Estimation", - "United Nations Office for Disaster Risk Reduction (2021)": "UN Office for Disaster Risk Reduction", - "United Nations Office for Disaster Risk Reduction (UNDRR) as per the Sendai Framework Monitor.": "UN Office for Disaster Risk Reduction", - "United Nations Office for Disaster Risk Reduction (UNDRR) as per the Sendai Framework Monitor. \n ": "UN Office for Disaster Risk Reduction", - "United Nations Office on Drugs and Crime": "UN Office on Drugs and Crime", - "United Nations Population Fund, global databases, 2020. Based on official responses to the United Nations 12th Inquiry among Governments on Population and Development.": "Data from multiple sources compiled by the UN", - "United Nations Population Fund, global databases, 2020. Based on official responses to the United Nations 12th Inquiry among Governments on Population and Development. ": "Data from multiple sources compiled by the UN", - "United Nations, Department of Economic and Social Affairs, Statistics Division (AMA)": "UN Department of Economic and Social Affairs", - "UNODC - IAFQ": "UN Office on Drugs and Crime", - "UNSD / UNU / WESR": "UN Statistics Division", - "UNSD national account estimates": "UN Statistics Division", - "UNSTATS / WESR": "UN Statistics Division", - "UNSTATS / WESR ": "UN Statistics Division", - "UNSTATS / WESR; UNSTATS / WESR": "UN Statistics Division", - "UNSTATS / WESR; UNSTATS / WESR ": "UN Statistics Division", - "Values aggregated by custodian agencies; Source: Relevant country authority; ": "Data from multiple sources compiled by the UN", - "WDI": "World Bank", - "WHO": "World Health Organization", - "WHO 2020": "World Health Organization", - "WHO and UNICEF HPV coverage estimates": "World Health Organization and UNICEF", - "WHO Global Health Observatory": "World Health Organization", - "WHO Global Health Observatory (https://www.who.int/data/gho/)": "World Health Organization", - "WHO Global Information System on Alcohol and Health (GISAH)": "World Health Organization", - "WHO Global Tuberculosis Report 2020 (https://www.who.int/teams/global-tuberculosis-programme/data)": "World Health Organization", - "WHO-GLASS; AMR Surveillance National Coordinating Center": "World Health Organization and Antimicrobial Resistance Surveillance National Coordinating Centers", - "WHO-GLASS; AMR Surveillance National Coordinating Center ": "World Health Organization and Antimicrobial Resistance Surveillance National Coordinating Centers", - "WHO/UNICEF coverage estimates 2018 revision, July 2019": "World Health Organization and UNICEF", - "WHO/UNICEF coverage estimates 2019 revision, July 2020": "World Health Organization and UNICEF", - "WHO/UNICEF Joint Monitoring Programme for Water Supply, Sanitation and Hygiene (2021)": "World Health Organization and UNICEF", - "WMR": "International Organization for Migration", - "World Bank": "World Bank", - "World Bank (collection agencies specified in footnote)": "World Bank", - "World Bank 
Enterprise Surveys 2020": "World Bank", - "World Bank staff estimates based on IMF balance of payments data, and World Bank and OECD GDP estimates.": "World Bank", - "World Bank, Development Research Group. Data are based on primary household survey data obtained from government statistical agencies and World Bank country departments. Data for high-income economies are from the Luxembourg Income Study database. For more information and methodology, please see PovcalNet (http://iresearch.worldbank.org/PovcalNet/index.htm).": "World Bank", - "World Bank national accounts data, and OECD National Accounts data files.": "World Bank", - "World Bank, International Debt Statistics.": "World Bank", - "World Bank, Private Participation in Infrastructure (PPI) Database.": "World Bank", - "World Development Indicators database, World Bank": "World Bank", - "World Development Indicators database, World Bank.": "World Bank", - "World Development Indicators\u00a0(WDI)\u00a0": "World Bank", - "World Development Indicators (WDI) ": "World Bank", - "World Environment Situation Room": "UN Statistics Division", - "Regional Seas / World Environment Situation Room": "United Nations Environment Programme", - "World Health Organization (WHO) Department of the Prevention of Noncommunicable Diseases; Secretariat of the WHO Framework Convention on Tobacco Control": "World Health Organization", - "World Health Organization (WHO) Department of the Prevention of Noncommunicable Diseases;\u00a0 Secretariat of the WHO Framework Convention on Tobacco Control": "World Health Organization", - "World Trade Organization": "World Trade Organization", - "World Trade Organization, and World Bank GDP estimates.": "World Trade Organization and World Bank", - "BirdLife International, IUCN and UNEP-WCMC (2021). Based on spatial overlap between polygons for Key Biodiversity Areas from the World Database of Key Biodiveristy Areas (www.keybiodiversityareas.org) and polygons for protected areas from the World Database on Protected Areas and (where available) for Other Effective area-based Conservation Measures and from the World Database on OECMs (www.protectedplanet.net)": "BirdLife International, IUCN and UNEP-WCMC", - "UNMMEIG; Trends in maternal mortality: 2000 to 2017: estimates by WHO, UNICEF, UNFPA, World Bank Group and the United Nations Population Division. Geneva: World Health Organization; 2019 ": "UN Maternal Mortality Estimation Interagency Group and World Health Organization", - "United Nations Inter-agency Group for Child Mortality Estimation (UN IGME), 2021.": "UN Inter-agency Group for Child Mortality Estimation", - "WHO Global Tuberculosis Report 2021 (https://www.who.int/teams/global-tuberculosis-programme/data)": "World Health Organization", - "Tracking universal health coverage: 2021 Global Monitoring Report. Geneva, WHO 2021. 
https://www.who.int/data/monitoring-universal-health-coverage.": "World Health Organization", - "WHO/UNICEF coverage estimates 2020 revision, October 2021": "World Health Organization and UNICEF", - "MedMon Pilot – Pilot WHO Essential Medicines and Health Products Price and Availability Monitoring Mobile Application; HAI/WHO – Health Action International/ World Health Organization (joint project for data collection and analysis) ": "Health Action International and World Health Organization", - "Source: UNCTAD Review of Maritime Transport, 2021 and UNCTADstat (http://stats.unctad.org/teu).": "UNCTAD", - "Source: UNCTAD Review of Maritime Transport, 2021 and UNCTADstat (https://unctadstat.unctad.org/wds/TableViewer/tableView.aspx?ReportId=32363).": "UNCTAD", - "UNIDO MVA 2022 Database. Available at https://stat.unido.org": "United Nations Industrial Development Organization", - "World Bank Enterprise Surveys 2022": "World Bank", - "IEA (2021), Greenhouse gas emissions from energy. https://www.iea.org/statistics": "International Energy Agency", - "UNIDO MVA 2022 Database. Available at https://stat.unido.org. IEA (2021), Greenhouse gas emissions from energy. https://www.iea.org/statistics": "United Nations Industrial Development Organization", - "UNIDO estimates based on the CIP 2021 database. Available at https://stat.unido.org; UNIDO CIP 2021 Database. Available at https://stat.unido.org": "United Nations Industrial Development Organization", - "Market Access Map data, International Trade Centre (ITC)": "International Trade Centre", - "International Monetary Fund, International Financial Statistics, supplemented by World Bank staff estimates.": "International Monetary Fund", - "World Bank, Quarterly Public Sector Debt database.": "World Bank", - "Data based on the World Telecommunication/ICT Indicators Database, 2021, International Telecommunication Union (ITU)": "International Telecommunication Union", - "WESR / UN COMTRADE": "UN Statistics Division", - "ILO modelled estimates, November 2021, available in ILOSTAT (https://ilostat.ilo.org/). For the specific sources by country and the estimation methodology refer to ILOSTAT directly.": "International Labour Organization", - "United Nation Inquiry among Governments on Population and Development (the “Inquiry”). The Inquiry was sent to 197 countries, including all 193 Member States, two Observer States, and 2 non-member States.; United Nation Inquiry among Governments on Population and Development (the “Inquiry”). The Inquiry was sent to 197 countries, including 193 Member States, 2 observer States, and 2 non-member States.": "United Nations", - "United Nation Inquiry among Governments on Population and Development (the “Inquiry”). The Inquiry was sent to 197 countries, including all 193 Member States, two Observer States, and 2 non-member States.": "United Nations", - "DAC Statistics database, 2022, The Organisation for Economic Co-operation and Development (OECD) and IRENA Public Finance Database, 2022, The international Renewable Energy Agency (IRENA)": "The Organisation for Economic Co-operation and Development and the International Renewable Energy Agency", - "Global Surface Water Explorer extraction for UN Environment Programme": "UN Environment Programme", - "UNESCO Institute for Statistics (UIS) Survey on expenditure on cultural and natural heritage (Indicator SDG 11.4.1)": "UNESCO", - "Tracking universal health coverage: 2021 Global Monitoring Report. Geneva, WHO 2021. 
https://www.who.int/data/monitoring-universal-health-coverage": "World Health Organization", - "Source = ILO modelled estimates, November 2021, available in ILOSTAT (https://ilostat.ilo.org/). For the specific sources by country and the estimation methodology refer to ILOSTAT directly.": "International Labour Organization", - "Poverty and Inequality Platform, World Bank": "World Bank", - "Source: ILO estimates based on country data compled through the ILO Social Security Inquiry (SSI); Source: ILO estimates based on country data compled through the ILO Social Security Inquiry (SSI). Based on information from federal programmes only.": "International Labour Organization", - "ASPIRE: The Atlas of Social Protection - Indicators of Resilience and Equity, The World Bank. Data are based on national representative household surveys. (datatopics.worldbank.org/aspire/)": "World Bank", - "UNESCO Report on Public Access to Information; Global Right to Information Rating (Access Info & Center for Law and Democracy) https://www.rti-rating.org/country-data/": "UNESCO", - "WESR / Global Material Flows Database": "United Nations Environment Programme", - "DAD-IS 1.2.2022 http://www.fao.org/dad-is/": "Food and Agriculture Organization of the United Nations", - "BirdLife International and IUCN (2022), based on global estimates of the extinction risk (IUCN Red List categories) of all mammals, birds, amphibians, corals and cycads, derived from local and national data, disaggregated to the national scale and weighted by the proportion of each species's distribution in the country or region.": "BirdLife International and IUCN", - "https://www.cbd.int/abs/nagoya-protocol/signatories/default.shtml, Access and Benefit-sharing Clearing-house https://absch.cbd.int/, http://www.fao.org/plant-treaty/countries/membership/en/, The Online Reporting System on Compliance of the International ": "Data from multiple sources compiled by the UN", - "UNSD/UNEP Questionnaire on Environment Statistics": "United Nations Environment Programme and United Nations Statistics Division", - "UNSD / UNITAR / WESR": "United Nations Statistics Division", - "Refinitiv": "Refinitiv", - "Source: ILO estimates based on country data compled through the ILO Social Security Inquiry (SSI); Source: ILO estimates based on country data compled through the ILO Social Security Inquiry (SSI). Based on information from federal programmes only.": "International Labour Organization", - "UN-Habitat Urban Indicators Database": "UN Habitat", - "OECD": "Organisation for Economic Co-operation and Development (OECD)", - "Eurostat": "Eurostat", - "ICAO": "International Civil Aviation Organization", - "the International Transport Forum at the OECD (ITF-OECD)": "Organisation for Economic Co-operation and Development (OECD)", - "Ministry of Social Development": "Ministry of Social Development", - "UNESCO Institute for Statistics. Data extracted on 01 April 2022.": "UNESCO", - "DHS 2016": "Demographic and Health Surveys", - "Trends in International Mathematics and Science Study (TIMSS). Students achieving at least Intermediate International Benchmark. Grade 4 Data extracted on 01 April 2022.": "National Centre for Education Statistics", - "Progress in International Reading Literacy Study (PIRLS). Students achieving at least low International Benchmark. 
Grade 4 Data extracted on 01 April 2022.": "National Centre for Education Statistics", - "Denmark – Register based Labour Force Survey": "Statistics Denmark", - "UN General Assembly website http://www.un.org/en/member-states/index.html": "United Nations", - "Estadísticas Vitales 2011": "The National Institute of Statistics and Census of Costa Rica", - "IHR National Self-Assessment and reports received and registered at WHO e-SPAR database (https://extranet.who.int/e-spar/ ).": "World Health Organization", - "DHS, MICS and other national surveys": "Data from multiple sources compiled by the UN", - "ILO estimates based on country data compled through the ILO Social Security Inquiry (SSI)": "International Labour Organization", - "Source: ILO estimates based on country data compled through the ILO Social Security Inquiry (SSI)": "International Labour Organization", - "Poverty and Inequality Portal, World Bank": "World Bank", - "Calculated by UNSD based on available country data from the Time use survey, 2014/2015 to apply ICATUS.": "United Nations Statistics Division", - "National Statistical Office based on Time use survey 2015": "National Statistical Office", - "Instituto Nacional de Estadísticas Chile based on Time Use Survey 2015": "The National Statistics Institute of Chile", - "Calculated by UNSD based on available country data from the Survey on the use of time and unpaid work 2007 to apply ICATUS.": "United Nations Statistics Division", - "Calculated by UNSD based on available country data from the Ghana Time Use Survey to apply ICATUS.": "United Nations Statistics Division", - "Statistics Sweden based on The Swedish Time-Use Survey 2000": "Statistics Sweden", - "Calculated by UNSD based on available country data from the Integrated Labour Force Survey - Time Use Survey Module to apply ICATUS.": "United Nations Statistics Division", - "Calculated by UNSD based on available country data from the Time Use Survey to apply ICATUS.": "United Nations Statistics Division", - "Calculated by UNSD based on available country data from the 2018 National Time Use Survey to apply ICATUS.": "United Nations Statistics Division", - "Calculated by UNSD based on available country data from the Pilot Time use survey to apply ICATUS.": "United Nations Statistics Division", - "Calculated by UNSD based on available country data from the Malian Survey on Time Use to apply ICATUS.": "United Nations Statistics Division", - "INSSE based on National Time Use Study": "National Statistics Institute Romania", - "Calculated by UNSD based on available country data from the Module on the use of time to apply ICATUS.": "United Nations Statistics Division", - "Statistics Norway based on Time Use Survey": "Statistics Norway", - "Source: LFS - Periodic Labour Force Survey": "Labour Force Survey", - "LFS - Periodic Labour Force Survey": "Labour Force Survey", - "LFS 2017-18, UNICEF and ILO calculations": "Labour Force Survey", - "Survey of Activities of Young People 2015, UNICEF and ILO calculations": "UNICEF and International Labour Organization", - "Demographic and Health Survey": "Demographic and Health Surveys", - "European Commission DG REGIO": "European Commission", - "Planning & Statistics Authority (PSA), Qatar": "Planning & Statistics Authority (PSA), Qatar", - "The figures include only direct conflict-related deaths recorded by the United Nations. 
Office of the High Commissioner for Human Rights (OHCHR)": "Office of the High Commissioner for Human Rights", - "National Criminal Justice Data as Collected through the United Nations Surveys on Crime Trends and the Operations of Criminal Justice Systems (UN-CTS)": "United Nations Office on Drugs and Crime", - "UNODC calculations based on national data": "United Nations Office on Drugs and Crime", - "National Health and Morbidity Survey 2016": "Data from multiple sources compiled by the UN", - "Ensanut 2018-19": "Data from multiple sources compiled by the UN", - "A National Prevalence Study on Exposure to Violence among Women and Men and its Association to Healt": "Data from multiple sources compiled by the UN", - "National Statistical Office - Multiple Indicator Cluster Surveys (MICS6)": "National Statistical Office", - "National Statistical Office - Encuesta de Hogares": "National Statistical Office", - "National Statistical Office - Encuesta de Indicadores Múltiples por Conglomerados (MICS6)": "National Statistical Office", - "National Statistical Office - Survey of Living Conditions": "National Statistical Office", - "National Statistical Office - Governance, Public Safety and Justice Survey": "National Statistical Office", - "National Statistical Office - Encuesta de Mujeres, Niñez y Adolescencia/MICS6": "National Statistical Office", - "National Statistical Office - Survey on Human Rights": "National Statistical Office", - "National Statistical Office - General Social Survey on Victimization": "National Statistical Office", - "National Statistical Office - General Social Survey": "National Statistical Office", - "National Statistical Office - Encuesta Nacional de Hogares (ENAHO)": "National Statistical Office", - "National Statistical Office - Democratic Governance Survey": "National Statistical Office", - "National Statistical Office - Rule of Law and Access to Justice Survey": "National Statistical Office", - "Food Waste Index Report 2021 / WESR": "UN Statistics Division", - "UNIDO National Accounts Database. Available at https://stat.unido.org. IEA (2021), Greenhouse gas emissions from energy. https://www.iea.org/statistics": "UN Industrial Development Organization and International Energy Agency", - "UNIDO National Accounts Database. Available at https://stat.unido.org": "UN Industrial Development Organization", - "World Bank's Poverty and Inequality Platform": "World Bank" -} \ No newline at end of file diff --git a/etl/steps/archive/meadow/cait/2022-08-10/ghg_emissions_by_sector.py b/etl/steps/archive/meadow/cait/2022-08-10/ghg_emissions_by_sector.py deleted file mode 100644 index dc1a36d101b..00000000000 --- a/etl/steps/archive/meadow/cait/2022-08-10/ghg_emissions_by_sector.py +++ /dev/null @@ -1,104 +0,0 @@ -import gzip -import json - -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog -from shared import NAMESPACE, VERSION - -from etl.steps.data.converters import convert_walden_metadata - -DATASET_SHORT_NAME = "ghg_emissions_by_sector" -DATASET_TITLE = "Greenhouse gas emissions by sector" -WALDEN_SHORT_NAME = "cait_ghg_emissions" -WALDEN_VERSION = "2022-08-10" - - -def load_data(local_file: str) -> pd.DataFrame: - """Create a dataframe out of the raw data. - - Parameters - ---------- - local_file : str - Path to local file of raw data. - - Returns - ------- - df : pd.DataFrame - Raw data in dataframe format. 
- - """ - with gzip.open(local_file) as _file: - data = json.loads(_file.read()) - - df = pd.DataFrame.from_dict(data) - - return df - - -def prepare_data(df: pd.DataFrame) -> pd.DataFrame: - """Prepare raw data in a more convenient format. - - Parameters - ---------- - df : pd.DataFrame - Original raw data as a dataframe. - - Returns - ------- - df : pd.DataFrame - Original data in a more convenient format. - - """ - # Extract data from column "emissions", which is given as a list of dictionaries with year and value. - df = df.explode("emissions").reset_index(drop=True) - df["year"] = [emissions["year"] for emissions in df["emissions"]] - df["value"] = [emissions["value"] for emissions in df["emissions"]] - df = df.drop(columns="emissions") - - # Set an appropriate index and sort conveniently. - df = df.set_index(["country", "year", "gas", "sector", "data_source"], verify_integrity=True).sort_index() - - return df - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Retrieve raw data from walden. - walden_ds = WaldenCatalog().find_one(namespace=NAMESPACE, short_name=WALDEN_SHORT_NAME, version=WALDEN_VERSION) - local_file = walden_ds.ensure_downloaded() - - # Create a dataframe from compressed file. - df = load_data(local_file=local_file) - - # - # Process data. - # - # Prepare data in a convenient format. - df = prepare_data(df=df) - - # - # Save outputs. - # - # Create new dataset, reuse walden metadata, and update metadata. - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.metadata.short_name = DATASET_SHORT_NAME - ds.metadata.title = DATASET_TITLE - ds.metadata.version = VERSION - ds.save() - - # Create table with metadata from walden. - tb_metadata = TableMeta( - short_name=DATASET_SHORT_NAME, - title=DATASET_TITLE, - description=walden_ds.description, - ) - tb = Table(df, metadata=tb_metadata) - # Underscore all table columns. - tb = underscore_table(tb) - # Add table to dataset. - ds.add(tb) diff --git a/etl/steps/archive/meadow/cait/2022-08-10/shared.py b/etl/steps/archive/meadow/cait/2022-08-10/shared.py deleted file mode 100644 index 6f58e8211f8..00000000000 --- a/etl/steps/archive/meadow/cait/2022-08-10/shared.py +++ /dev/null @@ -1,9 +0,0 @@ -from pathlib import Path - -from structlog import get_logger - -log = get_logger() - -NAMESPACE = "cait" -CURRENT_DIR = Path(__file__).parent -VERSION = CURRENT_DIR.name diff --git a/etl/steps/archive/meadow/eia/2022-07-27/energy_consumption.py b/etl/steps/archive/meadow/eia/2022-07-27/energy_consumption.py deleted file mode 100644 index 8f174c06538..00000000000 --- a/etl/steps/archive/meadow/eia/2022-07-27/energy_consumption.py +++ /dev/null @@ -1,152 +0,0 @@ -"""Meadow step to generate a dataset on total energy consumption using EIA data. - -""" - -from typing import cast - -import numpy as np -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog -from structlog import get_logger - -from etl.steps.data.converters import convert_walden_metadata - -log = get_logger() - -# Namespace, short name, title and description of the output dataset. -NAMESPACE = "eia" -DATASET_SHORT_NAME = "energy_consumption" -DATASET_TITLE = "Energy consumption (EIA, 2022)" -DATASET_DESCRIPTION = "Total energy consumption." -# Short name of raw data in walden. 
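The prepare_data function in the CAIT step deleted above relies on a pandas idiom worth spelling out: a column whose cells are lists of {year, value} dicts is exploded into one row per dict, and the dict fields are then pulled out into ordinary columns. A minimal, self-contained sketch with toy data (not the real CAIT payload):

```python
import pandas as pd

# Toy stand-in for the raw CAIT frame: one row per series, with all
# observations packed into a single "emissions" column.
df = pd.DataFrame(
    {
        "country": ["Germany", "France"],
        "emissions": [
            [{"year": 2000, "value": 1.0}, {"year": 2001, "value": 1.1}],
            [{"year": 2000, "value": 0.7}],
        ],
    }
)

# One row per {year, value} dict, then unpack each dict into ordinary columns.
df = df.explode("emissions").reset_index(drop=True)
df["year"] = [e["year"] for e in df["emissions"]]
df["value"] = [e["value"] for e in df["emissions"]]
df = df.drop(columns="emissions")
print(df)
```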
-WALDEN_DATASET_SHORT_NAME = "international_energy_data" -WALDEN_VERSION = "2022-07-27" -# Name of variable and unit as given in the raw data file. -VARIABLE_NAME = "Total energy consumption" -UNIT_NAME = "terajoules" - - -def extract_variable_from_raw_eia_data( - raw_data: pd.DataFrame, - variable_name: str, - unit_name: str, - data_time_interval: str = "Annual", -) -> pd.DataFrame: - """Extract data for a certain variable and unit from the raw EIA data (the International Energy Data obtained via - bulk download). - - The raw data is in a json format. After reading it with pandas (`pd.read_json(data_file, lines=True)`), the - dataframe has one row per variable-country, e.g. `Total energy consumption, Germany, Annual`, and the data for this - variable-country is given in the same row, but in a different column. That cell with data is a list of lists, e.g. - `[[2000, 0.5], [2001, 0.6], ...]`. This dataframe seems to have some duplicated rows (which we will simply drop). - - This function will extract that data and create a more convenient, long-format dataframe indexed by - country-year. It will also contain a column of 'members', which gives the country codes of the countries included in each - row. This may be useful to know how aggregate regions are defined by EIA. - - Parameters - ---------- - raw_data : pd.DataFrame - Raw EIA data. - variable_name : str - Name of variable to extract, as given in the raw data file. - unit_name : str - Name of unit to extract, as given in the raw data file. - data_time_interval : str - Time interval (e.g. 'Annual'), as given in the raw data file. - - Returns - ------- - data : pd.DataFrame - Extracted data for given variable and unit, as a dataframe indexed by country-year. - - """ - - columns = { - "name": "country", - "geography": "members", - "data": "values", - } - # Keep only rows with data for the given variable and unit. - data = raw_data[ - raw_data["name"].str.contains(variable_name, regex=False) & (raw_data["units"] == unit_name) - ].reset_index(drop=True) - - # Select and rename columns. - data = data.loc[:, list(columns)].rename(columns=columns) - - # Remove rows without data. - data = data.dropna(subset=["values"]) - - # Extract the country name. - data["country"] = data["country"].str.split(f"{variable_name}, ").str[1].str.split(f", {data_time_interval}").str[0] - - # For some reason some countries are duplicated; drop those duplicates. - data = data.drop_duplicates(subset="country", keep="last") - - # Expand the list of lists (e.g. `[[2000, 0.5], [2001, 0.6], ...]`) as one year-value per row (e.g. `[2000, 0.5]`). - data = data.explode("values").reset_index(drop=True) - - # Separate years from values in different columns. - data["year"] = data["values"].str[0] - data["values"] = data["values"].str[1] - - # Missing values are given as '--' in the original data, replace them with nan. - data["values"] = data["values"].replace("--", np.nan).astype(float) - - # Set index and sort appropriately. - data = data.set_index(["country", "year"], verify_integrity=True).sort_index() - - return cast(pd.DataFrame, data) - - -def run(dest_dir: str) -> None: - log.info(f"{DATASET_SHORT_NAME}.start") - - # - # Load data. - # - # Load ingested raw data from walden. - walden_ds = WaldenCatalog().find_one( - namespace=NAMESPACE, - short_name=WALDEN_DATASET_SHORT_NAME, - version=WALDEN_VERSION, - ) - local_file = walden_ds.ensure_downloaded() - raw_data = pd.read_json(local_file, lines=True) - - # - # Process data.
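A toy version of the reshaping that extract_variable_from_raw_eia_data performs, using invented rows rather than the real EIA bulk file: each cell of the "values" column is a list of [year, value] pairs, and missing values arrive as the string '--':

```python
import numpy as np
import pandas as pd

data = pd.DataFrame(
    {
        "country": ["Germany", "France"],
        "values": [[[2000, "0.5"], [2001, "--"]], [[2000, "0.7"]]],
    }
)

# Expand the list of lists so each row holds one [year, value] pair.
data = data.explode("values").reset_index(drop=True)

# Split each pair into separate columns (the cells are lists, so .str[i] indexes into them).
data["year"] = data["values"].str[0]
data["values"] = data["values"].str[1]

# '--' marks missing values in the raw file; convert to NaN and cast to float.
data["values"] = data["values"].replace("--", np.nan).astype(float)

data = data.set_index(["country", "year"], verify_integrity=True).sort_index()
print(data)
```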
- # - # Extract total energy consumption from the raw data. - data = extract_variable_from_raw_eia_data(raw_data=raw_data, variable_name=VARIABLE_NAME, unit_name=UNIT_NAME) - - # - # Save outputs. - # - # Create new dataset using metadata from walden. - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - # Update metadata appropriately. - ds.metadata.short_name = DATASET_SHORT_NAME - ds.metadata.title = DATASET_TITLE - ds.metadata.description = DATASET_DESCRIPTION - ds.save() - - # Create a table in the dataset with the same metadata as the dataset. - table_metadata = TableMeta( - short_name=DATASET_SHORT_NAME, - title=DATASET_TITLE, - description=DATASET_DESCRIPTION, - ) - tb = Table(data, metadata=table_metadata) - - # Ensure all columns are lower-case and snake-case. - tb = underscore_table(tb) - - # Add table to a dataset. - ds.add(tb) - - log.info(f"{DATASET_SHORT_NAME}.end") diff --git a/etl/steps/archive/meadow/eia/2022-07-27/shared.py b/etl/steps/archive/meadow/eia/2022-07-27/shared.py deleted file mode 100644 index 1fbde3433ad..00000000000 --- a/etl/steps/archive/meadow/eia/2022-07-27/shared.py +++ /dev/null @@ -1,4 +0,0 @@ -from pathlib import Path - -CURRENT_DIR = Path(__file__).parent -VERSION = CURRENT_DIR.name diff --git a/etl/steps/archive/meadow/ember/2022-12-13/yearly_electricity.py b/etl/steps/archive/meadow/ember/2022-12-13/yearly_electricity.py deleted file mode 100644 index e10c73ed6cc..00000000000 --- a/etl/steps/archive/meadow/ember/2022-12-13/yearly_electricity.py +++ /dev/null @@ -1,31 +0,0 @@ -"""Load snapshot of Ember's Yearly Electricity Data and create a raw data table. - -""" -import pandas as pd -from owid.catalog import Dataset, Table - -from etl.helpers import PathFinder -from etl.steps.data.converters import convert_snapshot_metadata - -# Get naming conventions. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # Retrieve snapshot. - snap = paths.load_dependency("yearly_electricity.csv") - df = pd.read_csv(snap.path) - - # Create new dataset and reuse original metadata. - ds = Dataset.create_empty(dest_dir, metadata=convert_snapshot_metadata(snap.metadata)) - ds.metadata.version = paths.version - - # Create a table with metadata and ensure all columns are snake-case. - tb = Table(df, short_name=snap.metadata.short_name, underscore=True) - - # Set appropriate indexes. - tb = tb.set_index(["area", "year", "variable", "unit"], verify_integrity=True) - - # Add table to the new dataset, and save dataset. - ds.add(tb) - ds.save() diff --git a/etl/steps/archive/meadow/ember/2023-02-20/yearly_electricity.py b/etl/steps/archive/meadow/ember/2023-02-20/yearly_electricity.py deleted file mode 100644 index e10c73ed6cc..00000000000 --- a/etl/steps/archive/meadow/ember/2023-02-20/yearly_electricity.py +++ /dev/null @@ -1,31 +0,0 @@ -"""Load snapshot of Ember's Yearly Electricity Data and create a raw data table. - -""" -import pandas as pd -from owid.catalog import Dataset, Table - -from etl.helpers import PathFinder -from etl.steps.data.converters import convert_snapshot_metadata - -# Get naming conventions. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # Retrieve snapshot. - snap = paths.load_dependency("yearly_electricity.csv") - df = pd.read_csv(snap.path) - - # Create new dataset and reuse original metadata. 
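The yearly_electricity steps above (and the 2023-06-01 one below) all index on (area, year, variable, unit) with verify_integrity=True, so duplicated keys abort the step instead of silently surviving into the catalog. A small sketch of the failure mode, with made-up rows:

```python
import pandas as pd

tb = pd.DataFrame(
    {
        "area": ["France", "France"],
        "year": [2020, 2020],
        "variable": ["solar", "solar"],
        "unit": ["TWh", "TWh"],
        "value": [13.0, 13.5],  # two conflicting values for the same key
    }
)

try:
    tb.set_index(["area", "year", "variable", "unit"], verify_integrity=True)
except ValueError as e:
    # Duplicate (area, year, variable, unit) keys raise rather than being kept.
    print("duplicate index entries:", e)
```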
- ds = Dataset.create_empty(dest_dir, metadata=convert_snapshot_metadata(snap.metadata)) - ds.metadata.version = paths.version - - # Create a table with metadata and ensure all columns are snake-case. - tb = Table(df, short_name=snap.metadata.short_name, underscore=True) - - # Set appropriate indexes. - tb = tb.set_index(["area", "year", "variable", "unit"], verify_integrity=True) - - # Add table to the new dataset, and save dataset. - ds.add(tb) - ds.save() diff --git a/etl/steps/archive/meadow/ember/2023-06-01/yearly_electricity.py b/etl/steps/archive/meadow/ember/2023-06-01/yearly_electricity.py deleted file mode 100644 index cbfb5149463..00000000000 --- a/etl/steps/archive/meadow/ember/2023-06-01/yearly_electricity.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Load snapshot of Ember's Yearly Electricity Data and create a raw data table. - -""" -import pandas as pd -from owid.catalog import Table - -from etl.helpers import PathFinder, create_dataset - -# Get naming conventions. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Retrieve snapshot. - snap = paths.load_dependency("yearly_electricity.csv") - df = pd.read_csv(snap.path) - - # - # Process data. - # - # Create a table with metadata and ensure all columns are snake-case. - tb = Table(df, short_name=paths.short_name, underscore=True) - - # Set an appropriate index and sort conveniently. - tb = tb.set_index(["area", "year", "variable", "unit"], verify_integrity=True).sort_index().sort_index(axis=1) - - # - # Save outputs. - # - # Create a new meadow dataset with the same metadata as the snapshot. - ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) - - # Save changes in the new garden dataset. - ds_meadow.save() diff --git a/etl/steps/archive/meadow/emdat/2022-11-24/natural_disasters.py b/etl/steps/archive/meadow/emdat/2022-11-24/natural_disasters.py deleted file mode 100644 index c71b5dc24dc..00000000000 --- a/etl/steps/archive/meadow/emdat/2022-11-24/natural_disasters.py +++ /dev/null @@ -1,79 +0,0 @@ -"""Load snapshot of EM-DAT natural disasters data and prepare a table with basic metadata. - -""" - -import warnings - -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta - -from etl.helpers import PathFinder -from etl.snapshot import Snapshot -from etl.steps.data.converters import convert_snapshot_metadata - -# Snapshot version. -SNAPSHOT_VERSION = "2022-11-24" -# Current Meadow dataset version. -VERSION = SNAPSHOT_VERSION - -# Get naming conventions. -N = PathFinder(__file__) - -# Columns to extract from raw data, and how to rename them. -COLUMNS = { - "Country": "country", - "Year": "year", - "Disaster Group": "group", - "Disaster Subgroup": "subgroup", - "Disaster Type": "type", - "Disaster Subtype": "subtype", - "Disaster Subsubtype": "subsubtype", - "Event Name": "event", - "Region": "region", - "Continent": "continent", - "Total Deaths": "total_dead", - "No Injured": "injured", - "No Affected": "affected", - "No Homeless": "homeless", - "Total Affected": "total_affected", - "Reconstruction Costs ('000 US$)": "reconstruction_costs", - "Insured Damages ('000 US$)": "insured_damages", - "Total Damages ('000 US$)": "total_damages", - "Start Year": "start_year", - "Start Month": "start_month", - "Start Day": "start_day", - "End Year": "end_year", - "End Month": "end_month", - "End Day": "end_day", -} - - -def run(dest_dir: str) -> None: - # Load snapshot. 
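The COLUMNS dict in the EM-DAT step above serves double duty: df[list(COLUMNS)] keeps only the mapped columns, in dict order, and .rename(columns=COLUMNS) converts the spreadsheet headers to snake_case in the same pass. A tiny illustration with invented data:

```python
import pandas as pd

COLUMNS = {"Country": "country", "Year": "year", "Total Deaths": "total_dead"}

df = pd.DataFrame(
    {"Country": ["Chile"], "Year": [2010], "Total Deaths": [5], "Unused": ["x"]}
)

# Select the mapped columns (dropping "Unused") and rename them in one pass.
df = df[list(COLUMNS)].rename(columns=COLUMNS)
print(df.columns.tolist())  # ['country', 'year', 'total_dead']
```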
- snap = Snapshot(f"emdat/{SNAPSHOT_VERSION}/natural_disasters.xlsx") - with warnings.catch_warnings(record=True): - df = pd.read_excel(snap.path, sheet_name="emdat data", skiprows=6) - - # Select and rename columns. - df = df[list(COLUMNS)].rename(columns=COLUMNS) - - # Sanity check. - error = "Expected only 'Natural' in 'group' column." - assert set(df["group"]) == set(["Natural"]), error - - # Create a new dataset and reuse snapshot metadata. - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_snapshot_metadata(snap.metadata) - ds.metadata.version = VERSION - - # Create a table with metadata from dataframe. - table_metadata = TableMeta( - short_name=snap.metadata.short_name, - title=snap.metadata.name, - description=snap.metadata.description, - ) - tb = Table(df, metadata=table_metadata, underscore=True) - - # Add table to new dataset and save dataset. - ds.add(tb) - ds.save() diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_ef.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_ef.py deleted file mode 100644 index c1b3ce5eec8..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_ef.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_ef dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_ei.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_ei.py deleted file mode 100644 index 8f8c520ac1c..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_ei.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_ei dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_ek.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_ek.py deleted file mode 100644 index 8affbd5ac70..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_ek.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_ek dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_el.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_el.py deleted file mode 100644 index 7cda6b5ced7..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_el.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_el dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_emn.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_emn.py deleted file mode 100644 index e0341d5f29b..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_emn.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_emn dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_ep.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_ep.py deleted file mode 100644 index de1278faacf..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_ep.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_ep dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_esb.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_esb.py deleted file mode 100644 index d90d2c0538a..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_esb.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_esb dataset.""" -from .shared import run # noqa:F401 diff --git 
a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_fa.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_fa.py deleted file mode 100644 index 29014f1b54a..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_fa.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_fa dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_fbs.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_fbs.py deleted file mode 100644 index 65cbf54e4e3..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_fbs.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_fbs dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_fbsh.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_fbsh.py deleted file mode 100644 index ef0b7233357..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_fbsh.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_fbsh dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_fo.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_fo.py deleted file mode 100644 index 9932ebb4718..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_fo.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_fo dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_fs.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_fs.py deleted file mode 100644 index 74f1892050e..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_fs.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_fs dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_lc.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_lc.py deleted file mode 100644 index a18b1892fbf..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_lc.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_lc dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_metadata.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_metadata.py deleted file mode 100644 index ddcb64268b5..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_metadata.py +++ /dev/null @@ -1,147 +0,0 @@ -"""FAOSTAT (additional) metadata dataset (originally ingested in walden using the FAOSTAT API). - -Load the (additional) metadata dataset from walden, and create a meadow dataset with as many tables as domain-categories -(e.g. 'faostat_qcl_area', 'faostat_fbs_item', ...). - -All categories are defined below in 'category_structure'. - -""" - -import json -from typing import Any, Dict - -import pandas as pd -from owid.catalog import Dataset, Table, utils -from owid.walden import Catalog -from shared import LATEST_VERSIONS_FILE, NAMESPACE - -from etl.steps.data.converters import convert_walden_metadata - -# Name for new meadow dataset. -DATASET_SHORT_NAME = f"{NAMESPACE}_metadata" - -# Define the structure of the additional metadata file. 
-category_structure = { - "area": { - "index": ["Country Code"], - "short_name": "area", - }, - "areagroup": { - "index": ["Country Group Code", "Country Code"], - "short_name": "area_group", - }, - "element": { - "index": ["Element Code"], - "short_name": "element", - }, - "flag": { - "index": ["Flag"], - "short_name": "flag", - }, - "glossary": { - "index": ["Glossary Code"], - "short_name": "glossary", - }, - "item": { - "index": ["Item Code"], - "short_name": "item", - }, - "itemfactor": { - "index": ["Item Group Code", "Item Code", "Element Code"], - "short_name": "item_factor", - }, - "itemgroup": { - "index": ["Item Group Code", "Item Code"], - "short_name": "item_group", - }, - "items": { - "index": ["Item Code"], - "short_name": "item", - }, - "itemsgroup": { - "index": ["Item Group Code", "Item Code"], - "short_name": "item_group", - }, - "recipientarea": { - "index": ["Recipient Country Code"], - "short_name": "area", - }, - "unit": { - "index": ["Unit Name"], - "short_name": "unit", - }, - "year": { - "index": ["Year Code"], - "short_name": "year", - }, - "year3": { - "index": ["Year Code"], - "short_name": "year", - }, -} - - -def check_that_category_structure_is_well_defined(md: Dict[str, Any]) -> None: - """Check that metadata content is consistent with category_structure (defined above). - - If that is not the case, it is possible that the content of metadata has changed, and therefore category_structure - may need to be edited. - - Parameters - ---------- - md : dict - Raw FAOSTAT (additional) metadata of all datasets. - - """ - for dataset in list(md): - for category in category_structure: - category_indexes = category_structure[category]["index"] - if category in md[dataset]: - category_metadata = md[dataset][category]["data"] - for entry in category_metadata: - for category_index in category_indexes: - error = ( - f"Index {category_index} not found in {category} for {dataset}. " - f"Consider redefining category_structure." - ) - assert category_index in entry, error - - -def run(dest_dir: str) -> None: - # Load file of versions. - latest_versions = pd.read_csv(LATEST_VERSIONS_FILE).set_index(["channel", "dataset"]) - - # Load FAOSTAT (additional) metadata dataset from walden. - walden_latest_version = latest_versions.loc["walden", DATASET_SHORT_NAME].item() - walden_ds = Catalog().find_one( - namespace=NAMESPACE, - version=walden_latest_version, - short_name=DATASET_SHORT_NAME, - ) - - local_file = walden_ds.ensure_downloaded() - with open(local_file) as _local_file: - additional_metadata = json.load(_local_file) - - # Check that metadata content is consistent with category_structure (defined above). - check_that_category_structure_is_well_defined(md=additional_metadata) - - # Create new meadow dataset, importing its metadata from walden. - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.metadata.short_name = DATASET_SHORT_NAME - ds.save() - # Create a new table within the dataset for each domain-record (e.g. 'faostat_qcl_item'). 
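Before that per-domain loop runs, check_that_category_structure_is_well_defined walks the nested metadata payload and asserts that every index column declared in category_structure is present in every record. A condensed sketch of the same validation, on a made-up payload:

```python
category_structure = {"item": {"index": ["Item Code"], "short_name": "item"}}

# Made-up additional-metadata payload: domain -> category -> {"data": [records]}.
md = {"faostat_qcl": {"item": {"data": [{"Item Code": 15, "Item": "Wheat"}]}}}

for dataset in md:
    for category, spec in category_structure.items():
        if category in md[dataset]:
            for entry in md[dataset][category]["data"]:
                for idx in spec["index"]:
                    error = f"Index {idx} not found in {category} for {dataset}."
                    assert idx in entry, error
print("category structure is consistent with the metadata payload")
```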
- for domain in additional_metadata: - for category in list(additional_metadata[domain]): - json_data = additional_metadata[domain][category]["data"] - df = pd.DataFrame.from_dict(json_data) - if len(df) > 0: - df.set_index( - category_structure[category]["index"], - verify_integrity=True, - inplace=True, - ) - t = Table(df) - t.metadata.short_name = f'{NAMESPACE}_{domain.lower()}_{category_structure[category]["short_name"]}' - ds.add(utils.underscore_table(t)) diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_qcl.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_qcl.py deleted file mode 100644 index d66b2edc113..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_qcl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_qcl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_qi.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_qi.py deleted file mode 100644 index 460cc5faca5..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_qi.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_qi dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_qv.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_qv.py deleted file mode 100644 index 07e74a4a95b..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_qv.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_qv dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_rfb.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_rfb.py deleted file mode 100644 index ae439c21964..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_rfb.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_rfb dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_rfn.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_rfn.py deleted file mode 100644 index bae546a50e5..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_rfn.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_rfn dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_rl.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_rl.py deleted file mode 100644 index cb95f2263fb..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_rl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_rl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_rp.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_rp.py deleted file mode 100644 index 010769e5587..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_rp.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_rp dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_rt.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_rt.py deleted file mode 100644 index 7254a8063e9..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_rt.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_rt dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_scl.py 
b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_scl.py deleted file mode 100644 index 9e81649bef0..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_scl.py +++ /dev/null @@ -1,117 +0,0 @@ -"""FAOSTAT meadow step for faostat_scl dataset. - -In the original data, item_code seems to have been confused with cpc_code. -For example, item "Wheat" in the data has item code 111, but in the metadata, "Wheat" has item code 15 (and cpc code -111). This does not affect the data values, but if we wanted to merge this dataset with another one using item code, -we would get wrong results. Also, descriptions fetched from the metadata would be wrong for this dataset. -Here, we rename item_code to cpc_code and join with the metadata to get the true item codes. - -Apart from this issue, the rest of the processing of the dataset is identical to that of all other datasets. - -""" - -from pathlib import Path - -import pandas as pd -from owid.catalog import Dataset, Table, utils -from owid.walden import Catalog -from shared import ( - LATEST_VERSIONS_FILE, - NAMESPACE, - load_data, - prepare_output_data, - run_sanity_checks, ) - -from etl.paths import DATA_DIR -from etl.steps.data.converters import convert_walden_metadata - - -def fix_items(data: pd.DataFrame, metadata: Dataset) -> pd.DataFrame: - """Add the true item codes to the data, extracted from the metadata. - - Parameters - ---------- - data : pd.DataFrame - Data for faostat_scl. - metadata : catalog.Dataset - Global metadata dataset. - - Returns - ------- - data_fixed : pd.DataFrame - Original data after replacing item_code by the true item codes. - - """ - # Get items metadata for faostat_scl dataset. - items_metadata = metadata[f"{NAMESPACE}_scl_item"] - - # Replace item_code by cpc_code, join with items metadata for this dataset, and get the right item_codes. - data_fixed = ( - pd.merge( - data.reset_index(drop=True).rename(columns={"Item Code": "cpc_code"}), - items_metadata.reset_index()[["cpc_code", "item_code"]], - on="cpc_code", - how="left", - ) - .drop(columns="cpc_code") - .rename(columns={"item_code": "Item Code"}) - ) - - return data_fixed - - -def run(dest_dir: str) -> None: - #################################################################################################################### - # Common definitions. - #################################################################################################################### - - # Assume dest_dir is a path to the step that needs to be run, e.g. "faostat_qcl", and fetch dataset short name from - # that path. - dataset_short_name = Path(dest_dir).name - - #################################################################################################################### - # Load data. - #################################################################################################################### - - # Load file of versions. - latest_versions = pd.read_csv(LATEST_VERSIONS_FILE).set_index(["channel", "dataset"]) - - # Fetch latest walden dataset. - walden_version = latest_versions.loc["walden", dataset_short_name].item() - walden_ds = Catalog().find_one(namespace=NAMESPACE, version=walden_version, short_name=dataset_short_name) - - # Load data. - data = load_data(walden_ds.local_path) - - # Load metadata.
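fix_items above repairs the swapped codes with a plain left merge: the data's "Item Code" column actually holds CPC codes, so it is renamed to cpc_code, joined against the item metadata, and the recovered true item codes take its place. A toy version, with codes borrowed from the Wheat example in the docstring:

```python
import pandas as pd

# Data whose "Item Code" column actually contains CPC codes (Wheat -> 111).
data = pd.DataFrame({"Item Code": [111], "Value": [42.0]})

# Item metadata mapping each CPC code to the true item code (Wheat -> 15).
items_metadata = pd.DataFrame({"cpc_code": [111], "item_code": [15]})

data_fixed = (
    data.rename(columns={"Item Code": "cpc_code"})
    .merge(items_metadata, on="cpc_code", how="left")
    .drop(columns="cpc_code")
    .rename(columns={"item_code": "Item Code"})
)
print(data_fixed)  # "Item Code" is now 15, not 111.
```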
- metadata_version = latest_versions.loc["meadow", f"{NAMESPACE}_metadata"].item() - metadata = Dataset(DATA_DIR / "meadow" / NAMESPACE / metadata_version / f"{NAMESPACE}_metadata") - - #################################################################################################################### - # Prepare data. - #################################################################################################################### - - # Fix issue with item codes. - data = fix_items(data=data, metadata=metadata) - - # Run sanity checks. - run_sanity_checks(data=data) - - #################################################################################################################### - # Save outputs. - #################################################################################################################### - - # Prepare output data. - data = prepare_output_data(data=data) - - # Initialise meadow dataset. - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.metadata.short_name = dataset_short_name - ds.save() - - # Add tables to dataset. - t = Table(data) - t.metadata.short_name = dataset_short_name - ds.add(utils.underscore_table(t)) diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_sdgb.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_sdgb.py deleted file mode 100644 index bde23c34c06..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_sdgb.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_sdgb dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_tcl.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_tcl.py deleted file mode 100644 index c5299c892af..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_tcl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_tcl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_ti.py b/etl/steps/archive/meadow/faostat/2022-05-17/faostat_ti.py deleted file mode 100644 index 9cfc9f9af7a..00000000000 --- a/etl/steps/archive/meadow/faostat/2022-05-17/faostat_ti.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_ti dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ef.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ef.py deleted file mode 100644 index c1b3ce5eec8..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ef.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_ef dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ei.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ei.py deleted file mode 100644 index 8f8c520ac1c..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ei.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_ei dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ek.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ek.py deleted file mode 100644 index 8affbd5ac70..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ek.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_ek dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_el.py 
b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_el.py deleted file mode 100644 index 7cda6b5ced7..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_el.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_el dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_emn.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_emn.py deleted file mode 100644 index e0341d5f29b..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_emn.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_emn dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ep.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ep.py deleted file mode 100644 index de1278faacf..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ep.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_ep dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_esb.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_esb.py deleted file mode 100644 index d90d2c0538a..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_esb.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_esb dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_fa.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_fa.py deleted file mode 100644 index 29014f1b54a..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_fa.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_fa dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_fbs.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_fbs.py deleted file mode 100644 index 65cbf54e4e3..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_fbs.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_fbs dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_fbsh.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_fbsh.py deleted file mode 100644 index ef0b7233357..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_fbsh.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_fbsh dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_fo.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_fo.py deleted file mode 100644 index 9932ebb4718..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_fo.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_fo dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_fs.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_fs.py deleted file mode 100644 index 74f1892050e..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_fs.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_fs dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_gn.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_gn.py deleted file mode 100644 index 6cc1cdd3414..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_gn.py +++ /dev/null @@ -1,2 +0,0 @@ 
-"""FAOSTAT meadow step for faostat_gn dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ic.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ic.py deleted file mode 100644 index 76a7833c6f8..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ic.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_ic dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_lc.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_lc.py deleted file mode 100644 index a18b1892fbf..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_lc.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_lc dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_metadata.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_metadata.py deleted file mode 100644 index 2bebb7c9401..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_metadata.py +++ /dev/null @@ -1,190 +0,0 @@ -"""Load FAOSTAT (additional) metadata (snapshot ingested using the API) and create a meadow faostat_metadata dataset. - -The resulting meadow dataset has as many tables as domain-categories ('faostat_qcl_area', 'faostat_fbs_item', ...). - -All categories are defined below in 'category_structure'. - -""" - -from pathlib import Path -from typing import Any, Dict, List - -import pandas as pd -from owid.catalog import Table -from owid.datautils.io import load_json -from shared import CURRENT_DIR, NAMESPACE - -from etl.helpers import PathFinder, create_dataset - -# Name for new meadow dataset. -DATASET_SHORT_NAME = f"{NAMESPACE}_metadata" - -# Define the structure of the additional metadata file. -category_structure = { - "area": { - "index": ["Country Code"], - "short_name": "area", - }, - "areagroup": { - "index": ["Country Group Code", "Country Code"], - "short_name": "area_group", - }, - "element": { - "index": ["Element Code"], - "short_name": "element", - }, - "flag": { - "index": ["Flag"], - "short_name": "flag", - }, - "glossary": { - "index": ["Glossary Code"], - "short_name": "glossary", - }, - "item": { - "index": ["Item Code"], - "short_name": "item", - }, - "itemfactor": { - "index": ["Item Group Code", "Item Code", "Element Code"], - "short_name": "item_factor", - }, - "itemgroup": { - "index": ["Item Group Code", "Item Code"], - "short_name": "item_group", - }, - "items": { - "index": ["Item Code"], - "short_name": "item", - }, - "itemsgroup": { - "index": ["Item Group Code", "Item Code"], - "short_name": "item_group", - }, - # Specific for faostat_fa. - "recipientarea": { - "index": ["Recipient Country Code"], - "short_name": "area", - }, - "unit": { - "index": ["Unit Name"], - "short_name": "unit", - }, - # Specific for faostat_fa. - "year": { - "index": ["Year Code"], - "short_name": "year", - }, - # Specific for faostat_fs. - "year3": { - "index": ["Year Code"], - "short_name": "year", - }, - "years": { - "index": ["Year Code"], - "short_name": "year", - }, - # Specific for faostat_wcad. - "yearwca": { - "index": ["Year Code"], - "short_name": "year", - }, - # Specific for faostat_gn. - "sources": { - "index": ["Source Code"], - "short_name": "sources", - }, -} - - -def check_that_category_structure_is_well_defined(md: Dict[str, Any]) -> None: - """Check that metadata content is consistent with category_structure (defined above). 
- - If that is not the case, it is possible that the content of metadata has changed, and therefore category_structure - may need to be edited. - - Parameters - ---------- - md : dict - Raw FAOSTAT (additional) metadata of all datasets. - - """ - for dataset in list(md): - for category in category_structure: - category_indexes = category_structure[category]["index"] - if category in md[dataset]: - category_metadata = md[dataset][category]["data"] - for entry in category_metadata: - for category_index in category_indexes: - error = ( - f"Index {category_index} not found in {category} for {dataset}. " - f"Consider redefining category_structure." - ) - assert category_index in entry, error - - -def create_tables_for_all_domain_records(additional_metadata: Dict[str, Any]) -> List[Table]: - """Create a table for each of the domain-categories (e.g. 'faostat_qcl_item'). - - Parameters - ---------- - additional_metadata : Dict[str, Any] - FAOSTAT additional metadata. - - Returns - ------- - tables: List[Table] - List of tables, each one corresponding to a specific domain-category. - - """ - # Create a new table for each domain-category (e.g. 'faostat_qcl_item'). - tables = [] - for domain in additional_metadata: - for category in list(additional_metadata[domain]): - json_data = additional_metadata[domain][category]["data"] - df = pd.DataFrame.from_dict(json_data) - if len(df) > 0: - df.set_index( - category_structure[category]["index"], - verify_integrity=True, - inplace=True, - ) - table_short_name = f'{NAMESPACE}_{domain.lower()}_{category_structure[category]["short_name"]}' - table = Table(df, short_name=table_short_name) - tables.append(table) - - return tables - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Fetch the dataset short name from dest_dir. - dataset_short_name = Path(dest_dir).name - - # Define path to current step file. - current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py") - - # Get paths and naming conventions for current data step. - paths = PathFinder(current_step_file.as_posix()) - - # Load snapshot. - snapshot = paths.load_dependency(short_name=dataset_short_name + ".json", channel="snapshot") - additional_metadata = load_json(snapshot.path) - - # - # Process data. - # - # Run sanity checks. - check_that_category_structure_is_well_defined(md=additional_metadata) - - # Create a new table for each domain-record (e.g. 'faostat_qcl_item'). - tables = create_tables_for_all_domain_records(additional_metadata=additional_metadata) - - # - # Save outputs. - # - # Create a new meadow dataset. 
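create_tables_for_all_domain_records, shown above, builds one indexed table per non-empty domain-category. A pandas-only sketch of the same loop (owid.catalog's Table wraps a DataFrame, so plain DataFrames stand in here, and the payload is invented):

```python
import pandas as pd

NAMESPACE = "faostat"
category_structure = {"item": {"index": ["Item Code"], "short_name": "item"}}
md = {"qcl": {"item": {"data": [{"Item Code": 15, "Item": "Wheat"}]}}}

tables = {}
for domain in md:
    for category, payload in md[domain].items():
        df = pd.DataFrame(payload["data"])
        if len(df) > 0:
            # Uniqueness of the declared index is enforced per category.
            df = df.set_index(category_structure[category]["index"], verify_integrity=True)
            short_name = f'{NAMESPACE}_{domain.lower()}_{category_structure[category]["short_name"]}'
            tables[short_name] = df
print(list(tables))  # ['faostat_qcl_item']
```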
- ds_meadow = create_dataset(dest_dir=dest_dir, tables=tables, default_metadata=snapshot.metadata) - ds_meadow.save() diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_qcl.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_qcl.py deleted file mode 100644 index d66b2edc113..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_qcl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_qcl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_qi.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_qi.py deleted file mode 100644 index 460cc5faca5..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_qi.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_qi dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_qv.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_qv.py deleted file mode 100644 index 07e74a4a95b..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_qv.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_qv dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_rfb.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_rfb.py deleted file mode 100644 index ae439c21964..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_rfb.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_rfb dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_rfn.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_rfn.py deleted file mode 100644 index bae546a50e5..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_rfn.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_rfn dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_rl.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_rl.py deleted file mode 100644 index cb95f2263fb..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_rl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_rl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_rp.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_rp.py deleted file mode 100644 index 010769e5587..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_rp.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_rp dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_rt.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_rt.py deleted file mode 100644 index 7254a8063e9..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_rt.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_rt dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_scl.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_scl.py deleted file mode 100644 index e9fc0ab99e4..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_scl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_scl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_sdgb.py 
b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_sdgb.py deleted file mode 100644 index bde23c34c06..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_sdgb.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_sdgb dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_tcl.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_tcl.py deleted file mode 100644 index c5299c892af..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_tcl.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_tcl dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ti.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ti.py deleted file mode 100644 index 9cfc9f9af7a..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_ti.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_ti dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_wcad.py b/etl/steps/archive/meadow/faostat/2023-02-22/faostat_wcad.py deleted file mode 100644 index e3e3d84a0a8..00000000000 --- a/etl/steps/archive/meadow/faostat/2023-02-22/faostat_wcad.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_wcad dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/archive/meadow/gcp/2022-09-29/global_carbon_budget_additional.py b/etl/steps/archive/meadow/gcp/2022-09-29/global_carbon_budget_additional.py deleted file mode 100644 index 985a1d19dc8..00000000000 --- a/etl/steps/archive/meadow/gcp/2022-09-29/global_carbon_budget_additional.py +++ /dev/null @@ -1,175 +0,0 @@ -"""This step just loads additional variables that are currently not included in the Global Carbon Budget (GCB) dataset -(which was created in importers). - -In the future (next time GCB dataset is updated and moved to ETL), a newer version of this step should gather all -required data from walden. - -""" - -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog - -from etl.steps.data.converters import convert_walden_metadata - -# Conversion factor to change from million tonnes of carbon to tonnes of CO2. -MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e6 -# Conversion factor to change from billion tonnes of carbon to tonnes of CO2. -BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e9 - -# Details of dataset(s) to be imported. -WALDEN_GLOBAL_DATASET_NAME = "global_carbon_budget_global" -WALDEN_NATIONAL_DATASET_NAME = "global_carbon_budget_national" -WALDEN_VERSION = "2022-09-29" -# Details of dataset to be exported. -MEADOW_VERSION = "2022-09-29" -MEADOW_DATASET_NAME = "global_carbon_budget_additional" -MEADOW_TITLE = "Global Carbon Budget - Additional variables" - - -def prepare_historical_budget(df: pd.DataFrame) -> pd.DataFrame: - """Select variables and prepare the historical budget sheet of GCB's raw global data file. - - Parameters - ---------- - df : pd.DataFrame - Historical budget sheet of GCB's raw global data file. - - Returns - ------- - df : pd.DataFrame - Historical budget after selecting variables and processing them. - - """ - # Columns to select in historical budget and how to rename them.
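The two conversion constants defined above encode the molar-mass ratio of CO2 to carbon (44.01 / 12.01, roughly 3.664), scaled from megatonnes and gigatonnes respectively down to tonnes. A quick check of the arithmetic, with an invented emissions figure:

```python
# 1 tonne of carbon corresponds to roughly 44.01 / 12.01 = 3.664 tonnes of CO2.
BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e9

# E.g. a (made-up) 10 GtC/yr of fossil emissions:
emissions_gtc = 10.0
emissions_tco2 = emissions_gtc * BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2
print(f"{emissions_tco2:.3e} t CO2")  # 3.664e+10 t CO2
```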
- columns = { - "Year": "year", - "fossil emissions excluding carbonation": "global_fossil_emissions", - "land-use change emissions": "global_land_use_change_emissions", - } - df = df[list(columns)].rename(columns=columns) - - # Convert units from gigatonnes of carbon per year emissions to tonnes of CO2 per year. - for column in df.drop(columns="year").columns: - df[column] *= BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - # Add column for country (to be able to combine this with the national data). - df["country"] = "World" - - # Set an index and sort row and columns conveniently. - df = df.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - return df - - -def prepare_emissions(df: pd.DataFrame, column_name: str) -> pd.DataFrame: - """Select variables and prepare the territorial emissions or the consumption emissions sheet of - GCB's raw global data file. - - Parameters - ---------- - df : pd.DataFrame - Territorial emissions (or consumption emissions) sheet of GCB's raw national data file. - column_name : str - Name to assign to emissions column to be generated. - - Returns - ------- - df : pd.DataFrame - Processed territorial (or consumption) emissions sheet of GCB's raw global data file. - - """ - df = df.copy() - - # The zeroth column is expected to be year. - df = df.rename(columns={df.columns[0]: "year"}) - - # Each column represents a country; then the final columns are regions, "Bunkers", and "Statistical Difference". - # Keep "Bunkers", but remove "Statistical Difference" (which is almost completely empty). - # In fact "Bunkers" is a global variable (I don't know why it is included at the national level), but this will be - # handled at the garden step. - - # Remove unnecessary column. - df = df.drop(columns=["Statistical Difference"]) - - # Convert from wide to long format dataframe. - df = df.melt(id_vars=["year"]).rename(columns={"variable": "country", "value": column_name}) - - # Convert units from megatonnes of carbon per year emissions to tonnes of CO2 per year. - for column in df.drop(columns=["country", "year"]).columns: - df[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - # Set an index and sort row and columns conveniently. - df = df.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - return df - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load historical budget data from the global data file in walden. - global_ds = WaldenCatalog().find_one(namespace="gcp", short_name=WALDEN_GLOBAL_DATASET_NAME, version=WALDEN_VERSION) - historical_budget_df = pd.read_excel(global_ds.ensure_downloaded(), sheet_name="Historical Budget", skiprows=15) - error = "'Historical Budget' sheet in global data file has changed (consider changing 'skiprows')." - assert historical_budget_df.columns[0] == "Year", error - - # Load national data file from walden. - national_ds = WaldenCatalog().find_one( - namespace="gcp", short_name=WALDEN_NATIONAL_DATASET_NAME, version=WALDEN_VERSION - ) - # Load production-based emissions from the national data file. - production_emissions_df = pd.read_excel( - national_ds.ensure_downloaded(), sheet_name="Territorial Emissions", skiprows=11 - ) - error = "'Territorial Emissions' sheet in national data file has changed (consider changing 'skiprows')." - assert production_emissions_df.columns[1] == "Afghanistan", error - # Load consumption-based emissions from the national data file. 
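An aside on the unit conversions in the two prepare functions above: the constants 3.664 * 1e6 and 3.664 * 1e9 come from the molar-mass ratio of CO2 to carbon (44.01 / 12.011, roughly 3.664), scaled from million or billion tonnes up to tonnes. A minimal, self-contained sketch of the same conversion loop on a toy frame (the column names here are illustrative, not taken from the GCB files):

import pandas as pd

# 44.01 g/mol (CO2) / 12.011 g/mol (C) ~= 3.664; 1e9 scales gigatonnes to tonnes.
BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e9

df = pd.DataFrame({"year": [2019, 2020], "global_fossil_emissions": [9.9, 9.5]})

# Convert every data column (all except "year"), as prepare_historical_budget does.
for column in df.drop(columns="year").columns:
    df[column] *= BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2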
- consumption_emissions_df = pd.read_excel( - national_ds.ensure_downloaded(), sheet_name="Consumption Emissions", skiprows=8 - ) - error = "'Consumption Emissions' sheet in national data file has changed (consider changing 'skiprows')." - assert consumption_emissions_df.columns[1] == "Afghanistan", error - - # - # Process data. - # - # Prepare historical budget data. - historical_budget_df = prepare_historical_budget(df=historical_budget_df) - - # Prepare production and consumption based emissions data. - production_emissions_df = prepare_emissions(df=production_emissions_df, column_name="production_emissions") - consumption_emissions_df = prepare_emissions(df=consumption_emissions_df, column_name="consumption_emissions") - - # - # Save outputs. - # - # Create new dataset and reuse walden metadata (from any of the raw files). - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(global_ds) - ds.metadata.version = MEADOW_VERSION - ds.metadata.short_name = MEADOW_DATASET_NAME - # Create tables with metadata. - consumption_emissions_tb = Table( - consumption_emissions_df, - metadata=TableMeta(short_name="consumption_emissions", title="Consumption-based emissions"), - ) - production_emissions_tb = Table( - production_emissions_df, - metadata=TableMeta(short_name="production_emissions", title="Production-based emissions"), - ) - historical_budget_tb = Table( - historical_budget_df, metadata=TableMeta(short_name="historical_emissions", title="Historical emissions") - ) - # Ensure all columns are lower snake case. - consumption_emissions_tb = underscore_table(consumption_emissions_tb) - production_emissions_tb = underscore_table(production_emissions_tb) - historical_budget_tb = underscore_table(historical_budget_tb) - # Add tables to new dataset. - ds.add(consumption_emissions_tb) - ds.add(production_emissions_tb) - ds.add(historical_budget_tb) - # Save dataset. - ds.save() diff --git a/etl/steps/archive/meadow/gcp/2022-11-11/global_carbon_budget_fossil_co2_emissions.py b/etl/steps/archive/meadow/gcp/2022-11-11/global_carbon_budget_fossil_co2_emissions.py deleted file mode 100644 index 5c25e20b613..00000000000 --- a/etl/steps/archive/meadow/gcp/2022-11-11/global_carbon_budget_fossil_co2_emissions.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Prepare Global Carbon Budget Fossil CO2 data. - -The resulting dataset will have one table of national fossil CO2 emissions (that does not include land-use change -emissions). Bunker emissions are included as a separate country, called "International Transport". - -""" - -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table - -from etl.helpers import PathFinder -from etl.steps.data.converters import convert_walden_metadata - -# Get naming conventions. -N = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load fossil CO2 data from Walden. - emissions_ds = N.walden_dataset - # Create a dataframe with the data. - emissions_df = pd.read_csv(emissions_ds.ensure_downloaded()) - - # - # Process data. - # - # Set an appropriate index and sort conveniently. - emissions_df = emissions_df.set_index(["Country", "Year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # - # Save outputs. - # - # Create new dataset and reuse walden metadata (from any of the raw files). - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(emissions_ds) - - # Create a new table with metadata. 
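For reference, the wide-to-long reshape that prepare_emissions applies to each national sheet, reduced to a toy example (the values are made up):

import pandas as pd

# Toy national sheet: one column per country, one row per year.
df = pd.DataFrame({"year": [2019, 2020], "Afghanistan": [1.0, 1.1], "Albania": [2.0, 2.1]})

# Wide -> long: one row per (year, country) pair, with a named value column.
df = df.melt(id_vars=["year"]).rename(
    columns={"variable": "country", "value": "production_emissions"}
)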
- emissions_tb = Table(emissions_df, metadata=TableMeta(short_name=N.short_name)) - - # Ensure all columns are lower snake case. - emissions_tb = underscore_table(emissions_tb) - - # Add table to new dataset and save dataset. - ds.add(emissions_tb) - ds.save() diff --git a/etl/steps/archive/meadow/gcp/2022-11-11/global_carbon_budget_global_emissions.py b/etl/steps/archive/meadow/gcp/2022-11-11/global_carbon_budget_global_emissions.py deleted file mode 100644 index 2bb0e74bcc6..00000000000 --- a/etl/steps/archive/meadow/gcp/2022-11-11/global_carbon_budget_global_emissions.py +++ /dev/null @@ -1,97 +0,0 @@ -"""Prepare global emissions data (from one of the official excel files) of the Global Carbon Budget. - -The resulting dataset will have one table of historical global emissions, where fossil and land-use change emissions are -separate variables. Bunker fuel emissions are not included as a separate variable (but their contribution is included as -part of fossil emissions). - -""" - -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog - -from etl.steps.data.converters import convert_walden_metadata - -# Conversion factor to change from billion tonnes of carbon to tonnes of CO2. -BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e9 - -# Details of dataset(s) to be imported. -WALDEN_DATASET_NAME = "global_carbon_budget_global_emissions" -WALDEN_VERSION = "2022-11-11" -# Details of dataset to be exported. -MEADOW_VERSION = WALDEN_VERSION -MEADOW_DATASET_NAME = WALDEN_DATASET_NAME -MEADOW_TITLE = "Global Carbon Budget - Global emissions" - - -def prepare_historical_budget(df: pd.DataFrame) -> pd.DataFrame: - """Select variables and prepare the historical budget sheet of GCB's raw global data file. - - Parameters - ---------- - df : pd.DataFrame - Historical budget sheet of GCB's raw global data file. - - Returns - ------- - df : pd.DataFrame - Historical budget after selecting variables and processing them. - - """ - # Columns to select in historical budget and how to rename them. - columns = { - "Year": "year", - "fossil emissions excluding carbonation": "global_fossil_emissions", - "land-use change emissions": "global_land_use_change_emissions", - } - df = df[list(columns)].rename(columns=columns) - - # Convert units from gigatonnes of carbon per year emissions to tonnes of CO2 per year. - for column in df.drop(columns="year").columns: - df[column] *= BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - # Add column for country (to be able to combine this with the national data). - df["country"] = "World" - - # Set an index and sort row and columns conveniently. - df = df.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - return df - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load historical budget data from the global data file in walden. - global_ds = WaldenCatalog().find_one(namespace="gcp", short_name=WALDEN_DATASET_NAME, version=WALDEN_VERSION) - historical_budget_df = pd.read_excel(global_ds.ensure_downloaded(), sheet_name="Historical Budget", skiprows=15) - - # Sanity check. - error = "'Historical Budget' sheet in global data file has changed (consider changing 'skiprows')." - assert historical_budget_df.columns[0] == "Year", error - - # - # Process data. - # - # Prepare historical budget data. - historical_budget_df = prepare_historical_budget(df=historical_budget_df) - - # - # Save outputs. 
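These steps pin their hard-coded skiprows values with assertions on the first data column, so a silent layout change in the upstream Excel file fails loudly rather than shifting every row. The same pattern as a standalone sketch (the path argument is a placeholder):

import pandas as pd

def read_historical_budget(path: str) -> pd.DataFrame:
    # skiprows matches the current layout of the upstream file; the assert
    # catches the day that layout changes.
    df = pd.read_excel(path, sheet_name="Historical Budget", skiprows=15)
    error = "'Historical Budget' sheet has changed (consider changing 'skiprows')."
    assert df.columns[0] == "Year", error
    return df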
- # - # Create new dataset and reuse walden metadata (from any of the raw files). - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(global_ds) - ds.metadata.version = MEADOW_VERSION - # Create tables with metadata. - historical_budget_tb = Table( - historical_budget_df, metadata=TableMeta(short_name="historical_emissions", title="Historical emissions") - ) - # Ensure all columns are lower snake case. - historical_budget_tb = underscore_table(historical_budget_tb) - # Add table to new dataset. - ds.add(historical_budget_tb) - # Save dataset. - ds.save() diff --git a/etl/steps/archive/meadow/gcp/2022-11-11/global_carbon_budget_land_use_change_emissions.py b/etl/steps/archive/meadow/gcp/2022-11-11/global_carbon_budget_land_use_change_emissions.py deleted file mode 100644 index a69397cb39e..00000000000 --- a/etl/steps/archive/meadow/gcp/2022-11-11/global_carbon_budget_land_use_change_emissions.py +++ /dev/null @@ -1,112 +0,0 @@ -"""Prepare national land-use change emissions data (from one of the official excel files) of the Global Carbon Budget. - -""" - -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog - -from etl.steps.data.converters import convert_walden_metadata - -# Conversion factor to change from million tonnes of carbon to tonnes of CO2. -MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e6 - -# Details of dataset(s) to be imported. -WALDEN_DATASET_NAME = "global_carbon_budget_land_use_change_emissions" -WALDEN_VERSION = "2022-11-11" -# Details of dataset to be exported. -MEADOW_VERSION = WALDEN_VERSION -MEADOW_DATASET_NAME = WALDEN_DATASET_NAME -MEADOW_TITLE = "Global Carbon Budget - National land-use change emissions" - - -def prepare_land_use_emissions(land_use_df: pd.DataFrame) -> pd.DataFrame: - """Prepare data from a specific sheet of the land-use change data file. - - Parameters - ---------- - land_use_df : pd.DataFrame - Data from a specific sheet of the land-use change emissions data file. - - Returns - ------- - land_use_df : pd.DataFrame - Processed land-use change emissions data. - - """ - land_use_df = land_use_df.copy() - - # Extract quality flag from the zeroth row of the data. - # Ignore nans (which happen when a certain country has no data). - quality_flag = ( - land_use_df.drop(columns=land_use_df.columns[0]) - .loc[0] - .dropna() - .astype(int) - .to_frame("quality_flag") - .reset_index() - .rename(columns={"index": "country"}) - ) - - # Drop the first row, which is for quality factor (which we have already extracted). - land_use_df = land_use_df.rename(columns={land_use_df.columns[0]: "year"}).drop(0) - - # Ignore countries that have no data. - land_use_df = land_use_df.dropna(axis=1, how="all") - - # Restructure data to have a column for country and another for emissions. - land_use_df = land_use_df.melt(id_vars="year", var_name="country", value_name="emissions") - - error = "Countries with emissions data differ from countries with quality flag." - assert set(land_use_df["country"]) == set(quality_flag["country"]), error - - # Add quality factor as an additional column. - land_use_df = pd.merge(land_use_df, quality_flag, how="left", on="country") - - # Convert units from megatonnes of carbon per year emissions to tonnes of CO2 per year. - land_use_df["emissions"] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - # Set an index and sort row and columns conveniently. 
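The quality-flag extraction above is the least obvious transform in these steps: the first data row of the BLUE sheet is not emissions but a per-country integer flag. Reduced to a toy frame (column names are illustrative):

import pandas as pd

# Toy BLUE-style sheet: row 0 holds a per-country quality flag, later rows hold data.
df = pd.DataFrame({"Unnamed: 0": [None, 2000, 2001],
                   "Afghanistan": [1, 0.5, 0.6],
                   "Albania": [2, 0.7, 0.8]})

quality_flag = (
    df.drop(columns=df.columns[0])  # drop the year column
    .loc[0]                         # the zeroth row carries the flags
    .dropna()
    .astype(int)
    .to_frame("quality_flag")
    .reset_index()
    .rename(columns={"index": "country"})
)
# quality_flag now has one row per country: ("Afghanistan", 1), ("Albania", 2).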
- land_use_df = land_use_df.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - return land_use_df - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load national land-use change data file from walden. - land_use_ds = WaldenCatalog().find_one(namespace="gcp", short_name=WALDEN_DATASET_NAME, version=WALDEN_VERSION) - # Load production-based emissions from the national data file. - land_use_df = pd.read_excel(land_use_ds.ensure_downloaded(), sheet_name="BLUE", skiprows=7) - - # Sanity check. - error = "'BLUE' sheet in national land-use change data file has changed (consider changing 'skiprows')." - assert land_use_df.columns[1] == "Afghanistan", error - - # - # Process data. - # - # Prepare land-use change emissions data (including a column for quality flag). - land_use_df = prepare_land_use_emissions(land_use_df=land_use_df) - - # - # Save outputs. - # - # Create new dataset and reuse walden metadata (from any of the raw files). - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(land_use_ds) - ds.metadata.version = MEADOW_VERSION - # Create tables with metadata. - land_use_tb = Table( - land_use_df, - metadata=TableMeta(short_name="land_use_change_emissions", title="Land-use change emissions"), - ) - # Ensure all columns are lower snake case. - land_use_tb = underscore_table(land_use_tb) - # Add table to new dataset. - ds.add(land_use_tb) - # Save dataset. - ds.save() diff --git a/etl/steps/archive/meadow/gcp/2022-11-11/global_carbon_budget_national_emissions.py b/etl/steps/archive/meadow/gcp/2022-11-11/global_carbon_budget_national_emissions.py deleted file mode 100644 index ce66c835b04..00000000000 --- a/etl/steps/archive/meadow/gcp/2022-11-11/global_carbon_budget_national_emissions.py +++ /dev/null @@ -1,126 +0,0 @@ -"""Prepare national fossil emissions data (from one of the official excel files) of the Global Carbon Budget. - -The resulting dataset will have one table for production-based emissions, and another for consumption-based emissions. -Bunker emissions (which should be the same in both tables) is included as a separate country (called "Bunkers"). - -""" - -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog - -from etl.steps.data.converters import convert_walden_metadata - -# Conversion factor to change from million tonnes of carbon to tonnes of CO2. -MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e6 - -# Details of dataset(s) to be imported. -WALDEN_DATASET_NAME = "global_carbon_budget_national_emissions" -WALDEN_VERSION = "2022-11-11" -# Details of dataset to be exported. -MEADOW_VERSION = WALDEN_VERSION -MEADOW_DATASET_NAME = "global_carbon_budget_national_emissions" -MEADOW_TITLE = "Global Carbon Budget - National emissions" - - -def prepare_national_emissions(df: pd.DataFrame, column_name: str) -> pd.DataFrame: - """Select variables and prepare the territorial emissions (or the consumption emissions) sheet of GCB's raw national - data file. - - Parameters - ---------- - df : pd.DataFrame - Territorial emissions (or consumption emissions) sheet of GCB's raw national data file. - column_name : str - Name to assign to emissions column to be generated. - - Returns - ------- - df : pd.DataFrame - Processed territorial (or consumption) emissions sheet of GCB's raw national data file. - - """ - df = df.copy() - - # The zeroth column is expected to be year. 
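Every prepare_* function here ends with set_index(..., verify_integrity=True); besides building the index, that flag doubles as a duplicate-key check. A minimal demonstration:

import pandas as pd

df = pd.DataFrame({"country": ["World", "World"], "year": [2020, 2020], "x": [1, 2]})
try:
    df.set_index(["country", "year"], verify_integrity=True)
except ValueError as e:
    print("duplicate index rows:", e)  # fails fast on duplicate (country, year) pairs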
- df = df.rename(columns={df.columns[0]: "year"}) - - # Each column represents a country; then the final columns are regions, "Bunkers", and "Statistical Difference". - # Keep "Bunkers", but remove "Statistical Difference" (which is almost completely empty). - # In fact "Bunkers" is a global variable (I don't know why it is included at the national level), but this will be - # handled at the garden step. - - # Remove unnecessary column. - df = df.drop(columns=["Statistical Difference"]) - - # Convert from wide to long format dataframe. - df = df.melt(id_vars=["year"]).rename(columns={"variable": "country", "value": column_name}) - - # Convert units from megatonnes of carbon per year emissions to tonnes of CO2 per year. - for column in df.drop(columns=["country", "year"]).columns: - df[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - - # Set an index and sort row and columns conveniently. - df = df.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - return df - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load national data file from walden. - national_ds = WaldenCatalog().find_one(namespace="gcp", short_name=WALDEN_DATASET_NAME, version=WALDEN_VERSION) - # Load production-based emissions from the national data file. - production_emissions_df = pd.read_excel( - national_ds.ensure_downloaded(), sheet_name="Territorial Emissions", skiprows=11 - ) - - # Sanity check. - error = "'Territorial Emissions' sheet in national data file has changed (consider changing 'skiprows')." - assert production_emissions_df.columns[1] == "Afghanistan", error - - # Load consumption-based emissions from the national data file. - consumption_emissions_df = pd.read_excel( - national_ds.ensure_downloaded(), sheet_name="Consumption Emissions", skiprows=8 - ) - - # Sanity check. - error = "'Consumption Emissions' sheet in national data file has changed (consider changing 'skiprows')." - assert consumption_emissions_df.columns[1] == "Afghanistan", error - - # - # Process data. - # - # Prepare production-based and consumption-based emissions data. - production_emissions_df = prepare_national_emissions(df=production_emissions_df, column_name="production_emissions") - consumption_emissions_df = prepare_national_emissions( - df=consumption_emissions_df, column_name="consumption_emissions" - ) - - # - # Save outputs. - # - # Create new dataset and reuse walden metadata (from any of the raw files). - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(national_ds) - ds.metadata.version = MEADOW_VERSION - # Create tables with metadata. - consumption_emissions_tb = Table( - consumption_emissions_df, - metadata=TableMeta(short_name="consumption_emissions", title="Consumption-based emissions"), - ) - production_emissions_tb = Table( - production_emissions_df, - metadata=TableMeta(short_name="production_emissions", title="Production-based emissions"), - ) - # Ensure all columns are lower snake case. - consumption_emissions_tb = underscore_table(consumption_emissions_tb) - production_emissions_tb = underscore_table(production_emissions_tb) - # Add tables to new dataset. - ds.add(consumption_emissions_tb) - ds.add(production_emissions_tb) - # Save dataset. 
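The run() just shown follows the walden-era meadow skeleton that all of these deleted steps share. Condensed into one hedged sketch (the extra parameters and the short_name are placeholders; the real steps take only dest_dir and look their inputs up themselves):

import pandas as pd
from owid.catalog import Dataset, Table, TableMeta
from owid.catalog.utils import underscore_table

from etl.steps.data.converters import convert_walden_metadata

def run(dest_dir: str, walden_ds, df: pd.DataFrame) -> None:
    # Empty dataset -> metadata copied from walden -> one underscored table -> save.
    ds = Dataset.create_empty(dest_dir)
    ds.metadata = convert_walden_metadata(walden_ds)
    tb = underscore_table(Table(df, metadata=TableMeta(short_name="example_table")))
    ds.add(tb)
    ds.save()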
- ds.save() diff --git a/etl/steps/archive/meadow/gcp/2023-04-28/global_carbon_budget.py b/etl/steps/archive/meadow/gcp/2023-04-28/global_carbon_budget.py deleted file mode 100644 index 06d59474b01..00000000000 --- a/etl/steps/archive/meadow/gcp/2023-04-28/global_carbon_budget.py +++ /dev/null @@ -1,238 +0,0 @@ -"""Load a snapshot and create a meadow dataset. - -It combines the following snapshots: -- GCP's Fossil CO2 emissions (long-format csv). -- GCP's official GCB global emissions (excel file) containing global bunker fuel and land-use change emissions. -- GCP's official GCB national emissions (excel file) containing consumption-based emissions for each country. - - Production-based emissions from this file are also used, but just to include total emissions of regions - according to GCP (e.g. "Africa (GCP)") and for sanity checks. -- GCP's official GCB national land-use change emissions (excel file) with land-use change emissions for each country. - -""" - -import pandas as pd -from owid.catalog import Table -from structlog import get_logger - -from etl.helpers import PathFinder, create_dataset -from etl.snapshot import Snapshot - -# Initialize logger. -log = get_logger() - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def prepare_fossil_co2(df_fossil_co2: pd.DataFrame) -> Table: - # Set an appropriate index and sort conveniently. - df_fossil_co2 = df_fossil_co2.set_index(["Country", "Year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Create a new table and ensure all columns are snake-case. - tb_fossil_co2 = Table(df_fossil_co2, short_name="global_carbon_budget_fossil_co2_emissions", underscore=True) - - return tb_fossil_co2 - - -def prepare_historical_budget(df_historical_budget: pd.DataFrame) -> Table: - """Select variables and prepare the historical budget sheet of GCB's raw global data file. - - Parameters - ---------- - df_historical_budget : pd.DataFrame - Historical budget sheet of GCB's raw global data file. - - Returns - ------- - tb_historical_budget : Table - Historical budget after selecting variables and processing them. - - """ - # Sanity check. - error = "'Historical Budget' sheet in global data file has changed (consider changing 'skiprows')." - assert df_historical_budget.columns[0] == "Year", error - - # Columns to select in historical budget and how to rename them. - columns = { - "Year": "year", - "fossil emissions excluding carbonation": "global_fossil_emissions", - "land-use change emissions": "global_land_use_change_emissions", - } - df_historical_budget = df_historical_budget[list(columns)].rename(columns=columns) - - # Add column for country (to be able to combine this with the national data). - df_historical_budget["country"] = "World" - - # Set an index and sort row and columns conveniently. - df_historical_budget = ( - df_historical_budget.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - ) - - # Create a table with the generated data. - tb_historical_budget = Table( - df_historical_budget, short_name="global_carbon_budget_historical_budget", underscore=True - ) - - return tb_historical_budget - - -def prepare_land_use_emissions(df_land_use: pd.DataFrame) -> Table: - """Prepare data from a specific sheet of the land-use change data file. - - Parameters - ---------- - df_land_use : pd.DataFrame - Data from a specific sheet of the land-use change emissions data file. - - Returns - ------- - tb_land_use : Table - Processed land-use change emissions data. 
- - """ - df_land_use = df_land_use.copy() - - # Sanity check. - error = "'BLUE' sheet in national land-use change data file has changed (consider changing 'skiprows')." - assert df_land_use.columns[1] == "Afghanistan", error - - # Extract quality flag from the zeroth row of the data. - # Ignore nans (which happen when a certain country has no data). - quality_flag = ( - df_land_use.drop(columns=df_land_use.columns[0]) - .loc[0] - .dropna() - .astype(int) - .to_frame("quality_flag") - .reset_index() - .rename(columns={"index": "country"}) - ) - - # Drop the first row, which is for quality factor (which we have already extracted). - df_land_use = df_land_use.rename(columns={df_land_use.columns[0]: "year"}).drop(0) - - # Ignore countries that have no data. - df_land_use = df_land_use.dropna(axis=1, how="all") - - # Restructure data to have a column for country and another for emissions. - df_land_use = df_land_use.melt(id_vars="year", var_name="country", value_name="emissions") - - error = "Countries with emissions data differ from countries with quality flag." - assert set(df_land_use["country"]) == set(quality_flag["country"]), error - - # Add quality factor as an additional column. - df_land_use = pd.merge(df_land_use, quality_flag, how="left", on="country") - - # Set an index and sort row and columns conveniently. - df_land_use = df_land_use.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Create a table with the generated data. - tb_land_use = Table(df_land_use, short_name="global_carbon_budget_land_use_change", underscore=True) - - return tb_land_use - - -def prepare_national_emissions(df: pd.DataFrame, column_name: str) -> Table: - """Select variables and prepare the territorial emissions (or the consumption emissions) sheet of GCB's raw national - data file. - - Parameters - ---------- - df : pd.DataFrame - Territorial emissions (or consumption emissions) sheet of GCB's raw national data file. - column_name : str - Name to assign to emissions column to be generated. - - Returns - ------- - tb_national : Table - Processed territorial (or consumption) emissions sheet of GCB's raw national data file. - - """ - df = df.copy() - - error = f"Sheet in national data file for {column_name} has changed (consider changing 'skiprows')." - assert df.columns[1] == "Afghanistan", error - - # The zeroth column is expected to be year. - df = df.rename(columns={df.columns[0]: "year"}) - - # Each column represents a country; then the final columns are regions, "Bunkers", and "Statistical Difference". - # Keep "Bunkers", but remove "Statistical Difference" (which is almost completely empty). - # In fact "Bunkers" is a global variable (I don't know why it is included at the national level), but this will be - # handled at the garden step. - - # Remove unnecessary column. - df = df.drop(columns=["Statistical Difference"]) - - # Convert from wide to long format dataframe. - df = df.melt(id_vars=["year"]).rename(columns={"variable": "country", "value": column_name}) - - # Set an index and sort row and columns conveniently. - df = df.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Create a table with the generated data. - tb_national = Table(df, short_name=f"global_carbon_budget_{column_name}", underscore=True) - - return tb_national - - -def run(dest_dir: str) -> None: - log.info("global_carbon_budget.start") - - # - # Load inputs. - # - # Retrieve snapshots. 
- snap_fossil_co2: Snapshot = paths.load_dependency("global_carbon_budget_fossil_co2_emissions.csv") - snap_global: Snapshot = paths.load_dependency("global_carbon_budget_global_emissions.xlsx") - snap_national: Snapshot = paths.load_dependency("global_carbon_budget_national_emissions.xlsx") - snap_land_use: Snapshot = paths.load_dependency("global_carbon_budget_land_use_change_emissions.xlsx") - - # Load data from fossil CO2 emissions. - df_fossil_co2 = pd.read_csv(snap_fossil_co2.path) - - # Load historical budget from the global emissions file. - df_historical = pd.read_excel(snap_global.path, sheet_name="Historical Budget", skiprows=15) - - # Load land-use emissions. - df_land_use = pd.read_excel(snap_land_use.path, sheet_name="BLUE", skiprows=7) - - # Load production-based national emissions. - df_production = pd.read_excel(snap_national.path, sheet_name="Territorial Emissions", skiprows=11) - - # Load consumption-based national emissions. - df_consumption = pd.read_excel(snap_national.path, sheet_name="Consumption Emissions", skiprows=8) - - # - # Process data. - # - # Prepare data for fossil CO2 emissions. - tb_fossil_co2 = prepare_fossil_co2(df_fossil_co2=df_fossil_co2) - - # Prepare data for historical emissions. - tb_historical = prepare_historical_budget(df_historical_budget=df_historical) - - # Prepare data for land-use emissions. - tb_land_use = prepare_land_use_emissions(df_land_use=df_land_use) - - # Prepare data for production-based emissions, from the file of national emissions. - tb_production = prepare_national_emissions(df=df_production, column_name="production_emissions") - - # Prepare data for consumption-based emissions, from the file of national emissions. - tb_consumption = prepare_national_emissions(df=df_consumption, column_name="consumption_emissions") - - # - # Save outputs. - # - # Create a new meadow dataset with the same metadata as the snapshot. - ds_meadow = create_dataset( - dest_dir, - tables=[tb_fossil_co2, tb_historical, tb_land_use, tb_production, tb_consumption], - default_metadata=snap_fossil_co2.metadata, - ) - - # Save changes in the new garden dataset. - ds_meadow.save() - - log.info("global_carbon_budget.end") diff --git a/etl/steps/archive/meadow/gcp/2023-07-10/global_carbon_budget.py b/etl/steps/archive/meadow/gcp/2023-07-10/global_carbon_budget.py deleted file mode 100644 index ec1c906629c..00000000000 --- a/etl/steps/archive/meadow/gcp/2023-07-10/global_carbon_budget.py +++ /dev/null @@ -1,231 +0,0 @@ -"""Load a snapshot and create a meadow dataset. - -It combines the following snapshots: -- GCP's Fossil CO2 emissions (long-format csv). -- GCP's official GCB global emissions (excel file) containing global bunker fuel and land-use change emissions. -- GCP's official GCB national emissions (excel file) containing consumption-based emissions for each country. - - Production-based emissions from this file are also used, but just to include total emissions of regions - according to GCP (e.g. "Africa (GCP)") and for sanity checks. -- GCP's official GCB national land-use change emissions (excel file) with land-use change emissions for each country. - -""" - -import owid.catalog.processing as pr -from owid.catalog import Table -from structlog import get_logger - -from etl.helpers import PathFinder, create_dataset - -# Initialize logger. -log = get_logger() - -# Get paths and naming conventions for current step. 
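The save side also changed across versions: instead of Dataset.create_empty plus manual metadata copying, the later steps hand all tables to the create_dataset helper, which reuses the snapshot's metadata as the default. A minimal sketch (the snapshot name is a placeholder):

from etl.helpers import PathFinder, create_dataset

paths = PathFinder(__file__)

def run(dest_dir: str) -> None:
    snap = paths.load_snapshot("example_snapshot.csv")  # placeholder name
    tb = snap.read()
    # One call bundles the tables and reuses the snapshot's metadata as default.
    ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata)
    ds_meadow.save()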
-paths = PathFinder(__file__) - - -def prepare_fossil_co2(tb_fossil_co2: Table) -> Table: - # Set an appropriate index and sort conveniently. - tb_fossil_co2 = tb_fossil_co2.set_index(["Country", "Year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Ensure all columns are snake-case. - tb_fossil_co2 = tb_fossil_co2.underscore() - - return tb_fossil_co2 - - -def prepare_historical_budget(tb_historical: Table) -> Table: - """Select variables and prepare the historical budget sheet of GCB's raw global data file. - - Parameters - ---------- - tb_historical : Table - Historical budget sheet of GCB's raw global data file. - - Returns - ------- - tb_historical : Table - Historical budget after selecting variables and processing them. - - """ - # Sanity check. - error = "'Historical Budget' sheet in global data file has changed (consider changing 'skiprows')." - assert tb_historical.columns[0] == "Year", error - - # Columns to select in historical budget and how to rename them. - columns = { - "Year": "year", - "fossil emissions excluding carbonation": "global_fossil_emissions", - "land-use change emissions": "global_land_use_change_emissions", - } - tb_historical = tb_historical[list(columns)].rename(columns=columns) - - # Add column for country (to be able to combine this with the national data). - tb_historical["country"] = "World" - - # Set an index and sort row and columns conveniently. - tb_historical = tb_historical.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Rename table. - tb_historical.metadata.short_name = "global_carbon_budget_historical_budget" - - return tb_historical - - -def prepare_land_use_emissions(tb_land_use: Table) -> Table: - """Prepare data from a specific sheet of the land-use change data file. - - Parameters - ---------- - tb_land_use : Table - Data from a specific sheet of the land-use change emissions data file. - - Returns - ------- - tb_land_use : Table - Processed land-use change emissions data. - - """ - tb_land_use = tb_land_use.copy() - - # Sanity check. - error = "'BLUE' sheet in national land-use change data file has changed (consider changing 'skiprows')." - assert tb_land_use.columns[1] == "Afghanistan", error - - # Extract quality flag from the zeroth row of the data. - # Ignore nans (which happen when a certain country has no data). - quality_flag = ( - tb_land_use.drop(columns=tb_land_use.columns[0]) - .loc[0] - .dropna() - .astype(int) - .to_frame("quality_flag") - .reset_index() - .rename(columns={"index": "country"}) - ) - - # Drop the first row, which is for quality factor (which we have already extracted). - tb_land_use = tb_land_use.rename(columns={tb_land_use.columns[0]: "year"}).drop(0) - - # Ignore countries that have no data. - tb_land_use = tb_land_use.dropna(axis=1, how="all") - - # Restructure data to have a column for country and another for emissions. - tb_land_use = tb_land_use.melt(id_vars="year", var_name="country", value_name="emissions") - - error = "Countries with emissions data differ from countries with quality flag." - assert set(tb_land_use["country"]) == set(quality_flag["country"]), error - - # Add quality factor as an additional column. - tb_land_use = pr.merge(tb_land_use, quality_flag, how="left", on="country") - - # Copy metadata from another existing variable to the new quality flag. - tb_land_use["quality_flag"] = tb_land_use["quality_flag"].copy_metadata(tb_land_use["emissions"]) - - # Set an index and sort row and columns conveniently. 
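The quality_flag column above is derived rather than read from the file, so it starts without variable metadata; copy_metadata clones sources and licenses from a sibling column so the new variable stays traceable. As a small sketch:

from owid.catalog import Table

tb = Table({"emissions": [1.0, 2.0], "quality_flag": [1, 2]}, short_name="example")

# The derived column inherits sources/licenses from the column it was built from.
tb["quality_flag"] = tb["quality_flag"].copy_metadata(tb["emissions"])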
- tb_land_use = tb_land_use.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Rename table. - tb_land_use.metadata.short_name = "global_carbon_budget_land_use_change" - - return tb_land_use - - -def prepare_national_emissions(tb: Table, column_name: str) -> Table: - """Select variables and prepare the territorial emissions (or the consumption emissions) sheet of GCB's raw national - data file. - - Parameters - ---------- - tb : Table - Territorial emissions (or consumption emissions) sheet of GCB's raw national data file. - column_name : str - Name to assign to emissions column to be generated. - - Returns - ------- - tb_national : Table - Processed territorial (or consumption) emissions sheet of GCB's raw national data file. - - """ - tb = tb.copy() - - error = f"Sheet in national data file for {column_name} has changed (consider changing 'skiprows')." - assert tb.columns[1] == "Afghanistan", error - - # The zeroth column is expected to be year. - tb = tb.rename(columns={tb.columns[0]: "year"}) - - # Each column represents a country; then the final columns are regions, "Bunkers", and "Statistical Difference". - # Keep "Bunkers", but remove "Statistical Difference" (which is almost completely empty). - # In fact "Bunkers" is a global variable (I don't know why it is included at the national level), but this will be - # handled at the garden step. - - # Remove unnecessary column. - tb = tb.drop(columns=["Statistical Difference"]) - - # Convert from wide to long format dataframe. - tb = tb.melt(id_vars=["year"]).rename(columns={"variable": "country", "value": column_name}) - - # Set an index and sort row and columns conveniently. - tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Rename table. - tb.metadata.short_name = f"global_carbon_budget_{column_name}" - - return tb - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Retrieve snapshots. - snap_fossil_co2 = paths.load_snapshot("global_carbon_budget_fossil_co2_emissions.csv") - snap_global = paths.load_snapshot("global_carbon_budget_global_emissions.xlsx") - snap_national = paths.load_snapshot("global_carbon_budget_national_emissions.xlsx") - snap_land_use = paths.load_snapshot("global_carbon_budget_land_use_change_emissions.xlsx") - - # Load data from fossil CO2 emissions. - tb_fossil_co2 = snap_fossil_co2.read_csv() - - # Load historical budget from the global emissions file. - tb_historical = snap_global.read_excel(sheet_name="Historical Budget", skiprows=15) - - # Load land-use emissions. - tb_land_use = snap_land_use.read_excel(sheet_name="BLUE", skiprows=7) - - # Load production-based national emissions. - tb_production = snap_national.read_excel(sheet_name="Territorial Emissions", skiprows=11) - - # Load consumption-based national emissions. - tb_consumption = snap_national.read_excel(sheet_name="Consumption Emissions", skiprows=8) - - # - # Process data. - # - # Prepare data for fossil CO2 emissions. - tb_fossil_co2 = prepare_fossil_co2(tb_fossil_co2=tb_fossil_co2) - - # Prepare data for historical emissions. - tb_historical = prepare_historical_budget(tb_historical=tb_historical) - - # Prepare data for land-use emissions. - tb_land_use = prepare_land_use_emissions(tb_land_use=tb_land_use) - - # Prepare data for production-based emissions, from the file of national emissions. 
- tb_production = prepare_national_emissions(tb=tb_production, column_name="production_emissions") - - # Prepare data for consumption-based emissions, from the file of national emissions. - tb_consumption = prepare_national_emissions(tb=tb_consumption, column_name="consumption_emissions") - - # - # Save outputs. - # - # Create a new meadow dataset with the same metadata as the snapshot. - ds_meadow = create_dataset( - dest_dir, - tables=[tb_fossil_co2, tb_historical, tb_land_use, tb_production, tb_consumption], - default_metadata=snap_fossil_co2.metadata, - check_variables_metadata=True, - ) - ds_meadow.save() diff --git a/etl/steps/archive/meadow/gcp/2023-09-28/global_carbon_budget.py b/etl/steps/archive/meadow/gcp/2023-09-28/global_carbon_budget.py deleted file mode 100644 index 5b8487ec203..00000000000 --- a/etl/steps/archive/meadow/gcp/2023-09-28/global_carbon_budget.py +++ /dev/null @@ -1,230 +0,0 @@ -"""Load a snapshot and create a meadow dataset. - -It combines the following snapshots: -- GCP's Fossil CO2 emissions (long-format csv). -- GCP's official GCB global emissions (excel file) containing global bunker fuel and land-use change emissions. -- GCP's official GCB national emissions (excel file) containing consumption-based emissions for each country. - - Production-based emissions from this file are also used, but just to include total emissions of regions - according to GCP (e.g. "Africa (GCP)") and for sanity checks. -- GCP's official GCB national land-use change emissions (excel file) with land-use change emissions for each country. - -""" - -from owid.catalog import Table -from structlog import get_logger - -from etl.helpers import PathFinder, create_dataset - -# Initialize logger. -log = get_logger() - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def prepare_fossil_co2(tb_fossil_co2: Table) -> Table: - # Set an appropriate index and sort conveniently. - tb_fossil_co2 = tb_fossil_co2.set_index(["Country", "Year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Ensure all columns are snake-case. - tb_fossil_co2 = tb_fossil_co2.underscore() - - return tb_fossil_co2 - - -def prepare_historical_budget(tb_historical: Table) -> Table: - """Select variables and prepare the historical budget sheet of GCB's raw global data file. - - Parameters - ---------- - tb_historical : Table - Historical budget sheet of GCB's raw global data file. - - Returns - ------- - tb_historical : Table - Historical budget after selecting variables and processing them. - - """ - # Sanity check. - error = "'Historical Budget' sheet in global data file has changed (consider changing 'skiprows')." - assert tb_historical.columns[0] == "Year", error - - # Columns to select in historical budget and how to rename them. - columns = { - "Year": "year", - "fossil emissions excluding carbonation": "global_fossil_emissions", - "land-use change emissions": "global_land_use_change_emissions", - } - tb_historical = tb_historical[list(columns)].rename(columns=columns) - - # Add column for country (to be able to combine this with the national data). - tb_historical["country"] = "World" - - # Set an index and sort row and columns conveniently. - tb_historical = tb_historical.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Rename table. 
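One small idiom worth noting in prepare_historical_budget: a single dict drives both the column selection and the renaming, so the two can never drift apart. In isolation:

import pandas as pd

df = pd.DataFrame({"Year": [2020], "fossil emissions excluding carbonation": [9.9], "other": [0]})

columns = {
    "Year": "year",
    "fossil emissions excluding carbonation": "global_fossil_emissions",
}
# Select exactly the dict's keys, then rename them to the dict's values.
df = df[list(columns)].rename(columns=columns)
# df.columns -> ["year", "global_fossil_emissions"]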
- tb_historical.metadata.short_name = "global_carbon_budget_historical_budget" - - return tb_historical - - -def prepare_land_use_emissions(tb_land_use: Table) -> Table: - """Prepare data from a specific sheet of the land-use change data file. - - Parameters - ---------- - tb_land_use : Table - Data from a specific sheet of the land-use change emissions data file. - - Returns - ------- - tb_land_use : Table - Processed land-use change emissions data. - - """ - tb_land_use = tb_land_use.copy() - - # Sanity check. - error = "'BLUE' sheet in national land-use change data file has changed (consider changing 'skiprows')." - assert tb_land_use.columns[1] == "Afghanistan", error - - # Extract quality flag from the zeroth row of the data. - # Ignore nans (which happen when a certain country has no data). - quality_flag = ( - tb_land_use.drop(columns=tb_land_use.columns[0]) - .loc[0] - .dropna() - .astype(int) - .to_frame("quality_flag") - .reset_index() - .rename(columns={"index": "country"}) - ) - - # Drop the first row, which is for quality factor (which we have already extracted). - tb_land_use = tb_land_use.rename(columns={tb_land_use.columns[0]: "year"}).drop(0) - - # Ignore countries that have no data. - tb_land_use = tb_land_use.dropna(axis=1, how="all") - - # Restructure data to have a column for country and another for emissions. - tb_land_use = tb_land_use.melt(id_vars="year", var_name="country", value_name="emissions") - - error = "Countries with emissions data differ from countries with quality flag." - assert set(tb_land_use["country"]) == set(quality_flag["country"]), error - - # Add quality factor as an additional column. - tb_land_use = tb_land_use.merge(quality_flag, how="left", on="country") - - # Copy metadata from another existing variable to the new quality flag. - tb_land_use["quality_flag"] = tb_land_use["quality_flag"].copy_metadata(tb_land_use["emissions"]) - - # Set an index and sort row and columns conveniently. - tb_land_use = tb_land_use.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Rename table. - tb_land_use.metadata.short_name = "global_carbon_budget_land_use_change" - - return tb_land_use - - -def prepare_national_emissions(tb: Table, column_name: str) -> Table: - """Select variables and prepare the territorial emissions (or the consumption emissions) sheet of GCB's raw national - data file. - - Parameters - ---------- - tb : Table - Territorial emissions (or consumption emissions) sheet of GCB's raw national data file. - column_name : str - Name to assign to emissions column to be generated. - - Returns - ------- - tb_national : Table - Processed territorial (or consumption) emissions sheet of GCB's raw national data file. - - """ - tb = tb.copy() - - error = f"Sheet in national data file for {column_name} has changed (consider changing 'skiprows')." - assert tb.columns[1] == "Afghanistan", error - - # The zeroth column is expected to be year. - tb = tb.rename(columns={tb.columns[0]: "year"}) - - # Each column represents a country; then the final columns are regions, "Bunkers", and "Statistical Difference". - # Keep "Bunkers", but remove "Statistical Difference" (which is almost completely empty). - # In fact "Bunkers" is a global variable (I don't know why it is included at the national level), but this will be - # handled at the garden step. - - # Remove unnecessary column. - tb = tb.drop(columns=["Statistical Difference"]) - - # Convert from wide to long format dataframe. 
- tb = tb.melt(id_vars=["year"]).rename(columns={"variable": "country", "value": column_name}) - - # Set an index and sort row and columns conveniently. - tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Rename table. - tb.metadata.short_name = f"global_carbon_budget_{column_name}" - - return tb - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Retrieve snapshots. - snap_fossil_co2 = paths.load_snapshot("global_carbon_budget_fossil_co2_emissions.csv") - snap_global = paths.load_snapshot("global_carbon_budget_global_emissions.xlsx") - snap_national = paths.load_snapshot("global_carbon_budget_national_emissions.xlsx") - snap_land_use = paths.load_snapshot("global_carbon_budget_land_use_change_emissions.xlsx") - - # Load data from fossil CO2 emissions. - tb_fossil_co2 = snap_fossil_co2.read() - - # Load historical budget from the global emissions file. - tb_historical = snap_global.read(sheet_name="Historical Budget", skiprows=15) - - # Load land-use emissions. - tb_land_use = snap_land_use.read(sheet_name="BLUE", skiprows=7) - - # Load production-based national emissions. - tb_production = snap_national.read(sheet_name="Territorial Emissions", skiprows=11) - - # Load consumption-based national emissions. - tb_consumption = snap_national.read(sheet_name="Consumption Emissions", skiprows=8) - - # - # Process data. - # - # Prepare data for fossil CO2 emissions. - tb_fossil_co2 = prepare_fossil_co2(tb_fossil_co2=tb_fossil_co2) - - # Prepare data for historical emissions. - tb_historical = prepare_historical_budget(tb_historical=tb_historical) - - # Prepare data for land-use emissions. - tb_land_use = prepare_land_use_emissions(tb_land_use=tb_land_use) - - # Prepare data for production-based emissions, from the file of national emissions. - tb_production = prepare_national_emissions(tb=tb_production, column_name="production_emissions") - - # Prepare data for consumption-based emissions, from the file of national emissions. - tb_consumption = prepare_national_emissions(tb=tb_consumption, column_name="consumption_emissions") - - # - # Save outputs. - # - # Create a new meadow dataset with the same metadata as the snapshot. - ds_meadow = create_dataset( - dest_dir, - tables=[tb_fossil_co2, tb_historical, tb_land_use, tb_production, tb_consumption], - default_metadata=snap_fossil_co2.metadata, - check_variables_metadata=True, - ) - ds_meadow.save() diff --git a/etl/steps/archive/meadow/gcp/2023-12-05/global_carbon_budget.py b/etl/steps/archive/meadow/gcp/2023-12-05/global_carbon_budget.py deleted file mode 100644 index d04b6ced74b..00000000000 --- a/etl/steps/archive/meadow/gcp/2023-12-05/global_carbon_budget.py +++ /dev/null @@ -1,233 +0,0 @@ -"""Load a snapshot and create a meadow dataset. - -It combines the following snapshots: -- GCP's Fossil CO2 emissions (long-format csv). -- GCP's official GCB global emissions (excel file) containing global bunker fuel and land-use change emissions. -- GCP's official GCB national emissions (excel file) containing consumption-based emissions for each country. - - Production-based emissions from this file are also used, but just to include total emissions of regions - according to GCP (e.g. "Africa (GCP)") and for sanity checks. -- GCP's official GCB national land-use change emissions (excel file) with land-use change emissions for each country. - -""" - -from owid.catalog import Table -from structlog import get_logger - -from etl.helpers import PathFinder, create_dataset - -# Initialize logger. 
-log = get_logger() - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def prepare_fossil_co2(tb_fossil_co2: Table) -> Table: - # Set an appropriate index and sort conveniently. - tb_fossil_co2 = tb_fossil_co2.set_index(["Country", "Year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Ensure all columns are snake-case. - tb_fossil_co2 = tb_fossil_co2.underscore() - - return tb_fossil_co2 - - -def prepare_historical_budget(tb_historical: Table) -> Table: - """Select variables and prepare the historical budget sheet of GCB's raw global data file. - - Parameters - ---------- - tb_historical : Table - Historical budget sheet of GCB's raw global data file. - - Returns - ------- - tb_historical : Table - Historical budget after selecting variables and processing them. - - """ - # Sanity check. - error = "'Historical Budget' sheet in global data file has changed (consider changing 'skiprows')." - assert tb_historical.columns[0] == "Year", error - - # Columns to select in historical budget and how to rename them. - columns = { - "Year": "year", - "fossil emissions excluding carbonation": "global_fossil_emissions", - "land-use change emissions": "global_land_use_change_emissions", - } - tb_historical = tb_historical[list(columns)].rename(columns=columns) - - # Add column for country (to be able to combine this with the national data). - tb_historical["country"] = "World" - - # Set an index and sort row and columns conveniently. - tb_historical = tb_historical.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Rename table. - tb_historical.metadata.short_name = "global_carbon_budget_historical_budget" - - return tb_historical - - -def prepare_land_use_emissions(tb_land_use: Table) -> Table: - """Prepare data from a specific sheet of the land-use change data file. - - Parameters - ---------- - tb_land_use : Table - Data from a specific sheet of the land-use change emissions data file. - - Returns - ------- - tb_land_use : Table - Processed land-use change emissions data. - - """ - tb_land_use = tb_land_use.copy() - - # Sanity check. - error = "'BLUE' sheet in national land-use change data file has changed (consider changing 'skiprows')." - assert tb_land_use.columns[1] == "Afghanistan", error - - # Extract quality flag from the zeroth row of the data. - # Ignore nans (which happen when a certain country has no data). - quality_flag = ( - tb_land_use.drop(columns=tb_land_use.columns[0]) - .loc[0] - .dropna() - .astype(int) - .to_frame("quality_flag") - .reset_index() - .rename(columns={"index": "country"}) - ) - - # Drop the first row, which is for quality factor (which we have already extracted). - tb_land_use = tb_land_use.rename(columns={tb_land_use.columns[0]: "year"}).drop(0) - - # Ignore countries that have no data. - tb_land_use = tb_land_use.dropna(axis=1, how="all") - - # Remove rows that are either empty, or have some other additional operation (e.g. 2013-2022). - tb_land_use = tb_land_use[tb_land_use["year"].astype(str).str.match(r"^\d{4}$")].reset_index(drop=True) - - # Restructure data to have a column for country and another for emissions. - tb_land_use = tb_land_use.melt(id_vars="year", var_name="country", value_name="emissions") - - error = "Countries with emissions data differ from countries with quality flag." - assert set(tb_land_use["country"]) == set(quality_flag["country"]), error - - # Add quality factor as an additional column. 
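The 2023-12-05 revision adds one guard the earlier versions lacked: the BLUE sheet gained footer rows (for example, multi-year ranges such as "2013-2022"), which a strict four-digit-year regex filters out before the melt. On a toy column:

import pandas as pd

df = pd.DataFrame({"year": [2020, 2021, "2013-2022", None], "emissions": [1.0, 2.0, 9.9, None]})

# Keep rows whose year is exactly four digits; drops ranges like "2013-2022" and blanks.
df = df[df["year"].astype(str).str.match(r"^\d{4}$")].reset_index(drop=True)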
- tb_land_use = tb_land_use.merge(quality_flag, how="left", on="country") - - # Copy metadata from another existing variable to the new quality flag. - tb_land_use["quality_flag"] = tb_land_use["quality_flag"].copy_metadata(tb_land_use["emissions"]) - - # Set an index and sort row and columns conveniently. - tb_land_use = tb_land_use.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Rename table. - tb_land_use.metadata.short_name = "global_carbon_budget_land_use_change" - - return tb_land_use - - -def prepare_national_emissions(tb: Table, column_name: str) -> Table: - """Select variables and prepare the territorial emissions (or the consumption emissions) sheet of GCB's raw national - data file. - - Parameters - ---------- - tb : Table - Territorial emissions (or consumption emissions) sheet of GCB's raw national data file. - column_name : str - Name to assign to emissions column to be generated. - - Returns - ------- - tb_national : Table - Processed territorial (or consumption) emissions sheet of GCB's raw national data file. - - """ - tb = tb.copy() - - error = f"Sheet in national data file for {column_name} has changed (consider changing 'skiprows')." - assert tb.columns[1] == "Afghanistan", error - - # The zeroth column is expected to be year. - tb = tb.rename(columns={tb.columns[0]: "year"}) - - # Each column represents a country; then the final columns are regions, "Bunkers", and "Statistical Difference". - # Keep "Bunkers", but remove "Statistical Difference" (which is almost completely empty). - # In fact "Bunkers" is a global variable (I don't know why it is included at the national level), but this will be - # handled at the garden step. - - # Remove unnecessary column. - tb = tb.drop(columns=["Statistical Difference"]) - - # Convert from wide to long format dataframe. - tb = tb.melt(id_vars=["year"]).rename(columns={"variable": "country", "value": column_name}) - - # Set an index and sort row and columns conveniently. - tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Rename table. - tb.metadata.short_name = f"global_carbon_budget_{column_name}" - - return tb - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Retrieve snapshots. - snap_fossil_co2 = paths.load_snapshot("global_carbon_budget_fossil_co2_emissions.csv") - snap_global = paths.load_snapshot("global_carbon_budget_global_emissions.xlsx") - snap_national = paths.load_snapshot("global_carbon_budget_national_emissions.xlsx") - snap_land_use = paths.load_snapshot("global_carbon_budget_land_use_change_emissions.xlsx") - - # Load data from fossil CO2 emissions. - tb_fossil_co2 = snap_fossil_co2.read() - - # Load historical budget from the global emissions file. - tb_historical = snap_global.read(sheet_name="Historical Budget", skiprows=15) - - # Load land-use emissions. - tb_land_use = snap_land_use.read(sheet_name="BLUE", skiprows=7) - - # Load production-based national emissions. - tb_production = snap_national.read(sheet_name="Territorial Emissions", skiprows=11) - - # Load consumption-based national emissions. - tb_consumption = snap_national.read(sheet_name="Consumption Emissions", skiprows=8) - - # - # Process data. - # - # Prepare data for fossil CO2 emissions. - tb_fossil_co2 = prepare_fossil_co2(tb_fossil_co2=tb_fossil_co2) - - # Prepare data for historical emissions. - tb_historical = prepare_historical_budget(tb_historical=tb_historical) - - # Prepare data for land-use emissions. 
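The loading code in this final archived version is also the tersest: snap.read() replaces the earlier pd.read_csv(snap.path) and snap.read_csv()/read_excel() calls, and appears to dispatch on the file extension while returning a metadata-carrying Table; that dispatch behaviour is an inference from the calls shown here, not documented in this diff. A sketch under that assumption:

from etl.helpers import PathFinder

paths = PathFinder(__file__)

# CSV snapshot: read() takes no extra arguments.
tb_fossil_co2 = paths.load_snapshot("global_carbon_budget_fossil_co2_emissions.csv").read()

# Excel snapshot: sheet_name/skiprows pass through to the underlying reader.
snap_global = paths.load_snapshot("global_carbon_budget_global_emissions.xlsx")
tb_historical = snap_global.read(sheet_name="Historical Budget", skiprows=15)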
- tb_land_use = prepare_land_use_emissions(tb_land_use=tb_land_use) - - # Prepare data for production-based emissions, from the file of national emissions. - tb_production = prepare_national_emissions(tb=tb_production, column_name="production_emissions") - - # Prepare data for consumption-based emissions, from the file of national emissions. - tb_consumption = prepare_national_emissions(tb=tb_consumption, column_name="consumption_emissions") - - # - # Save outputs. - # - # Create a new meadow dataset with the same metadata as the snapshot. - ds_meadow = create_dataset( - dest_dir, - tables=[tb_fossil_co2, tb_historical, tb_land_use, tb_production, tb_consumption], - default_metadata=snap_fossil_co2.metadata, - check_variables_metadata=True, - ) - ds_meadow.save() diff --git a/etl/steps/archive/meadow/health/2022-12-28/deaths_karlinsky.py b/etl/steps/archive/meadow/health/2022-12-28/deaths_karlinsky.py deleted file mode 100644 index 2f57494057d..00000000000 --- a/etl/steps/archive/meadow/health/2022-12-28/deaths_karlinsky.py +++ /dev/null @@ -1,37 +0,0 @@ -import pandas as pd -from owid.catalog import Dataset, Table -from structlog import get_logger - -from etl.helpers import PathFinder -from etl.snapshot import Snapshot -from etl.steps.data.converters import convert_snapshot_metadata - -log = get_logger() - -# naming conventions -N = PathFinder(__file__) -SNAPSHOT_DATASET = "health/2022-12-28/deaths_karlinsky.csv" -MEADOW_VERSION = N.version - - -def run(dest_dir: str) -> None: - log.info("deaths_karlinsky.start") - - # retrieve snapshot - snap = Snapshot(SNAPSHOT_DATASET) - df = pd.read_csv(snap.path) - - # create new dataset and reuse walden metadata - ds = Dataset.create_empty(dest_dir, metadata=convert_snapshot_metadata(snap.metadata)) - ds.metadata.version = MEADOW_VERSION - - # create table with metadata from dataframe and underscore all columns - tb = Table(df, short_name="deaths", underscore=True) - - # add table to a dataset - ds.add(tb) - - # finally save the dataset - ds.save() - - log.info("deaths_karlinsky.end") diff --git a/etl/steps/archive/meadow/hmd/2022-11-04/life_tables.py b/etl/steps/archive/meadow/hmd/2022-11-04/life_tables.py deleted file mode 100644 index 7ac5487218e..00000000000 --- a/etl/steps/archive/meadow/hmd/2022-11-04/life_tables.py +++ /dev/null @@ -1,340 +0,0 @@ -"""Imports Life Tables dataset to Meadow. - -This dataset is from Human Mortality Database. - -The source data provides a zip which contains 6 folders. Each folder contains a TXT file per country (not great). -This step generates a dataset with 6 tables, one for each folder. Each table contains the data from all TXT files. 
-"""
-import os
-import re
-import tempfile
-from glob import glob
-from io import StringIO
-from typing import List, cast
-
-import pandas as pd
-from owid import catalog
-from owid.catalog import Dataset, Table, TableMeta
-from owid.catalog.utils import underscore_table
-from owid.datautils.io import decompress_file
-from owid.walden import Catalog as WaldenCatalog
-from owid.walden.catalog import Dataset as WaldenDataset
-from structlog import get_logger
-
-from etl.helpers import PathFinder
-from etl.steps.data.converters import convert_walden_metadata
-
-log = get_logger()
-
-# naming conventions
-N = PathFinder(__file__)
-
-
-# Files expected once Walden file is uncompressed
-FILES_EXPECTED = ["bltper_1x1", "bltper_1x10", "bltper_1x5", "bltper_5x1", "bltper_5x10", "bltper_5x5"]
-# Regular expression to extract relevant fields from file (used in make_df)
-FILE_REGEX = (
-    r"([a-zA-Z\-\s,]+), Life tables \(period {table}\), Total\tLast modified: (\d+ [a-zA-Z]{{3}} \d+); Methods"
-    r" Protocol: v\d+ \(\d+\)\n\n((?s:.)*)"
-)
-# Dataset details from Walden
-NAMESPACE = N.namespace
-SHORT_NAME = N.short_name
-VERSION_WALDEN = "2022-11-04"
-# Meadow version
-VERSION_MEADOW = N.version
-# Column renaming
-COLUMNS_RENAME = {
-    "Country": "country",
-    "Year": "year",
-    "Age": "age",
-    "mx": "central_death_rate",
-    "qx": "probability_of_death",
-    "ax": "avg_survival_length",
-    "lx": "num_survivors",
-    "dx": "num_deaths",
-    "Lx": "num_person_years_lived",
-    "Tx": "num_person_years_remaining",
-    "ex": "life_expectancy",
-}
-# Column dtypes
-DTYPES = {
-    "central_death_rate": "Float64",
-    "probability_of_death": "Float64",
-    "avg_survival_length": "Float64",
-    "num_survivors": "Int64",
-    "num_deaths": "Int64",
-    "num_person_years_lived": "Int64",
-    "num_person_years_remaining": "Int64",
-    "life_expectancy": "Float64",
-    "age": "category",
-    "country": "category",
-    "year": "category",
-}
-
-
-def run(dest_dir: str) -> None:
-    """Run step."""
-    log.info("hmd_lt.start")
-
-    # Retrieve raw data from walden
-    walden_ds = WaldenCatalog().find_one(namespace=NAMESPACE, short_name=SHORT_NAME, version=VERSION_WALDEN)
-    local_file = walden_ds.ensure_downloaded()
-
-    # Create new dataset and reuse walden metadata
-    ds = init_meadow_dataset(dest_dir, walden_ds)
-
-    # Create and add tables to dataset
-    ds = create_and_add_tables_to_dataset(local_file, ds, walden_ds)
-
-    # Save the dataset
-    ds.save()
-    log.info("hmd_lt.end")
-
-
-def init_meadow_dataset(dest_dir: str, walden_ds: WaldenDataset) -> Dataset:
-    """Initialize meadow dataset."""
-    ds = Dataset.create_empty(dest_dir)
-    ds.metadata = convert_walden_metadata(walden_ds)
-    ds.metadata.version = VERSION_MEADOW
-    return ds
-
-
-def create_and_add_tables_to_dataset(local_file: str, ds: Dataset, walden_ds: WaldenDataset) -> Dataset:
-    """Create and add tables to dataset.
-
-    This method creates tables for all the folders found once `local_file` is uncompressed. Then,
-    it cleans and adds them to the dataset `ds`. It uses the metadata from `walden_ds` to create the tables.
-
-    Parameters
-    ----------
-    local_file : str
-        Path to the raw Walden file.
-    ds : Dataset
-        Dataset where tables should be added.
-    walden_ds : WaldenDataset
-        Walden dataset.
-
-    Returns
-    -------
-    Dataset
-        Dataset with tables
-    """
-    with tempfile.TemporaryDirectory() as tmp_dir:
-        # Decompress files
-        decompress_file(local_file, tmp_dir)
-
-        # Check available files
-        _sanity_check_files(tmp_dir)
-
-        # Load data
-        for age in [1, 5]:
-            for year in [1, 5, 10]:
-                # Create table
-                log.info(f"Creating table for {age}-year age groups and {year}-year intervals...")
-                table = make_table(tmp_dir, age, year, walden_ds)
-                # set index
-                table = table.set_index(["country", "year", "age"], verify_integrity=True)
-                # add table to a dataset
-                log.info("Adding table to dataset...")
-                ds.add(table)
-    return ds
-
-
-def _sanity_check_files(path: str) -> None:
-    """Checks that all required files are present once zip is uncompressed."""
-    files_found = sorted(os.listdir(path))
-    assert (
-        files_found == FILES_EXPECTED
-    ), f"Files found are not the ones expected! Check that {FILES_EXPECTED} are actually there!"
-
-
-def make_table(input_folder: str, age: int, year: int, walden_ds: WaldenDataset) -> catalog.Table:
-    """Create table.
-
-    Loads data from `input_folder` and creates a table with the name `table_name`. It uses the metadata from `walden_ds`.
-
-    Parameters
-    ----------
-    input_folder : str
-        Folder containing uncompressed data from Walden.
-    age: int
-        Age group size (1 or 5).
-    year: int
-        Year interval size (1, 5 or 10).
-    walden_ds : WaldenDataset
-        Walden dataset.
-
-    Returns
-    -------
-    catalog.Table
-        Table with data.
-    """
-    # Load files
-    table_name = _table_name(age, year)
-    f = f"bltper_{table_name}"
-    files = glob(os.path.join(input_folder, f"{f}/*.txt"))
-    log.info(f"Looking for available files in {f}...")
-    # Create df
-    df = make_df(files, table_name)
-    # Clean df
-    df = clean_df(df)
-    # df to table
-    table = df_to_table(walden_ds, age, year, df)
-    # underscore all table columns
-    table = underscore_table(table)
-    return table
-
-
-def _table_name(age: int, year: int) -> str:
-    return f"{age}x{year}"
-
-
-def make_df(files: List[str], table_name: str) -> pd.DataFrame:
-    """Create dataframe.
-
-    Parameters
-    ----------
-    files : List[str]
-        Files to load and extract data from. There is a file per country. Within the file, data is found
-        along with country name. Note that sometimes, the country name can be like "New Zealand - Non-Maori".
-    table_name : str
-        Name of the table.
-
-    Returns
-    -------
-    pd.DataFrame
-        Dataframe with the life table data from all country files.
-    """
-    log.info("Creating dataframe from files...")
-    regex = FILE_REGEX.format(table=table_name)
-    dfs = []
-    # Load each file
-    for f in files:
-        with open(f, "r") as f:
-            text = f.read()
-        # Get relevant fields
-        match = re.search(regex, text)
-        if match is not None:
-            country, _, table_str = match.group(1, 2, 3)
-        else:
-            raise ValueError("No match found! Please verify that the source files' content matches FILE_REGEX.")
-        # Build country df
-        df_ = _make_df_country(country, table_str)
-        dfs.append(df_)
-    # Concatenate all country dfs
-    df = pd.concat(dfs, ignore_index=True)
-    return cast(pd.DataFrame, df)
-
-
-def _make_df_country(country: str, table: str) -> pd.DataFrame:
-    """Create dataframe for individual country."""
-    # Remove starting/ending spaces
-    table = table.strip()
-    # Remove spacing after newline
-    table = re.sub(r"\n\s+", "\n", table)
-    # Replace spacing with tabs
-    table = re.sub(r"[^\S\r\n]+", "\t", table)
-    # Build df
-    df = pd.read_csv(StringIO(table), sep="\t")
-    # Assign country
-    df = df.assign(Country=country)
-    # # Filter columns
-    # df = df[["Country", "Year", "ex"]].rename(columns={"ex": "life_expectancy"})
-    return df
-
-
-def clean_df(df: pd.DataFrame) -> pd.DataFrame:
-    """Clean dataframe.
-
-    Orders columns, renames columns, checks for missing values and sets dtypes.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Initial dataframe.
-
-    Returns
-    -------
-    pd.DataFrame
-        Cleaned dataframe.
-    """
-    log.info("Cleaning dataframe...")
-    # Order columns
-    cols_first = ["Country", "Year"]
-    cols = cols_first + [col for col in df.columns if col not in cols_first]
-    df = df[cols]
-    # Rename columns
-    df = _clean_rename_columns_df(df)
-    # Correct missing data
-    df = _clean_correct_missing_data(df)
-    # Set dtypes
-    df = _clean_set_dtypes_df(df)
-    return df
-
-
-def _clean_rename_columns_df(df: pd.DataFrame) -> pd.DataFrame:
-    """Rename columns."""
-    return df.rename(columns=COLUMNS_RENAME)
-
-
-def _clean_correct_missing_data(df: pd.DataFrame) -> pd.DataFrame:
-    """Checks on missing data."""
-    # Expected missing data
-    num_rows_missing_expected = 0.01
-    countries_missing_data_expected = 1
-    # Find missing data
-    rows_missing = df[df.central_death_rate == "."]
-    num_rows_missing = len(rows_missing) / len(df)
-    countries_missing_data = rows_missing.country.unique()
-    # Run checks
-    assert num_rows_missing < num_rows_missing_expected, (
-        f"More missing data than expected was found! {round(num_rows_missing*100, 2)} rows missing, but"
-        f" {round(num_rows_missing_expected*100,2)} were expected."
-    )
-    assert len(countries_missing_data) <= countries_missing_data_expected, (
-        f"More missing data than expected was found! Found {len(countries_missing_data)} countries, expected is"
-        f" {countries_missing_data_expected}. Check {countries_missing_data}!"
-    )
-    # Correct
-    df = df.replace(".", pd.NA)
-    return df
-
-
-def _clean_set_dtypes_df(df: pd.DataFrame) -> pd.DataFrame:
-    """Set dtypes."""
-    # Numeric
-    cols_numeric = [col_name for col_name, dtype in DTYPES.items() if dtype in ["Int64", "Float64"]]
-    for col in cols_numeric:
-        df[col] = pd.to_numeric(df[col])
-    return df.astype(DTYPES)
-
-
-def df_to_table(walden_ds: WaldenDataset, age: int, year: int, df: pd.DataFrame) -> catalog.Table:
-    """Convert plain pandas.DataFrame into table.
-
-    Parameters
-    ----------
-    walden_ds : WaldenDataset
-        Raw Walden dataset.
-    age: int
-        Age group size (1 or 5).
-    year: int
-        Year interval size (1, 5 or 10).
-    df : pd.DataFrame
-        Dataframe.
-
-    Returns
-    -------
-    catalog.Table
-        Table created from dataframe, walden metadata and table name.
- """ - table_name = _table_name(age, year) - # create table with metadata from dataframe - table_metadata = TableMeta( - short_name=f"period_{age}x{year}", - title=f"{walden_ds.name} [{table_name}]", - description=f"Contains data in {age}-year age groups grouped in {year}-year intervals.", - ) - tb = Table(df, metadata=table_metadata) - return tb diff --git a/etl/steps/archive/meadow/homicide/2023-01-04/unodc.meta.yml b/etl/steps/archive/meadow/homicide/2023-01-04/unodc.meta.yml deleted file mode 100644 index 0adbc5cafc4..00000000000 --- a/etl/steps/archive/meadow/homicide/2023-01-04/unodc.meta.yml +++ /dev/null @@ -1,58 +0,0 @@ -dataset: - namespace: homicide - short_name: unodc - title: United Nations Office on Drugs and Crime - Intentional Homicide Victims - description: '' - licenses: - - url: https://www.un.org/en/about-us/terms-of-use - version: '2023-01-04' - sources: - - name: United Nations Office on Drugs and Crime (2022) - url: https://dataunodc.un.org/dp-intentional-homicide-victims - source_data_url: https://dataunodc.un.org/sites/dataunodc.un.org/files/data_cts_intentional_homicide.xlsx - date_accessed: '2023-01-04' - publication_date: '2022-08-08' - publication_year: 2022 -tables: - unodc: - variables: - region: - title: Region - unit: '' - short_unit: '' - subregion: - title: Subregion - unit: '' - short_unit: '' - indicator: - title: Indicator - unit: '' - short_unit: '' - dimension: - title: Dimension - unit: '' - short_unit: '' - category: - title: Category - unit: '' - short_unit: '' - sex: - title: Sex - unit: '' - short_unit: '' - age: - title: Age - unit: '' - short_unit: '' - unit_of_measurement: - title: Unit of measurement - unit: '' - short_unit: '' - value: - title: VALUE - unit: '' - short_unit: '' - source: - title: Source - unit: '' - short_unit: '' diff --git a/etl/steps/archive/meadow/homicide/2023-01-04/unodc.py b/etl/steps/archive/meadow/homicide/2023-01-04/unodc.py deleted file mode 100644 index ac3b8f275cb..00000000000 --- a/etl/steps/archive/meadow/homicide/2023-01-04/unodc.py +++ /dev/null @@ -1,62 +0,0 @@ -import pandas as pd -from owid.catalog import Dataset, Table -from structlog import get_logger - -from etl.helpers import PathFinder -from etl.snapshot import Snapshot -from etl.steps.data.converters import convert_snapshot_metadata - -log = get_logger() - -# naming conventions -N = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - log.info("unodc.start") - - # retrieve snapshot - snap = Snapshot("homicide/2023-01-04/unodc.xlsx") - df = pd.read_excel(snap.path, skiprows=2) - - # clean and transform data - df = clean_data(df) - - # reset index so the data can be saved in feather format - df = df.reset_index().drop(columns="index") - - # create new dataset and reuse walden metadata - ds = Dataset.create_empty(dest_dir, metadata=convert_snapshot_metadata(snap.metadata)) - ds.metadata.version = "2023-01-04" - - # # create table with metadata from dataframe and underscore all columns - tb = Table(df, short_name=snap.metadata.short_name, underscore=True) - - # add table to a dataset - ds.add(tb) - - # update metadata - ds.update_metadata(N.metadata_path) - - # finally save the dataset - ds.save() - - log.info("unodc.end") - - -def clean_data(df: pd.DataFrame) -> pd.DataFrame: - df = df[ - (df["Dimension"].isin(["Total", "by mechanisms"])) - & ( - df["Indicator"].isin( - ["Victims of intentional homicide", "Victims of Intentional Homicide - Regional Estimate"] - ) - ) - ] - df = df.rename( - columns={ - "Country": "country", - "Year": "year", - 
}
-    ).drop(columns=["Iso3_code"])
-    return df
diff --git a/etl/steps/archive/meadow/homicide/2023-01-27/unodc.meta.yml b/etl/steps/archive/meadow/homicide/2023-01-27/unodc.meta.yml
deleted file mode 100644
index 81c3a4a1c92..00000000000
--- a/etl/steps/archive/meadow/homicide/2023-01-27/unodc.meta.yml
+++ /dev/null
@@ -1,87 +0,0 @@
-dataset:
-  title: United Nations Office on Drugs and Crime - Intentional Homicides (2022)
-  description: >
-    The United Nations Office on Drugs and Crime Intentional Homicide data are sourced from either criminal justice or public health systems. In the former, data are generated by law enforcement or criminal justice authorities in the process of recording and investigating a crime event, whereas in the latter, data are produced by health authorities certifying the cause of death of an individual.
-
-    The criminal justice data was collected from national authorities with the annual United Nations Survey of Crime Trends and Operations of Criminal Justice Systems (UN-CTS). National focal points working in national agencies responsible for statistics on crime and the criminal justice system and nominated by the Permanent Mission to UNODC are responsible for compiling the data from the other relevant agencies before transmitting the UN-CTS to UNODC. Following the submission, UNODC checks for consistency and coherence with other data sources.
-
-    Data on homicide from public health sources were primarily obtained from the WHO Mortality Database. This dataset is a comprehensive collection of mortality data by cause of death, sex, and age group conducted yearly by the WHO with Member States. Deaths coded with International Classification of Diseases (ICD10) codes X85-Y09 (injuries inflicted by another person with intent to injure or kill), and ICD10 code Y87.1 (sequelae of assault), generally correspond to the definition of intentional homicide.
-
-    The population data used to calculate homicide rates is sourced from the World Population Prospects, Population Division, United Nations Department of Economic and Social Affairs.
-
-    The statistical definition contains three elements that characterize the killing of a person as “intentional homicide”:
-
-    1. The killing of a person by another person (objective element).
-
-    2. The intent of the perpetrator to kill or seriously injure the victim (subjective element).
-
-    3. The unlawfulness of the killing (legal element).
-
-    For recording purposes, all killings that meet the criteria listed above are to be considered intentional homicides, irrespective of definitions provided by national legislations or practices. Killings as a result of terrorist activities are also to be classified as a form of intentional homicide.
-
-    In order to compile consistent time series of total homicides back to 1990, in several cases data from multiple sources were combined to expand the number of available years within a country’s time series. Time series adjustments were performed when a country had two sources covering different year-ranges, which had very similar trends in an overlapping time period, but where these trends were at different levels.
-
-    The countries for which adjusted series for total homicide counts prior to the year 2000 have been produced were the following: Belgium, Brazil, China, Ecuador, Germany, Netherlands, New Zealand, Portugal, South Korea, Spain, Thailand, and United Kingdom.
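The time-series adjustment described above can be sketched in pandas. This is a minimal editorial illustration, not code from this repository; the series values and the ratio-scaling rule are hypothetical:

    import pandas as pd

    # Two hypothetical homicide-count series from different sources, overlapping in 2000.
    old = pd.Series({1990: 80, 1995: 90, 2000: 100})
    new = pd.Series({2000: 120, 2005: 130, 2010: 125})

    # Rescale the older series to the newer one's level over the overlapping years,
    # then keep the newer source wherever both are available.
    overlap = old.index.intersection(new.index)
    factor = new[overlap].mean() / old[overlap].mean()
    spliced = pd.concat([old[~old.index.isin(new.index)] * factor, new]).sort_index()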
- - licenses: - - url: https://www.un.org/en/about-us/terms-of-use - sources: - - name: United Nations Office on Drugs and Crime (2022) - url: https://dataunodc.un.org/dp-intentional-homicide-victims - source_data_url: https://dataunodc.un.org/sites/dataunodc.un.org/files/data_cts_intentional_homicide.xlsx - date_accessed: '2023-01-04' - publication_date: '2022-08-08' - publication_year: 2022 -tables: - unodc: - variables: - region: - title: Region - short_unit: '' - unit: '' - subregion: - title: Subregion - short_unit: '' - unit: '' - indicator: - title: Indicator - short_unit: '' - unit: '' - dimension: - title: Dimension - short_unit: '' - unit: '' - category: - title: Category - short_unit: '' - unit: '' - sex: - title: Sex - short_unit: '' - unit: '' - age: - title: Age - short_unit: '' - unit: '' - unit_of_measurement: - title: Unit of measurement - short_unit: '' - unit: '' - value: - title: VALUE - short_unit: '' - unit: '' - source: - title: Source - short_unit: '' - unit: '' diff --git a/etl/steps/archive/meadow/homicide/2023-01-27/unodc.py b/etl/steps/archive/meadow/homicide/2023-01-27/unodc.py deleted file mode 100644 index efbe2cec41f..00000000000 --- a/etl/steps/archive/meadow/homicide/2023-01-27/unodc.py +++ /dev/null @@ -1,64 +0,0 @@ -import pandas as pd -from owid.catalog import Dataset, Table -from structlog import get_logger - -from etl.helpers import PathFinder -from etl.snapshot import Snapshot -from etl.steps.data.converters import convert_snapshot_metadata - -log = get_logger() - -# naming conventions -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - log.info("unodc.start") - - # retrieve snapshot - snap: Snapshot = paths.load_dependency("unodc.xlsx") - - # Snapshot("homicide/2023-01-04/unodc.xlsx") - df = pd.read_excel(snap.path, skiprows=2) - - # clean and transform data - df = clean_data(df) - - # reset index so the data can be saved in feather format - df = df.reset_index().drop(columns="index") - - # create new dataset and reuse walden metadata - ds = Dataset.create_empty(dest_dir, metadata=convert_snapshot_metadata(snap.metadata)) - ds.metadata.version = paths.version - - # # create table with metadata from dataframe and underscore all columns - tb = Table(df, short_name=snap.metadata.short_name, underscore=True) - - # add table to a dataset - ds.add(tb) - - # update metadata - ds.update_metadata(paths.metadata_path) - - # finally save the dataset - ds.save() - - log.info("unodc.end") - - -def clean_data(df: pd.DataFrame) -> pd.DataFrame: - df = df[ - (df["Dimension"].isin(["Total", "by mechanisms", "by relationship to perpetrator", "by situational context"])) - & ( - df["Indicator"].isin( - ["Victims of intentional homicide", "Victims of Intentional Homicide - Regional Estimate"] - ) - ) - ] - df = df.rename( - columns={ - "Country": "country", - "Year": "year", - } - ).drop(columns=["Iso3_code"]) - return df diff --git a/etl/steps/archive/meadow/ihme_gbd/2023-03-29/gbd_drug_disorders.meta.yml b/etl/steps/archive/meadow/ihme_gbd/2023-03-29/gbd_drug_disorders.meta.yml deleted file mode 100644 index 9b393a11114..00000000000 --- a/etl/steps/archive/meadow/ihme_gbd/2023-03-29/gbd_drug_disorders.meta.yml +++ /dev/null @@ -1,11 +0,0 @@ -dataset: - title: Drug Use Disorders - Global Burden of Disease Study 2019 (GBD 2019) - description: '' - sources: - - name: IHME, Global Burden of Disease (2019) - - url: https://vizhub.healthdata.org/gbd-results/ -tables: - gbd_drug_disorders: - val: - title: The annual number of deaths from drug use disorders 
per 100,000 people aged 15 to 49 years.
-      unit: 'deaths per 100,000 population'
diff --git a/etl/steps/archive/meadow/ihme_gbd/2023-03-29/gbd_drug_disorders.py b/etl/steps/archive/meadow/ihme_gbd/2023-03-29/gbd_drug_disorders.py
deleted file mode 100644
index ded585bc222..00000000000
--- a/etl/steps/archive/meadow/ihme_gbd/2023-03-29/gbd_drug_disorders.py
+++ /dev/null
@@ -1,46 +0,0 @@
-"""Load a snapshot and create a meadow dataset."""
-
-import pandas as pd
-from owid.catalog import Table
-from structlog import get_logger
-
-from etl.helpers import PathFinder, create_dataset
-from etl.snapshot import Snapshot
-
-# Initialize logger.
-log = get_logger()
-
-# Get paths and naming conventions for current step.
-paths = PathFinder(__file__)
-
-
-def run(dest_dir: str) -> None:
-    log.info("gbd_drug_disorders.start")
-
-    #
-    # Load inputs.
-    #
-    # Retrieve snapshot.
-    snap: Snapshot = paths.load_dependency("gbd_drug_disorders.csv")
-
-    # Load data from snapshot.
-    df = pd.read_csv(snap.path)
-    # Drop ID columns
-    df = df[["location_name", "year", "val"]]
-    df = df.rename(columns={"location_name": "country"})
-    #
-    # Process data.
-    #
-    # Create a new table and ensure all columns are snake-case.
-    tb = Table(df, short_name=paths.short_name, underscore=True)
-
-    #
-    # Save outputs.
-    #
-    # Create a new meadow dataset with the same metadata as the snapshot.
-    ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata)
-
-    # Save changes in the new garden dataset.
-    ds_meadow.save()
-
-    log.info("gbd_drug_disorders.end")
diff --git a/etl/steps/archive/meadow/irena/2022-10-20/renewable_electricity_capacity_and_generation.py b/etl/steps/archive/meadow/irena/2022-10-20/renewable_electricity_capacity_and_generation.py
deleted file mode 100644
index bd7a3f19286..00000000000
--- a/etl/steps/archive/meadow/irena/2022-10-20/renewable_electricity_capacity_and_generation.py
+++ /dev/null
@@ -1,124 +0,0 @@
-"""Extract capacity data from IRENA's Renewable Electricity Capacity and Generation 2022 dataset.
-
-"""
-
-from typing import cast
-
-import pandas as pd
-from owid.catalog import Dataset, Table, TableMeta
-from owid.catalog.utils import underscore_table
-from owid.walden import Catalog as WaldenCatalog
-from shared import CURRENT_DIR
-
-from etl.helpers import PathFinder
-from etl.steps.data.converters import convert_walden_metadata
-
-# Details of input dataset.
-WALDEN_VERSION = "2022-10-07"
-WALDEN_DATASET_NAME = "renewable_electricity_capacity_and_generation"
-# Details of output dataset.
-VERSION = "2022-10-20"
-DATASET_NAME = WALDEN_DATASET_NAME
-# Get naming conventions.
-N = PathFinder(str(CURRENT_DIR / DATASET_NAME))
-
-
-def prepare_pv_capacity_data(data_file: str) -> None:
-    """Prepare yearly solar photovoltaic capacity data.
-
-    Parameters
-    ----------
-    data_file : str
-        Path to raw data (IRENA's excel file on renewable electricity capacity and generation).
-
-    Returns
-    -------
-    None
-        This function is currently a stub and does not return any data.
-
-    """
-    pass
-
-
-def extract_capacity_from_sheet(excel_object: pd.ExcelFile, sheet_name: str) -> pd.DataFrame:
-    # The name of the energy source is given in the very first cell.
-    # To get that, I load the file, skipping all rows from the bottom.
-    # The first column is the content of the first cell.
-    technology = excel_object.parse(sheet_name, skipfooter=10000).columns[0]  # type: ignore
-
-    # The format of this dataset is inconvenient and requires some adjustment that may not work on the next update.
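# Aside (hypothetical illustration, not code from this file): the `skipfooter=10000`
# trick above works because pandas then keeps only the header row, so the sheet's
# first cell becomes the sole column name. An equivalent, more explicit read would be:
#
#   technology = excel_object.parse(sheet_name, header=None, nrows=1).iloc[0, 0]
#
# which loads exactly one row and takes its first cell.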
- df = excel_object.parse(sheet_name, skiprows=4) # type: ignore - - # There are two tables put together: One for capacity and one for production. - # Keep only columns for capacity. - columns_to_keep = [df.columns[0]] - for column in df.columns[1:]: - if str(column).startswith("PROD"): - break - if not str(column).startswith("Unnamed"): - columns_to_keep.append(column) - df = df[columns_to_keep].rename(columns={"CAP (MW)": "country"}) - - # Remove empty rows. - df = df.dropna(subset="country").reset_index(drop=True) - - # Restructure dataframe. - df = df.melt(id_vars="country", var_name="year", value_name="capacity") - - # Add technology (referring to the name of the energy source) as a new column. - df["technology"] = technology - - return cast(pd.DataFrame, df) - - -def extract_capacity_from_all_sheets(data_file: str) -> pd.DataFrame: - # Select sheets that contain data (their names are numbers). - excel_object = pd.ExcelFile(data_file) - sheet_names = [sheet for sheet in excel_object.sheet_names if sheet.isdigit()] - - # Extract data sheet by sheet. - all_data = pd.DataFrame() - for sheet_name in sheet_names: - data = extract_capacity_from_sheet(excel_object=excel_object, sheet_name=sheet_name) - all_data = pd.concat([all_data, data], ignore_index=True) - - # Some rows are repeated (it seems that with identical values, at least for the case found, Uruguay on sheet 18). - # Therefore, drop duplicates. - # Set an appropriate index and sort conveniently. - all_data = ( - all_data.drop_duplicates(subset=["country", "year", "technology"], keep="first") - .set_index(["technology", "country", "year"], verify_integrity=True) - .sort_index() - .sort_index(axis=1) - ) - - return all_data - - -def run(dest_dir: str) -> None: - # Retrieve raw data from Walden. - walden_ds = WaldenCatalog().find_one(namespace="irena", short_name=WALDEN_DATASET_NAME, version=WALDEN_VERSION) - local_file = walden_ds.ensure_downloaded() - - # Extract capacity data. - df = extract_capacity_from_all_sheets(data_file=local_file) - - # Create a new Meadow dataset and reuse walden metadata. - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.metadata.version = VERSION - - # Create a new table with metadata from Walden. - table_metadata = TableMeta( - short_name=walden_ds.short_name, - title=walden_ds.name, - description=walden_ds.description, - ) - tb = Table(df, metadata=table_metadata) - - # Underscore all table columns. - tb = underscore_table(tb) - - # Add table to the dataset and save dataset. - ds.add(tb) - ds.save() diff --git a/etl/steps/archive/meadow/irena/2022-10-20/renewable_power_generation_costs.py b/etl/steps/archive/meadow/irena/2022-10-20/renewable_power_generation_costs.py deleted file mode 100644 index f3834683aa5..00000000000 --- a/etl/steps/archive/meadow/irena/2022-10-20/renewable_power_generation_costs.py +++ /dev/null @@ -1,253 +0,0 @@ -"""Extract global (as well as at the country level for some countries) weighted-average levelized cost of electricity -(LCOE) for all energy sources from IRENA's Renewable Power Generation Costs 2022 dataset. - -NOTE: The original data is poorly formatted. Each energy source is given as a separate sheet, with a different -structure. So it's likely that, on the next update, this script will not work. 
-
-"""
-
-from typing import cast
-
-import pandas as pd
-from owid.catalog import Dataset, Table, TableMeta
-from owid.catalog.utils import underscore_table
-from owid.walden import Catalog as WaldenCatalog
-from shared import CURRENT_DIR
-
-from etl.helpers import PathFinder
-from etl.steps.data.converters import convert_walden_metadata
-
-# Details of input dataset.
-WALDEN_VERSION = "2022-10-07"
-# Details of output dataset.
-VERSION = "2022-10-20"
-# Get naming conventions.
-N = PathFinder(str(CURRENT_DIR / "renewable_power_generation_costs"))
-
-# It's unclear if this data will be used. If so, it could be a separate table.
-# def prepare_pv_data(data_file: str) -> pd.DataFrame:
-#     """Prepare yearly data on solar photovoltaic costs.
-
-#     Monthly data will be averaged, and only complete years (with 12 reported months) will be considered.
-
-#     Parameters
-#     ----------
-#     data_file : str
-#         Path to raw data (IRENA's excel file on renewable power generation costs).
-
-#     Returns
-#     -------
-#     pv_prices : pd.DataFrame
-#         PV prices.
-
-#     """
-#     # Photovoltaic technologies to choose for average monthly prices.
-#     pv_technologies = ["Thin film a-Si/u-Si or Global Index (from Q4 2013)"]
-#     # Load upper table in sheet from Figure 3.2, which is:
-#     # Average monthly solar PV module prices by technology and manufacturing country sold in Europe, 2010 to 2021.
-#     pv_prices = pd.read_excel(
-#         data_file, sheet_name="Fig 3.2", skiprows=4, skipfooter=18, usecols=lambda column: "Unnamed" not in column
-#     )
-
-#     # Transpose dataframe so that each row corresponds to a month.
-#     pv_prices = pv_prices.rename(columns={"2021 USD/W": "technology"}).melt(
-#         id_vars="technology", var_name="month", value_name="cost"
-#     )
-
-#     # Select PV technologies.
-#     pv_prices = pv_prices[pv_prices["technology"].isin(pv_technologies)].reset_index(drop=True)
-
-#     # Get year from dates.
-#     pv_prices["year"] = pd.to_datetime(pv_prices["month"], format="%b %y").dt.year
-
-#     # For each year get the average cost over all months.
-#     pv_prices = (
-#         pv_prices.groupby(["technology", "year"])
-#         .agg({"cost": "mean", "year": "count"})
-#         .rename(columns={"year": "n_months"})
-#         .reset_index()
-#     )
-
-#     # Ignore years for which we don't have 12 months.
-#     pv_prices = pv_prices[pv_prices["n_months"] == 12].drop(columns=["n_months"]).reset_index(drop=True)
-
-#     # Set an appropriate index and sort conveniently.
-#     pv_prices = pv_prices.set_index(["technology", "year"], verify_integrity=True).sort_index().sort_index(axis=1)
-
-#     return cast(pd.DataFrame, pv_prices)
-
-
-def extract_global_cost_for_all_sources_from_excel_file(local_file: str) -> pd.DataFrame:
-    """Extract global weighted-average LCOE of all energy sources from the excel file.
-
-    Each energy source is given in a separate sheet, in a different way, so each needs a different treatment.
-
-    Parameters
-    ----------
-    local_file : str
-        Path to excel file with raw data.
-
-    Returns
-    -------
-    df : pd.DataFrame
-        LCOE for different energy sources.
-    """
-    # Load file as an excel object.
-    excel_object = pd.ExcelFile(local_file)
-
-    # Extract weighted average LCOE for different sources (each one requires a slightly different processing):
-
-    # Solar photovoltaic.
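# Aside (hypothetical refactoring sketch, not code from this file): the row-labelled
# blocks below (solar PV, CSP, bioenergy, hydropower) share one pattern — filter the
# "Weighted average" row, melt years into a long frame, and tag the technology:
#
#   def weighted_average_lcoe(excel_object, sheet_name, skiprows, label_col, technology):
#       df = excel_object.parse(sheet_name, skiprows=skiprows).dropna(how="all", axis=1)
#       df = df[df[label_col] == "Weighted average"].melt(
#           id_vars=label_col, var_name="year", value_name="cost"
#       )[["year", "cost"]]
#       df["technology"] = technology
#       return df
#
#   # e.g. weighted_average_lcoe(excel_object, "Fig 3.1", 22, "Unnamed: 1", "Solar photovoltaic")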
- solar_pv = ( - excel_object.parse("Fig 3.1", skiprows=22).dropna(how="all", axis=1).rename(columns={"Unnamed: 1": "temp"}) # type: ignore - ) - solar_pv = solar_pv[solar_pv["temp"] == "Weighted average"].melt( - id_vars="temp", var_name="year", value_name="cost" - )[["year", "cost"]] - solar_pv["technology"] = "Solar photovoltaic" - - # Onshore wind. - onshore_wind = excel_object.parse("Fig 2.12", skiprows=3, usecols=lambda column: "Unnamed" not in column).rename( # type: ignore - columns={"Year": "year", "Weighted average": "cost"} - ) - onshore_wind["technology"] = "Onshore wind" - - # Concentrated solar power. - csp = excel_object.parse("Fig 5.7", skiprows=4).dropna(how="all", axis=1) # type: ignore - csp = ( - csp[csp["2021 USD/kWh"] == "Weighted average"] - .melt(id_vars="2021 USD/kWh", var_name="year", value_name="cost")[["year", "cost"]] - .reset_index(drop=True) - ) - csp["technology"] = "Concentrated solar power" - - # Offshore wind. - offshore_wind = excel_object.parse("Fig 4.13", skiprows=3).rename( # type: ignore - columns={"Year": "year", "Weighted average": "cost"} - )[["year", "cost"]] - offshore_wind["technology"] = "Offshore wind" - - # Geothermal. - geothermal = excel_object.parse("Fig 7.4", skiprows=5).rename(columns={"Year": "year", "Weighted average": "cost"})[ # type: ignore - ["year", "cost"] - ] - geothermal["technology"] = "Geothermal" - - # Bioenergy. - bioenergy = ( - excel_object.parse("Fig 8.1", skiprows=20).dropna(axis=1, how="all").rename(columns={"Unnamed: 1": "temp"}) # type: ignore - ) - bioenergy = bioenergy[bioenergy["temp"] == "Weighted average"].melt( - id_vars="temp", var_name="year", value_name="cost" - )[["year", "cost"]] - bioenergy["technology"] = "Bioenergy" - - # Hydropower. - hydropower = ( - excel_object.parse("Fig 6.1", skiprows=20).dropna(how="all", axis=1).rename(columns={"Unnamed: 1": "temp"}) # type: ignore - ) - hydropower = hydropower[hydropower["temp"] == "Weighted average"].melt( - id_vars="temp", var_name="year", value_name="cost" - )[["year", "cost"]] - hydropower["technology"] = "Hydropower" - - # Concatenate all sources into one dataframe. - df = pd.concat([solar_pv, onshore_wind, csp, offshore_wind, geothermal, bioenergy, hydropower], ignore_index=True) - - # Add country column. - df["country"] = "World" - - return cast(pd.DataFrame, df) - - -def extract_country_cost_from_excel_file(local_file: str) -> pd.DataFrame: - """Extract weighted-average LCOE of certain countries and certain energy sources from the excel file. - - Only onshore wind and solar photovoltaic seem to have this data, and only for specific countries. - - Parameters - ---------- - local_file : str - Path to excel file with raw data. - - Returns - ------- - df : pd.DataFrame - LCOE for different energy sources. - """ - # Extract LCOE for specific countries and technologies (those that are available in original data). - - # Load file as an excel object. - excel_object = pd.ExcelFile(local_file) - - # Solar photovoltaic. - solar_pv = ( - excel_object.parse("Fig 3.8", skiprows=5).dropna(how="all", axis=1).rename(columns={"2021 USD/kWh": "country"}) # type: ignore - ) - - # Last column is the difference between the cost in the last two years. Remove that column. - solar_pv = solar_pv.drop(columns="2020-2021") - - # Onshore wind. - onshore_wind = ( - excel_object.parse("Fig 2.13", skiprows=6).dropna(how="all", axis=1).rename(columns={"Country": "country"}) # type: ignore - ) - - # Country column is repeated. Drop it, and drop column of percentage decrease. 
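# Aside: pandas deduplicates repeated column names on read by appending ".1", ".2", ...;
# that is why the sheet's second "Country" column surfaces below as "Country.1".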
- onshore_wind = onshore_wind.drop(columns=["Country.1", "% decrease "]) - - # Add a technology column and concatenate different technologies. - solar_pv["technology"] = "Solar photovoltaic" - onshore_wind["technology"] = "Onshore wind" - combined = pd.concat([solar_pv, onshore_wind], ignore_index=True) - - # Rearrange dataframe to have year as a column. - combined = combined.melt(id_vars=["technology", "country"], var_name="year", value_name="cost") - - return cast(pd.DataFrame, combined) - - -def run(dest_dir: str) -> None: - # Retrieve raw data from Walden. - walden_ds = WaldenCatalog().find_one( - namespace="irena", short_name="renewable_power_generation_costs", version=WALDEN_VERSION - ) - local_file = walden_ds.ensure_downloaded() - - # Extract global, weighted-average LCOE cost for all energy sources. - costs_global = extract_global_cost_for_all_sources_from_excel_file(local_file=local_file) - - # Extract national LCOE for specific countries and technologies. - costs_national = extract_country_cost_from_excel_file(local_file=local_file) - - # Combine global and national data. - combined = pd.concat([costs_global, costs_national], ignore_index=True).astype({"year": int}) - - # Convert from long to wide format. - combined = combined.pivot(index=["country", "year"], columns="technology", values="cost").reset_index() - - # Remove name of dummy index. - combined.columns.names = [None] - - # Set an appropriate index and sort conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Create a new Meadow dataset and reuse walden metadata. - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.metadata.version = VERSION - - # Create a new table with metadata from Walden. - table_metadata = TableMeta( - short_name=walden_ds.short_name, - title=walden_ds.name, - description=walden_ds.description, - ) - tb = Table(combined, metadata=table_metadata) - - # Underscore all table columns. - tb = underscore_table(tb) - - # Add table to the dataset and save dataset. - ds.add(tb) - ds.save() diff --git a/etl/steps/archive/meadow/irena/2022-10-20/shared.py b/etl/steps/archive/meadow/irena/2022-10-20/shared.py deleted file mode 100644 index 7e7f4d18c5b..00000000000 --- a/etl/steps/archive/meadow/irena/2022-10-20/shared.py +++ /dev/null @@ -1,3 +0,0 @@ -from pathlib import Path - -CURRENT_DIR = Path(__file__).parent diff --git a/etl/steps/archive/meadow/irena/2023-01-04/renewable_electricity_capacity_and_generation.py b/etl/steps/archive/meadow/irena/2023-01-04/renewable_electricity_capacity_and_generation.py deleted file mode 100644 index 941177de9f9..00000000000 --- a/etl/steps/archive/meadow/irena/2023-01-04/renewable_electricity_capacity_and_generation.py +++ /dev/null @@ -1,104 +0,0 @@ -"""Extract capacity data from IRENA's Renewable Electricity Capacity and Generation 2022 dataset. - -""" - -from typing import cast - -import pandas as pd -from owid import catalog -from owid.walden import catalog as WaldenCatalog - -from etl.helpers import PathFinder -from etl.steps.data.converters import convert_walden_metadata - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def extract_capacity_from_sheet(excel_object: pd.ExcelFile, sheet_name: str) -> pd.DataFrame: - # The name of the energy source is given in the very first cell. - # To get that, I load the file, skipping all rows from the bottom. - # The first column is the content of the first cell. 
- technology = excel_object.parse(sheet_name, skipfooter=10000).columns[0] # type: ignore - - # The format of this dataset is inconvenient and requires some adjustment that may not work on the next update. - df = excel_object.parse(sheet_name, skiprows=4) # type: ignore - - # There are two tables put together: One for capacity and one for production. - # Keep only columns for capacity. - columns_to_keep = [df.columns[0]] - for column in df.columns[1:]: - if str(column).startswith("PROD"): - break - if not str(column).startswith("Unnamed"): - columns_to_keep.append(column) - df = df[columns_to_keep].rename(columns={"CAP (MW)": "country"}) - - # Remove empty rows. - df = df.dropna(subset="country").reset_index(drop=True) - - # Restructure dataframe. - df = df.melt(id_vars="country", var_name="year", value_name="capacity") - - # Add technology (referring to the name of the energy source) as a new column. - df["technology"] = technology - - return cast(pd.DataFrame, df) - - -def extract_capacity_from_all_sheets(data_file: str) -> pd.DataFrame: - # Select sheets that contain data (their names are numbers). - excel_object = pd.ExcelFile(data_file) - sheet_names = [sheet for sheet in excel_object.sheet_names if sheet.strip().isdigit()] - - # Extract data sheet by sheet. - all_data = pd.DataFrame() - for sheet_name in sheet_names: - data = extract_capacity_from_sheet(excel_object=excel_object, sheet_name=sheet_name) - all_data = pd.concat([all_data, data], ignore_index=True) - - # Some rows are repeated (it seems that with identical values, at least for the case found, Uruguay on sheet 18). - # Therefore, drop duplicates. - # Set an appropriate index and sort conveniently. - all_data = ( - all_data.drop_duplicates(subset=["country", "year", "technology"], keep="first") - .set_index(["technology", "country", "year"], verify_integrity=True) - .sort_index() - .sort_index(axis=1) - ) - - return all_data - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Retrieve raw data from Walden. - ds_walden: WaldenCatalog.Dataset = paths.load_dependency("renewable_electricity_capacity_and_generation") - local_file = ds_walden.ensure_downloaded() - - # - # Process data. - # - # Extract capacity data. - df = extract_capacity_from_all_sheets(data_file=local_file) - - # - # Save outputs. - # - # Create a new Meadow dataset and reuse walden metadata. - ds = catalog.Dataset.create_empty(dest_dir, metadata=convert_walden_metadata(ds_walden)) - ds.metadata.version = paths.version - - # Create a new table with metadata from Walden. - table_metadata = catalog.TableMeta( - short_name=ds_walden.short_name, - title=ds_walden.name, - description=ds_walden.description, - ) - tb = catalog.Table(df, metadata=table_metadata, underscore=True) - - # Add table to the dataset and save dataset. - ds.add(tb) - ds.save() diff --git a/etl/steps/archive/meadow/irena/2023-01-04/renewable_power_generation_costs.py b/etl/steps/archive/meadow/irena/2023-01-04/renewable_power_generation_costs.py deleted file mode 100644 index c8784fef8cc..00000000000 --- a/etl/steps/archive/meadow/irena/2023-01-04/renewable_power_generation_costs.py +++ /dev/null @@ -1,256 +0,0 @@ -"""Extract global (as well as at the country level for some countries) weighted-average levelized cost of electricity -(LCOE) for all energy sources from IRENA's Renewable Power Generation Costs 2022 dataset. - -Extract solar photovoltaic module prices too. - -NOTE: The original data is poorly formatted. 
Each energy source is given as a separate sheet, with a different
-structure. So it's likely that, on the next update, this script will not work.
-
-"""
-
-from typing import cast
-
-import pandas as pd
-from owid import catalog
-from owid.walden import catalog as WaldenCatalog
-
-from etl.helpers import PathFinder
-from etl.steps.data.converters import convert_walden_metadata
-
-# Get paths and naming conventions for current step.
-paths = PathFinder(__file__)
-
-
-def prepare_solar_pv_module_prices(data_file: str) -> pd.DataFrame:
-    """Prepare yearly data on average solar photovoltaic module prices.
-
-    Monthly data will be averaged, and only complete years (with 12 reported months) will be considered.
-
-    Parameters
-    ----------
-    data_file : str
-        Path to raw data (IRENA's excel file on renewable power generation costs).
-
-    Returns
-    -------
-    pv_prices : pd.DataFrame
-        PV prices.
-
-    """
-    # Photovoltaic technologies to choose for average monthly prices.
-    pv_technologies = ["Thin film a-Si/u-Si or Global Index (from Q4 2013)"]
-    # Load upper table in sheet from Figure 3.2, which is:
-    # Average monthly solar PV module prices by technology and manufacturing country sold in Europe, 2010 to 2021.
-    pv_prices = pd.read_excel(
-        data_file, sheet_name="Fig 3.2", skiprows=4, skipfooter=18, usecols=lambda column: "Unnamed" not in column
-    )
-
-    # Transpose dataframe so that each row corresponds to a month.
-    pv_prices = pv_prices.rename(columns={"2021 USD/W": "technology"}).melt(
-        id_vars="technology", var_name="month", value_name="cost"
-    )
-
-    # Select PV technologies.
-    pv_prices = pv_prices[pv_prices["technology"].isin(pv_technologies)].reset_index(drop=True)
-
-    # Get year from dates.
-    pv_prices["year"] = pd.to_datetime(pv_prices["month"], format="%b %y").dt.year
-
-    # For each year get the average cost over all months.
-    pv_prices = (
-        pv_prices.groupby(["technology", "year"])
-        .agg({"cost": "mean", "year": "count"})
-        .rename(columns={"year": "n_months"})
-        .reset_index()
-    )
-
-    # Remove unnecessary column and add column for region.
-    pv_prices = pv_prices.drop(columns="technology").assign(**{"country": "World"})
-
-    # Ignore years for which we don't have 12 months.
-    pv_prices = pv_prices[pv_prices["n_months"] == 12].drop(columns=["n_months"]).reset_index(drop=True)
-
-    # Set an appropriate index and sort conveniently.
-    pv_prices = pv_prices.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1)
-
-    return cast(pd.DataFrame, pv_prices)
-
-
-def extract_global_cost_for_all_sources_from_excel_file(local_file: str) -> pd.DataFrame:
-    """Extract global weighted-average LCOE of all energy sources from the excel file.
-
-    Each energy source is given in a separate sheet, in a different way, so each needs a different treatment.
-
-    Parameters
-    ----------
-    local_file : str
-        Path to excel file with raw data.
-
-    Returns
-    -------
-    df : pd.DataFrame
-        LCOE for different energy sources.
-    """
-    # Load file as an excel object.
-    excel_object = pd.ExcelFile(local_file)
-
-    # Extract weighted average LCOE for different sources (each one requires a slightly different processing):
-
-    # Solar photovoltaic.
-    solar_pv = (
-        excel_object.parse("Fig 3.1", skiprows=22).dropna(how="all", axis=1).rename(columns={"Unnamed: 1": "temp"})  # type: ignore
-    )
-    solar_pv = solar_pv[solar_pv["temp"] == "Weighted average"].melt(
-        id_vars="temp", var_name="year", value_name="cost"
-    )[["year", "cost"]]
-    solar_pv["technology"] = "Solar photovoltaic"
-
-    # Onshore wind.
- onshore_wind = excel_object.parse("Fig 2.12", skiprows=3, usecols=lambda column: "Unnamed" not in column).rename( # type: ignore - columns={"Year": "year", "Weighted average": "cost"} - ) - onshore_wind["technology"] = "Onshore wind" - - # Concentrated solar power. - csp = excel_object.parse("Fig 5.7", skiprows=4).dropna(how="all", axis=1) # type: ignore - csp = ( - csp[csp["2021 USD/kWh"] == "Weighted average"] - .melt(id_vars="2021 USD/kWh", var_name="year", value_name="cost")[["year", "cost"]] - .reset_index(drop=True) - ) - csp["technology"] = "Concentrated solar power" - - # Offshore wind. - offshore_wind = excel_object.parse("Fig 4.13", skiprows=3).rename( # type: ignore - columns={"Year": "year", "Weighted average": "cost"} - )[["year", "cost"]] - offshore_wind["technology"] = "Offshore wind" - - # Geothermal. - geothermal = excel_object.parse("Fig 7.4", skiprows=5).rename(columns={"Year": "year", "Weighted average": "cost"})[ # type: ignore - ["year", "cost"] - ] - geothermal["technology"] = "Geothermal" - - # Bioenergy. - bioenergy = ( - excel_object.parse("Fig 8.1", skiprows=20).dropna(axis=1, how="all").rename(columns={"Unnamed: 1": "temp"}) # type: ignore - ) - bioenergy = bioenergy[bioenergy["temp"] == "Weighted average"].melt( - id_vars="temp", var_name="year", value_name="cost" - )[["year", "cost"]] - bioenergy["technology"] = "Bioenergy" - - # Hydropower. - hydropower = ( - excel_object.parse("Fig 6.1", skiprows=20).dropna(how="all", axis=1).rename(columns={"Unnamed: 1": "temp"}) # type: ignore - ) - hydropower = hydropower[hydropower["temp"] == "Weighted average"].melt( - id_vars="temp", var_name="year", value_name="cost" - )[["year", "cost"]] - hydropower["technology"] = "Hydropower" - - # Concatenate all sources into one dataframe. - df = pd.concat([solar_pv, onshore_wind, csp, offshore_wind, geothermal, bioenergy, hydropower], ignore_index=True) - - # Add country column. - df["country"] = "World" - - return cast(pd.DataFrame, df) - - -def extract_country_cost_from_excel_file(local_file: str) -> pd.DataFrame: - """Extract weighted-average LCOE of certain countries and certain energy sources from the excel file. - - Only onshore wind and solar photovoltaic seem to have this data, and only for specific countries. - - Parameters - ---------- - local_file : str - Path to excel file with raw data. - - Returns - ------- - df : pd.DataFrame - LCOE for different energy sources. - """ - # Extract LCOE for specific countries and technologies (those that are available in original data). - - # Load file as an excel object. - excel_object = pd.ExcelFile(local_file) - - # Solar photovoltaic. - solar_pv = ( - excel_object.parse("Fig 3.8", skiprows=5).dropna(how="all", axis=1).rename(columns={"2021 USD/kWh": "country"}) # type: ignore - ) - - # Last column is the difference between the cost in the last two years. Remove that column. - solar_pv = solar_pv.drop(columns="2020-2021") - - # Onshore wind. - onshore_wind = ( - excel_object.parse("Fig 2.13", skiprows=6).dropna(how="all", axis=1).rename(columns={"Country": "country"}) # type: ignore - ) - - # Country column is repeated. Drop it, and drop column of percentage decrease. - onshore_wind = onshore_wind.drop(columns=["Country.1", "% decrease "]) - - # Add a technology column and concatenate different technologies. - solar_pv["technology"] = "Solar photovoltaic" - onshore_wind["technology"] = "Onshore wind" - combined = pd.concat([solar_pv, onshore_wind], ignore_index=True) - - # Rearrange dataframe to have year as a column. 
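# Aside (hypothetical before/after of the melt below, with made-up values):
#
#   wide:  technology="Onshore wind", country="Spain", 2019=0.05, 2020=0.04
#   long:  two rows -> (Onshore wind, Spain, 2019, 0.05) and (Onshore wind, Spain, 2020, 0.04)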
- combined = combined.melt(id_vars=["technology", "country"], var_name="year", value_name="cost") - - return cast(pd.DataFrame, combined) - - -def run(dest_dir: str) -> None: - # Retrieve raw data from Walden. - ds_walden: WaldenCatalog.Dataset = paths.load_dependency("renewable_power_generation_costs") - local_file = ds_walden.ensure_downloaded() - - # Extract global, weighted-average LCOE cost for all energy sources. - costs_global = extract_global_cost_for_all_sources_from_excel_file(local_file=local_file) - - # Extract national LCOE for specific countries and technologies. - costs_national = extract_country_cost_from_excel_file(local_file=local_file) - - # Combine global and national data. - combined = pd.concat([costs_global, costs_national], ignore_index=True).astype({"year": int}) - - # Convert from long to wide format. - combined = combined.pivot(index=["country", "year"], columns="technology", values="cost").reset_index() - - # Remove name of dummy index. - combined.columns.names = [None] - - # Set an appropriate index and sort conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Extract global data on solar photovoltaic module prices. - solar_pv_prices = prepare_solar_pv_module_prices(data_file=local_file) - - # - # Save outputs. - # - # Create a new Meadow dataset and reuse walden metadata. - ds = catalog.Dataset.create_empty(dest_dir, metadata=convert_walden_metadata(ds_walden)) - ds.metadata.version = paths.version - - # Create a new table of LCOE with metadata from Walden. - table_metadata = catalog.TableMeta( - short_name=ds_walden.short_name, - title=ds_walden.name, - description=ds_walden.description, - ) - tb = catalog.Table(combined, metadata=table_metadata, underscore=True, short_name=paths.short_name) - - # Create an additional table of solar photovoltaic module prices. - tb_solar_pv_prices = catalog.Table(solar_pv_prices, underscore=True, short_name="solar_photovoltaic_module_prices") - - # Add tables to the dataset and save dataset. - ds.add(tb) - ds.add(tb_solar_pv_prices) - ds.save() diff --git a/etl/steps/archive/meadow/met_office_hadley_centre/2023-01-02/near_surface_temperature.py b/etl/steps/archive/meadow/met_office_hadley_centre/2023-01-02/near_surface_temperature.py deleted file mode 100644 index a6acedf5163..00000000000 --- a/etl/steps/archive/meadow/met_office_hadley_centre/2023-01-02/near_surface_temperature.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Load near surface temperature dataset (northern hemisphere, southern hemisphere, and global) by Met Office Hadley -Centre and create a single table. - -""" - -import pandas as pd -from owid.catalog import Dataset, Table - -from etl.helpers import PathFinder -from etl.snapshot import Snapshot -from etl.steps.data.converters import convert_snapshot_metadata - -# Get naming conventions. -N = PathFinder(__file__) - -# Snapshot and Meadow dataset versions. -SNAPSHOT_VERSION = "2023-01-02" -MEADOW_VERSION = SNAPSHOT_VERSION -MEADOW_SHORT_NAME = "near_surface_temperature" - -# Columns to select and how to name them. -COLUMNS = { - # Additional column. - "region": "region", - # Original column names and new names. - "Time": "year", - "Anomaly (deg C)": "temperature_anomaly", - "Lower confidence limit (2.5%)": "lower_limit", - "Upper confidence limit (97.5%)": "upper_limit", -} - -# Names of snapshot files. 
-REGION_FILE_NAMES = { - "Global": "near_surface_temperature_global.csv", - "Northern hemisphere": "near_surface_temperature_northern_hemisphere.csv", - "Southern hemisphere": "near_surface_temperature_southern_hemisphere.csv", -} - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load snapshots. - snapshots = { - region: Snapshot(f"met_office_hadley_centre/{SNAPSHOT_VERSION}/{file_name}") - for region, file_name in REGION_FILE_NAMES.items() - } - df = pd.concat( - [pd.read_csv(snapshot.path).assign(**{"region": region}) for region, snapshot in snapshots.items()], - ignore_index=True, - ) - - # - # Prepare data. - # - # Select and rename required columns. - df = df.rename(columns=COLUMNS, errors="raise")[COLUMNS.values()] - - # Set an appropriate index and sort conveniently. - df = df.set_index(["region", "year"], verify_integrity=True).sort_index() - - # - # Save outputs. - # - # Create new meadow dataset, using metadata from one of the snapshots. - ds = Dataset.create_empty(dest_dir, metadata=convert_snapshot_metadata(snapshots["Global"].metadata)) - ds.metadata.version = MEADOW_VERSION - ds.metadata.short_name = MEADOW_SHORT_NAME - - # Create new table with metadata and underscore all columns. - tb = Table(df, short_name=MEADOW_SHORT_NAME, underscore=True) - - # Add table to new meadow dataset and save dataset. - ds.add(tb) - ds.save() diff --git a/etl/steps/archive/meadow/met_office_hadley_centre/2023-01-17/near_surface_temperature.py b/etl/steps/archive/meadow/met_office_hadley_centre/2023-01-17/near_surface_temperature.py deleted file mode 100644 index b62f1d55bdf..00000000000 --- a/etl/steps/archive/meadow/met_office_hadley_centre/2023-01-17/near_surface_temperature.py +++ /dev/null @@ -1,78 +0,0 @@ -"""Load a snapshot and create a meadow dataset.""" - -import pandas as pd -from owid.catalog import Dataset, Table -from structlog import get_logger - -from etl.helpers import PathFinder -from etl.steps.data.converters import convert_snapshot_metadata - -# Initialize logger. -log = get_logger() - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -# Columns to select and how to name them. -COLUMNS = { - # Additional column. - "region": "region", - # Original column names and new names. - "Time": "year", - "Anomaly (deg C)": "temperature_anomaly", - "Lower confidence limit (2.5%)": "lower_limit", - "Upper confidence limit (97.5%)": "upper_limit", -} - -# Names of snapshot files. -REGION_FILE_NAMES = { - "Global": "near_surface_temperature_global.csv", - "Northern hemisphere": "near_surface_temperature_northern_hemisphere.csv", - "Southern hemisphere": "near_surface_temperature_southern_hemisphere.csv", -} - - -def run(dest_dir: str) -> None: - log.info("near_surface_temperature.start") - - # - # Load inputs. - # - # Retrieve snapshots. - snapshots = {region: paths.load_dependency(file_name) for region, file_name in REGION_FILE_NAMES.items()} - - # Load data from snapshots. - df = pd.concat( - [pd.read_csv(snapshot.path).assign(**{"region": region}) for region, snapshot in snapshots.items()], - ignore_index=True, - ) - - # - # Process data. - # - # Select and rename required columns. - df = df.rename(columns=COLUMNS, errors="raise")[COLUMNS.values()] - - # Set an appropriate index and sort conveniently. - df = df.set_index(["region", "year"], verify_integrity=True).sort_index() - - # Create a new table and ensure all columns are snake-case. - tb = Table(df, short_name=paths.short_name, underscore=True) - - # - # Save outputs. 
-    #
-    # Create a new meadow dataset with the same metadata as the snapshot.
-    ds_meadow = Dataset.create_empty(dest_dir, metadata=convert_snapshot_metadata(snapshots["Global"].metadata))
-
-    # Ensure the short name and version of the new dataset correspond to the ones of the current step.
-    ds_meadow.metadata.short_name = paths.short_name
-    ds_meadow.metadata.version = paths.version
-
-    # Add the new table to the meadow dataset.
-    ds_meadow.add(tb)
-
-    # Save changes in the new garden dataset.
-    ds_meadow.save()
-
-    log.info("near_surface_temperature.end")
diff --git a/etl/steps/archive/meadow/papers/2022-11-04/riley_2005/__init__.py b/etl/steps/archive/meadow/papers/2022-11-04/riley_2005/__init__.py
deleted file mode 100644
index aa967c89c90..00000000000
--- a/etl/steps/archive/meadow/papers/2022-11-04/riley_2005/__init__.py
+++ /dev/null
@@ -1,108 +0,0 @@
-"""For this pipeline, the Walden data comes as a PDF.
-
-Considering that:
-
-- Extracting the data from the PDF was complex. Several libraries failed to correctly
-recognize and extract the data from the table (on page 2)
-- The dataset contains very little data (<60 rows)
-
-I decided to manually extract the data from the PDF, save it as a CSV, and get feedback
-from the authors.
-"""
-import hashlib
-
-import pandas as pd
-import PyPDF2
-from owid.catalog import Dataset, Table, TableMeta
-from owid.catalog.utils import underscore_table
-from owid.walden import Catalog as WaldenCatalog
-from structlog import get_logger
-
-from etl.helpers import PathFinder
-from etl.steps.data.converters import convert_walden_metadata
-
-log = get_logger()
-
-# naming conventions
-N = PathFinder(__file__)
-
-
-# Dataset details from Walden
-NAMESPACE = "papers"
-SHORT_NAME = "riley_2005"
-VERSION_WALDEN = "2022-11-01"
-# Meadow version
-VERSION_MEADOW = "2022-11-04"
-# Data file
-DATA_FILE = N.directory / f"{SHORT_NAME}.data.csv"
-HASH_EXPECTED = "b80b1796b1a2ce683db5ea9c5dc5ac2d"
-
-
-def run(dest_dir: str) -> None:
-    log.info(f"{SHORT_NAME}.start")
-
-    # retrieve raw data from walden
-    walden_ds = WaldenCatalog().find_one(namespace=NAMESPACE, short_name=SHORT_NAME, version=VERSION_WALDEN)
-    local_file = walden_ds.ensure_downloaded()
-
-    # Check that the PDF content is as expected
-    check_expected_data(local_file)
-
-    # Load data
-    df = load_data()
-
-    # Create table
-    tb = make_table(df, walden_ds)
-
-    # initialize meadow dataset
-    ds = init_meadow_dataset(dest_dir, walden_ds)
-    # add table to a dataset
-    ds.add(tb)
-    # finally save the dataset
-    ds.save()
-
-    log.info(f"{SHORT_NAME}.end")
-
-
-def check_expected_data(local_file: str) -> None:
-    """Check that the table in the PDF is as expected.
-
-    We do this by comparing the MD5 hash of the raw text extracted from the PDF with a previously generated
-    hash of the expected text.
-    """
-    # Extract text from PDF (Walden)
-    with open(local_file, "rb") as f:
-        pdfReader = PyPDF2.PdfReader(f)
-        text_pdf = pdfReader.pages[2].extract_text()
-    # Compare the hash of the extracted text with the expected hash
-    hash = hashlib.md5(text_pdf.encode()).hexdigest()
-    assert hash == HASH_EXPECTED, "Text from PDF does not match expected text."
-
-
-def load_data() -> pd.DataFrame:
-    """Data loaded from a manually generated CSV.
-
-    This CSV was generated manually by looking at the PDF and transcribing its values.
- """ - df = pd.read_csv(DATA_FILE) - return df - - -def init_meadow_dataset(dest_dir: str, walden_ds: WaldenCatalog) -> Dataset: - """Initialize meadow dataset.""" - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.metadata.version = VERSION_MEADOW - return ds - - -def make_table(df: pd.DataFrame, walden_ds: WaldenCatalog) -> Table: - """Create table from dataframe and Walden metadata.""" - table_metadata = TableMeta( - short_name=walden_ds.short_name, - title="Life expectancy at birth", - description="Life expectancy at birth estimates.", - ) - tb = Table(df, metadata=table_metadata) - tb = underscore_table(tb) - return tb diff --git a/etl/steps/archive/meadow/papers/2022-11-04/riley_2005/riley_2005.data.csv b/etl/steps/archive/meadow/papers/2022-11-04/riley_2005/riley_2005.data.csv deleted file mode 100644 index cd35805f5ca..00000000000 --- a/etl/steps/archive/meadow/papers/2022-11-04/riley_2005/riley_2005.data.csv +++ /dev/null @@ -1,56 +0,0 @@ -year,entity,life_expectancy -1770,Europe,34.3 -1800,Europe,33.3 -1820,Europe,35.6 -1850,Europe,36.3 -1870,Europe,36.2 -1900,Europe,42.7 -1913,Europe,46.8 -1950,Europe,64.7 -1973,Europe,70.9 -1990,Europe,74.2 -2001,Europe,76.8 -1885,Asia,27.5 -1900,Asia,28 -1913,Asia,28.1 -1950,Asia,41.6 -1973,Asia,57.5 -1990,Asia,64.5 -2001,Asia,67.1 -1925,Africa,26.4 -1950,Africa,35.6 -1973,Africa,46.9 -1990,Africa,52.9 -2001,Africa,50.5 -1830,Americas,34.8 -1850,Americas,35.1 -1870,Americas,35.1 -1900,Americas,41 -1913,Americas,45.1 -1950,Americas,58.4 -1973,Americas,66 -1990,Americas,70.8 -2001,Americas,73.2 -1900,Soviet Union,29 -1913,Soviet Union,36.4 -1950,Soviet Union,56.1 -1973,Soviet Union,68.9 -1990,Soviet Union,69.1 -2001,Soviet Union,66.6 -1870,Oceania,34.7 -1900,Oceania,47.6 -1913,Oceania,51 -1950,Oceania,63.4 -1973,Oceania,68.2 -1990,Oceania,72.8 -2001,Oceania,74.6 -1800,World,28.5 -1820,World,29 -1850,World,29.3 -1870,World,29.7 -1900,World,32 -1913,World,34.1 -1950,World,48 -1973,World,60 -1990,World,65.2 -2001,World,66.6 \ No newline at end of file diff --git a/etl/steps/archive/meadow/papers/2023-01-04/farmer_lafond_2016.py b/etl/steps/archive/meadow/papers/2023-01-04/farmer_lafond_2016.py deleted file mode 100644 index 8f473096406..00000000000 --- a/etl/steps/archive/meadow/papers/2023-01-04/farmer_lafond_2016.py +++ /dev/null @@ -1,59 +0,0 @@ -"""Load snapshot of Farmer & Lafond (2016) data and create a table. - -""" - -import pandas as pd -from owid import catalog - -from etl.helpers import PathFinder -from etl.steps.data.converters import convert_snapshot_metadata - -# Get paths and naming conventions for current data step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load snapshot. - snap = paths.load_dependency("farmer_lafond_2016.csv") - df = pd.read_csv(snap.path) - - # - # Prepare data. - # - # Store the unit of each technology cost from the zeroth row. - units = dict(zip(df.columns.tolist()[1:], df.loc[0][1:])) - - # The zeroth row will be added as metadata, and the first row is not useful, so drop both. - df = df.drop(index=[0, 1]).reset_index(drop=True) - - # Rename year column and make it integer. - df = df.rename(columns={"YEAR": "year"}).astype({"year": int}) - - # Create a new table with metadata. - tb = catalog.Table(df, short_name=paths.short_name, underscore=False) - - # Add title, units and description to metadata. 
- for column in tb.drop(columns=["year"]).columns: - tb[column].metadata.title = column - tb[column].metadata.unit = units[column] - tb[column].metadata.description = f"Cost for {column}, measured in {units[column]}." - - # Ensure all columns are snake-case. - tb = catalog.utils.underscore_table(tb) - - # Set an appropriate index and sort conveniently. - tb = tb.set_index(["year"], verify_integrity=True).sort_index() - - # - # Save outputs. - # - # Create a new meadow dataset and reuse snapshot metadata. - ds = catalog.Dataset.create_empty(dest_dir, metadata=convert_snapshot_metadata(snap.metadata)) - ds.metadata.version = "2023-01-04" - - # Add table to dataset and save dataset. - ds.add(tb) - ds.save() diff --git a/etl/steps/archive/meadow/papers/2023-01-04/nemet_2009.py b/etl/steps/archive/meadow/papers/2023-01-04/nemet_2009.py deleted file mode 100644 index b3e9db79aff..00000000000 --- a/etl/steps/archive/meadow/papers/2023-01-04/nemet_2009.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Load snapshot of Nemet (2009) data and create a table. - -""" - -import pandas as pd -from owid import catalog - -from etl.helpers import PathFinder -from etl.steps.data.converters import convert_snapshot_metadata - -# Get paths and naming conventions for current data step. -paths = PathFinder(__file__) - -# Columns to select from snapshot, and how to rename them. -COLUMNS = { - "Cost (2004 USD/Watt)": "cost", - "Time (Year)": "year", - "Yearly Capacity (MW)": "yearly_capacity", - "Previous Capacity (MW)": "previous_capacity", -} - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load snapshot. - snap = paths.load_dependency("nemet_2009.csv") - df = pd.read_csv(snap.path) - - # - # Process data. - # - df = df.rename(columns=COLUMNS, errors="raise")[COLUMNS.values()] - - # Set an appropriate index and sort conveniently. - df = df.set_index(["year"], verify_integrity=True).sort_index() - - # - # Save outputs. - # - # Create a new meadow dataset and reuse snapshot metadata. - ds = catalog.Dataset.create_empty(dest_dir, metadata=convert_snapshot_metadata(snap.metadata)) - ds.metadata.version = "2023-01-04" - - # Create a new table. - tb = catalog.Table(df, short_name=paths.short_name, underscore=True) - - # Add table to dataset and save dataset. - ds.add(tb) - ds.save() diff --git a/etl/steps/archive/meadow/rff/2022-09-14/emissions_weighted_carbon_price__coverage.py b/etl/steps/archive/meadow/rff/2022-09-14/emissions_weighted_carbon_price__coverage.py deleted file mode 100644 index c848014c5ae..00000000000 --- a/etl/steps/archive/meadow/rff/2022-09-14/emissions_weighted_carbon_price__coverage.py +++ /dev/null @@ -1,50 +0,0 @@ -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog - -from etl.steps.data.converters import convert_walden_metadata - -# Details for dataset to be created. -VERSION = "2022-09-14" -DATASET_NAME = "emissions_weighted_carbon_price__coverage" -# Details of dataset in walden. -WALDEN_VERSION = "2022-09-14" -WALDEN_DATASET_NAME = DATASET_NAME - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load data from walden. - walden_ds = WaldenCatalog().find_one(namespace="rff", short_name=WALDEN_DATASET_NAME, version=WALDEN_VERSION) - df = pd.read_csv(walden_ds.ensure_downloaded(), dtype=object) - - # - # Process data. - # - # Sanity check. - error = "There should be one row per jurisdiction-year." 
- assert df[df.duplicated(subset=["jurisdiction", "year"])].empty, error - error = "There should not be any row that only has nan data." - assert df[df.drop(columns=["jurisdiction", "year"]).isnull().all(axis=1)].empty, error - - # Set an index and sort conveniently. - df = df.set_index(["jurisdiction", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # - # Save outputs. - # - # Create new dataset with metadata from walden. - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.metadata.version = VERSION - # Create new table with metadata. - table_metadata = TableMeta(short_name=walden_ds.short_name, title=walden_ds.name, description=walden_ds.description) - tb = Table(df, metadata=table_metadata) - # Prepare table and add it to the dataset. - tb = underscore_table(tb) - ds.add(tb) - # Save the dataset. - ds.save() diff --git a/etl/steps/archive/meadow/rff/2022-09-14/emissions_weighted_carbon_price__economy.py b/etl/steps/archive/meadow/rff/2022-09-14/emissions_weighted_carbon_price__economy.py deleted file mode 100644 index 56bbca6f6c8..00000000000 --- a/etl/steps/archive/meadow/rff/2022-09-14/emissions_weighted_carbon_price__economy.py +++ /dev/null @@ -1,50 +0,0 @@ -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog - -from etl.steps.data.converters import convert_walden_metadata - -# Details for dataset to be created. -VERSION = "2022-09-14" -DATASET_NAME = "emissions_weighted_carbon_price__economy" -# Details of dataset in walden. -WALDEN_VERSION = "2022-09-14" -WALDEN_DATASET_NAME = DATASET_NAME - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load data from walden. - walden_ds = WaldenCatalog().find_one(namespace="rff", short_name=WALDEN_DATASET_NAME, version=WALDEN_VERSION) - df = pd.read_csv(walden_ds.ensure_downloaded(), dtype=object) - - # - # Process data. - # - # Sanity check. - error = "There should be one row per jurisdiction-year." - assert df[df.duplicated(subset=["jurisdiction", "year"])].empty, error - error = "There should not be any row that only has nan data." - assert df[df.drop(columns=["jurisdiction", "year"]).isnull().all(axis=1)].empty, error - - # Set an index and sort conveniently. - df = df.set_index(["jurisdiction", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # - # Save outputs. - # - # Create new dataset with metadata from walden. - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.metadata.version = VERSION - # Create new table with metadata. - table_metadata = TableMeta(short_name=walden_ds.short_name, title=walden_ds.name, description=walden_ds.description) - tb = Table(df, metadata=table_metadata) - # Prepare table and add it to the dataset. - tb = underscore_table(tb) - ds.add(tb) - # Save the dataset. - ds.save() diff --git a/etl/steps/archive/meadow/rff/2022-09-14/world_carbon_pricing.py b/etl/steps/archive/meadow/rff/2022-09-14/world_carbon_pricing.py deleted file mode 100644 index b7077fb8304..00000000000 --- a/etl/steps/archive/meadow/rff/2022-09-14/world_carbon_pricing.py +++ /dev/null @@ -1,56 +0,0 @@ -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog - -from etl.steps.data.converters import convert_walden_metadata - -# Version for dataset to be created. 
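The world_carbon_pricing steps below map IPCC sector codes to names after checking that the codes are known; that check, plus the duplicate and all-NaN assertions above, reduce to this pandas sketch (toy frames):

    import pandas as pd

    df = pd.DataFrame({"jurisdiction": ["A", "A"], "year": [2000, 2001], "ipcc_code": ["1A1", "1A2"]})
    ipcc_codes = pd.DataFrame({"ipcc_code": ["1A1", "1A2", "1A3"], "sector_name": ["Energy", "Industry", "Transport"]})

    # One row per jurisdiction-year, and no rows that are entirely NaN.
    assert df[df.duplicated(subset=["jurisdiction", "year"])].empty
    assert df[df.drop(columns=["jurisdiction", "year"]).isnull().all(axis=1)].empty

    # Every code in the data must be known before mapping codes to names,
    # otherwise the left merge would silently produce NaN sector names.
    assert set(df["ipcc_code"]) <= set(ipcc_codes["ipcc_code"])
    df = pd.merge(df, ipcc_codes, on="ipcc_code", how="left")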
-VERSION = "2022-09-14" -# Version of dataset in walden. -WALDEN_VERSION = "2022-09-14" - -# Columns to select and rename in ipcc column names. -IPCC_COLUMNS = { - "IPCC_CODE": "ipcc_code", - "FULLNAME": "sector_name", -} - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load data from walden. - walden_ds = WaldenCatalog().find_one(namespace="rff", short_name="world_carbon_pricing", version=WALDEN_VERSION) - df = pd.read_csv(walden_ds.ensure_downloaded(), dtype=object) - # Load IPCC codes from walden. - walden_ipcc_ds = WaldenCatalog().find_one(namespace="rff", short_name="ipcc_codes", version=WALDEN_VERSION) - ipcc_codes = pd.read_csv(walden_ipcc_ds.ensure_downloaded(), dtype=object) - - # - # Process data. - # - # Prepare IPCC codes dataframe. - ipcc_codes = ipcc_codes[list(IPCC_COLUMNS)].rename(columns=IPCC_COLUMNS) - # Sanity check. - error = "IPCC codes found in data that are missing in IPCC codes file." - assert set(df["ipcc_code"]) <= set(ipcc_codes["ipcc_code"]), error - # Add sector names to data, mapping IPCC codes. - df = pd.merge(df, ipcc_codes, on="ipcc_code", how="left") - - # - # Save outputs. - # - # Create new dataset with metadata from walden. - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.metadata.version = VERSION - # Create new table with metadata. - table_metadata = TableMeta(short_name=walden_ds.short_name, title=walden_ds.name, description=walden_ds.description) - tb = Table(df, metadata=table_metadata) - # Prepare table and add it to the dataset. - tb = underscore_table(tb) - ds.add(tb) - # Save the dataset. - ds.save() diff --git a/etl/steps/archive/meadow/rff/2022-09-14/world_carbon_pricing__subnational.py b/etl/steps/archive/meadow/rff/2022-09-14/world_carbon_pricing__subnational.py deleted file mode 100644 index 4f23155b407..00000000000 --- a/etl/steps/archive/meadow/rff/2022-09-14/world_carbon_pricing__subnational.py +++ /dev/null @@ -1,58 +0,0 @@ -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog - -from etl.steps.data.converters import convert_walden_metadata - -# Version for dataset to be created. -VERSION = "2022-09-14" -# Version of dataset in walden. -WALDEN_VERSION = "2022-09-14" - -# Columns to select and rename in ipcc column names. -IPCC_COLUMNS = { - "IPCC_CODE": "ipcc_code", - "FULLNAME": "sector_name", -} - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load data from walden. - walden_ds = WaldenCatalog().find_one( - namespace="rff", short_name="world_carbon_pricing__subnational", version=WALDEN_VERSION - ) - df = pd.read_csv(walden_ds.ensure_downloaded(), dtype=object) - # Load IPCC codes from walden. - walden_ipcc_ds = WaldenCatalog().find_one(namespace="rff", short_name="ipcc_codes", version=WALDEN_VERSION) - ipcc_codes = pd.read_csv(walden_ipcc_ds.ensure_downloaded(), dtype=object) - - # - # Process data. - # - # Prepare IPCC codes dataframe. - ipcc_codes = ipcc_codes[list(IPCC_COLUMNS)].rename(columns=IPCC_COLUMNS) - # Sanity check. - error = "IPCC codes found in data that are missing in IPCC codes file." - assert set(df["ipcc_code"]) <= set(ipcc_codes["ipcc_code"]), error - # Add sector names to data, mapping IPCC codes. - df = pd.merge(df, ipcc_codes, on="ipcc_code", how="left") - - # - # Save outputs. - # - # Create new dataset with metadata from walden. 
- ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.metadata.version = VERSION - # Create new table with metadata. - table_metadata = TableMeta(short_name=walden_ds.short_name, title=walden_ds.name, description=walden_ds.description) - tb = Table(df, metadata=table_metadata) - # Prepare table and add it to the dataset. - tb = underscore_table(tb) - ds.add(tb) - # Save the dataset. - ds.save() diff --git a/etl/steps/archive/meadow/rff/2022-10-11/emissions_weighted_carbon_price__coverage.py b/etl/steps/archive/meadow/rff/2022-10-11/emissions_weighted_carbon_price__coverage.py deleted file mode 100644 index be10b6e7dc3..00000000000 --- a/etl/steps/archive/meadow/rff/2022-10-11/emissions_weighted_carbon_price__coverage.py +++ /dev/null @@ -1,48 +0,0 @@ -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog -from shared import VERSION, WALDEN_VERSION - -from etl.steps.data.converters import convert_walden_metadata - -# Details for dataset to be created. -DATASET_NAME = "emissions_weighted_carbon_price__coverage" -WALDEN_DATASET_NAME = DATASET_NAME - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load data from walden. - walden_ds = WaldenCatalog().find_one(namespace="rff", short_name=WALDEN_DATASET_NAME, version=WALDEN_VERSION) - df = pd.read_csv(walden_ds.ensure_downloaded(), dtype=object) - - # - # Process data. - # - # Sanity check. - error = "There should be one row per jurisdiction-year." - assert df[df.duplicated(subset=["jurisdiction", "year"])].empty, error - error = "There should not be any row that only has nan data." - assert df[df.drop(columns=["jurisdiction", "year"]).isnull().all(axis=1)].empty, error - - # Set an index and sort conveniently. - df = df.set_index(["jurisdiction", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # - # Save outputs. - # - # Create new dataset with metadata from walden. - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.metadata.version = VERSION - # Create new table with metadata. - table_metadata = TableMeta(short_name=walden_ds.short_name, title=walden_ds.name, description=walden_ds.description) - tb = Table(df, metadata=table_metadata) - # Prepare table and add it to the dataset. - tb = underscore_table(tb) - ds.add(tb) - # Save the dataset. - ds.save() diff --git a/etl/steps/archive/meadow/rff/2022-10-11/emissions_weighted_carbon_price__economy.py b/etl/steps/archive/meadow/rff/2022-10-11/emissions_weighted_carbon_price__economy.py deleted file mode 100644 index 315e4ee67bc..00000000000 --- a/etl/steps/archive/meadow/rff/2022-10-11/emissions_weighted_carbon_price__economy.py +++ /dev/null @@ -1,49 +0,0 @@ -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog -from shared import VERSION, WALDEN_VERSION - -from etl.steps.data.converters import convert_walden_metadata - -# Details for dataset to be created. -DATASET_NAME = "emissions_weighted_carbon_price__economy" -# Details of dataset in walden. -WALDEN_DATASET_NAME = DATASET_NAME - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load data from walden. 
- walden_ds = WaldenCatalog().find_one(namespace="rff", short_name=WALDEN_DATASET_NAME, version=WALDEN_VERSION) - df = pd.read_csv(walden_ds.ensure_downloaded(), dtype=object) - - # - # Process data. - # - # Sanity check. - error = "There should be one row per jurisdiction-year." - assert df[df.duplicated(subset=["jurisdiction", "year"])].empty, error - error = "There should not be any row that only has nan data." - assert df[df.drop(columns=["jurisdiction", "year"]).isnull().all(axis=1)].empty, error - - # Set an index and sort conveniently. - df = df.set_index(["jurisdiction", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # - # Save outputs. - # - # Create new dataset with metadata from walden. - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.metadata.version = VERSION - # Create new table with metadata. - table_metadata = TableMeta(short_name=walden_ds.short_name, title=walden_ds.name, description=walden_ds.description) - tb = Table(df, metadata=table_metadata) - # Prepare table and add it to the dataset. - tb = underscore_table(tb) - ds.add(tb) - # Save the dataset. - ds.save() diff --git a/etl/steps/archive/meadow/rff/2022-10-11/shared.py b/etl/steps/archive/meadow/rff/2022-10-11/shared.py deleted file mode 100644 index 6318ecc58a8..00000000000 --- a/etl/steps/archive/meadow/rff/2022-10-11/shared.py +++ /dev/null @@ -1,10 +0,0 @@ -# Version for dataset to be created. -VERSION = "2022-10-11" -# Version of dataset in walden. -WALDEN_VERSION = VERSION - -# Columns to select and rename in ipcc column names. -IPCC_COLUMNS = { - "IPCC_CODE": "ipcc_code", - "FULLNAME": "sector_name", -} diff --git a/etl/steps/archive/meadow/rff/2022-10-11/world_carbon_pricing.py b/etl/steps/archive/meadow/rff/2022-10-11/world_carbon_pricing.py deleted file mode 100644 index d7bfce8c84f..00000000000 --- a/etl/steps/archive/meadow/rff/2022-10-11/world_carbon_pricing.py +++ /dev/null @@ -1,46 +0,0 @@ -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog -from shared import IPCC_COLUMNS, VERSION, WALDEN_VERSION - -from etl.steps.data.converters import convert_walden_metadata - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load data from walden. - walden_ds = WaldenCatalog().find_one(namespace="rff", short_name="world_carbon_pricing", version=WALDEN_VERSION) - df = pd.read_csv(walden_ds.ensure_downloaded(), dtype=object) - # Load IPCC codes from walden. - walden_ipcc_ds = WaldenCatalog().find_one(namespace="rff", short_name="ipcc_codes", version=WALDEN_VERSION) - ipcc_codes = pd.read_csv(walden_ipcc_ds.ensure_downloaded(), dtype=object) - - # - # Process data. - # - # Prepare IPCC codes dataframe. - ipcc_codes = ipcc_codes[list(IPCC_COLUMNS)].rename(columns=IPCC_COLUMNS) - # Sanity check. - error = "IPCC codes found in data that are missing in IPCC codes file." - assert set(df["ipcc_code"]) <= set(ipcc_codes["ipcc_code"]), error - # Add sector names to data, mapping IPCC codes. - df = pd.merge(df, ipcc_codes, on="ipcc_code", how="left") - - # - # Save outputs. - # - # Create new dataset with metadata from walden. - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.metadata.version = VERSION - # Create new table with metadata. 
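The save boilerplate that follows recurs in every walden-era step in this batch; its skeleton, shown once as a sketch (walden_ds, df, and dest_dir are assumed from the surrounding steps):

    from owid.catalog import Dataset, Table, TableMeta
    from owid.catalog.utils import underscore_table

    from etl.steps.data.converters import convert_walden_metadata

    # walden_ds, df, and dest_dir are assumed from the surrounding step.
    ds = Dataset.create_empty(dest_dir)
    ds.metadata = convert_walden_metadata(walden_ds)
    ds.metadata.version = "2022-10-11"

    table_metadata = TableMeta(short_name=walden_ds.short_name, title=walden_ds.name, description=walden_ds.description)
    tb = underscore_table(Table(df, metadata=table_metadata))
    ds.add(tb)
    ds.save()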
- table_metadata = TableMeta(short_name=walden_ds.short_name, title=walden_ds.name, description=walden_ds.description) - tb = Table(df, metadata=table_metadata) - # Prepare table and add it to the dataset. - tb = underscore_table(tb) - ds.add(tb) - # Save the dataset. - ds.save() diff --git a/etl/steps/archive/meadow/rff/2022-10-11/world_carbon_pricing__subnational.py b/etl/steps/archive/meadow/rff/2022-10-11/world_carbon_pricing__subnational.py deleted file mode 100644 index 0764ebba516..00000000000 --- a/etl/steps/archive/meadow/rff/2022-10-11/world_carbon_pricing__subnational.py +++ /dev/null @@ -1,48 +0,0 @@ -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog -from shared import IPCC_COLUMNS, VERSION, WALDEN_VERSION - -from etl.steps.data.converters import convert_walden_metadata - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load data from walden. - walden_ds = WaldenCatalog().find_one( - namespace="rff", short_name="world_carbon_pricing__subnational", version=WALDEN_VERSION - ) - df = pd.read_csv(walden_ds.ensure_downloaded(), dtype=object) - # Load IPCC codes from walden. - walden_ipcc_ds = WaldenCatalog().find_one(namespace="rff", short_name="ipcc_codes", version=WALDEN_VERSION) - ipcc_codes = pd.read_csv(walden_ipcc_ds.ensure_downloaded(), dtype=object) - - # - # Process data. - # - # Prepare IPCC codes dataframe. - ipcc_codes = ipcc_codes[list(IPCC_COLUMNS)].rename(columns=IPCC_COLUMNS) - # Sanity check. - error = "IPCC codes found in data that are missing in IPCC codes file." - assert set(df["ipcc_code"]) <= set(ipcc_codes["ipcc_code"]), error - # Add sector names to data, mapping IPCC codes. - df = pd.merge(df, ipcc_codes, on="ipcc_code", how="left") - - # - # Save outputs. - # - # Create new dataset with metadata from walden. - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.metadata.version = VERSION - # Create new table with metadata. - table_metadata = TableMeta(short_name=walden_ds.short_name, title=walden_ds.name, description=walden_ds.description) - tb = Table(df, metadata=table_metadata) - # Prepare table and add it to the dataset. - tb = underscore_table(tb) - ds.add(tb) - # Save the dataset. - ds.save() diff --git a/etl/steps/archive/meadow/shift/2022-07-18/fossil_fuel_production.py b/etl/steps/archive/meadow/shift/2022-07-18/fossil_fuel_production.py deleted file mode 100644 index 796eb78aeac..00000000000 --- a/etl/steps/archive/meadow/shift/2022-07-18/fossil_fuel_production.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Meadow step for Shift data on energy production from fossil fuels. - -""" -from pathlib import Path - -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog -from structlog import get_logger - -from etl.steps.data.converters import convert_walden_metadata - -log = get_logger() - -NAMESPACE = "shift" -DATASET_SHORT_NAME = "fossil_fuel_production" -VERSION = Path(__file__).parent.name - - -def run(dest_dir: str) -> None: - log.info(f"{DATASET_SHORT_NAME}.start") - - # Load data from walden. - walden_ds = WaldenCatalog().find_one(namespace=NAMESPACE, short_name=DATASET_SHORT_NAME, version=VERSION) - local_file = walden_ds.ensure_downloaded() - df = pd.read_csv(local_file) - - # Create new dataset using metadata from walden. 
- ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.save() - - # Create a table in the dataset with the same metadata as the dataset. - table_metadata = TableMeta( - short_name=walden_ds.short_name, - title=walden_ds.name, - description=walden_ds.description, - ) - tb = Table(df, metadata=table_metadata) - - # Ensure all columns are lower-case and snake-case. - tb = underscore_table(tb) - - # Add table to a dataset. - ds.add(tb) - - log.info(f"{DATASET_SHORT_NAME}.end") diff --git a/etl/steps/archive/meadow/smil/2017-01-01/global_primary_energy.py b/etl/steps/archive/meadow/smil/2017-01-01/global_primary_energy.py deleted file mode 100644 index 93eb31c696c..00000000000 --- a/etl/steps/archive/meadow/smil/2017-01-01/global_primary_energy.py +++ /dev/null @@ -1,51 +0,0 @@ -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog -from structlog import get_logger - -from etl.helpers import PathFinder -from etl.steps.data.converters import convert_walden_metadata - -log = get_logger() - -# naming conventions -N = PathFinder(__file__) - -# Version of meadow dataset to be created. -VERSION = "2017-01-01" -# Walden version of the dataset. -WALDEN_VERSION = "2017-01-01" - - -def run(dest_dir: str) -> None: - log.info("global_primary_energy.start") - - # Retrieve raw data from walden. - walden_ds = WaldenCatalog().find_one(namespace="smil", short_name="global_primary_energy", version=WALDEN_VERSION) - local_file = walden_ds.ensure_downloaded() - df = pd.read_csv(local_file) - - # Create a new meadow dataset and reuse walden metadata. - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.metadata.version = VERSION - - # Create a new table with metadata from the dataset. - table_metadata = TableMeta(short_name=walden_ds.short_name, title=walden_ds.name, description=walden_ds.description) - tb = Table(df, metadata=table_metadata) - # Use the current names of the columns as the variable titles in the metadata. - for column in tb.columns: - tb[column].metadata.title = column - # Change all columns to be lower snake case. - tb = underscore_table(tb) - # Set table index. - tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index() - - # Add table to dataset. - ds.add(tb) - - # Save dataset. - ds.save() - - log.info("global_primary_energy.end") diff --git a/etl/steps/archive/meadow/uk_beis/2022-07-28/uk_historical_electricity.py b/etl/steps/archive/meadow/uk_beis/2022-07-28/uk_historical_electricity.py deleted file mode 100644 index a9924e7ff55..00000000000 --- a/etl/steps/archive/meadow/uk_beis/2022-07-28/uk_historical_electricity.py +++ /dev/null @@ -1,175 +0,0 @@ -from typing import Dict, List - -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore_table -from owid.walden import Catalog as WaldenCatalog - -from etl.helpers import PathFinder -from etl.steps.data.converters import convert_walden_metadata - -# Get relevant paths for current file. -paths = PathFinder(__file__) - - -def prepare_data(df: pd.DataFrame, expected_content: Dict[int, List[str]], columns: Dict[int, str]) -> pd.DataFrame: - """Prepare raw content of a specific sheet in the the BEIS excel file (loaded with a simple pd.read_excel(...)). 
- - It contains some sanity checks due to the poor formatting of the original file, and some basic processing (like - removing footnote marks from the years, e.g. "2000 (5)" -> 2000). Duplicate rows are not removed. - - Parameters - ---------- - df : pd.DataFrame - expected_content : dict - Words that are expected to be found in any row of specific columns in the data. The dictionary key should be the - column number (where the 0th column is expected to be the years variable), and the value should be a list of - words (for example, a column that contains data for natural gas may have the words "Natural" and "gas" spread - across two different rows). This is used to check that the columns are in the order we expected. - columns : dict - Columns to select from data, and how to rename them. The dictionary key should be the column number, and the - value should be the new name for that column. - - Returns - ------- - df : pd.DataFrame - Clean data extracted, with proper column names. - - """ - df = df.copy() - - # Check that certain words are contained in specific columns, to ensure that they contain the data we expect. - for column in expected_content: - expected_elements = expected_content[column] - for element in expected_elements: - error = f"Excel file may have changed structure (expected {element} in column {column})." - assert df[df.columns[column]].str.contains(element, regex=False).any(), error - - # Select columns and how to rename them. - df = df.loc[:, df.columns[list(columns)]].rename(columns={df.columns[i]: columns[i] for i in columns}) - - # Remove all rows for which the year column does not start with an integer of 4 digits. - df = df.loc[df["year"].astype(str).str.contains(r"^\d{4}", regex=True, na=False)].reset_index(drop=True) - # Remove annotations from years (e.g. replace "1987 (5)" by 1987). - df["year"] = df["year"].astype(str).str[0:4].astype(int) - - # Make all columns float (except year column). - df.astype({column: float for column in df.columns if column != "year"}) - - return df - - -def run(dest_dir: str) -> None: - # - # Load data. - # - # Retrieve raw data from walden. - walden_ds: WaldenCatalog = paths.load_dependency("uk_historical_electricity") - local_file = walden_ds.ensure_downloaded() - - # Load data from the two relevant sheets of the excel file. - # The original excel file is poorly formatted and will be hard to parse automatically. - fuel_input = pd.read_excel(local_file, sheet_name="Fuel input") - supply = pd.read_excel(local_file, sheet_name="Supply, availability & consump") - efficiency = pd.read_excel(local_file, sheet_name="Generated and supplied") - - # - # Process data. - # - # Process data from the sheet about fuels input for electricity generation. - fuel_input = prepare_data( - df=fuel_input, - expected_content={ - 1: ["Total", "all", "fuels"], - 2: ["Coal"], - 3: ["Oil"], - 4: ["Natural", "gas"], - 5: ["Nuclear"], - 6: ["Natural", "flow hydro"], - 7: ["Wind", "and solar"], - 9: ["Other", "fuels"], - }, - columns={ - 0: "year", - 1: "all_sources", - 2: "coal", - 3: "oil", - 4: "gas", - 5: "nuclear", - 6: "hydro", - 7: "wind_and_solar", - 9: "other", - }, - ) - - # Prepare data from the sheet about electricity supply, availability and consumption. - supply = prepare_data( - df=supply, - expected_content={ - 1: ["Electricity", "supplied"], - 3: ["Net", "Imports"], - }, - columns={ - 0: "year", - 1: "electricity_generation", - 3: "net_imports", - }, - ) - - # Prepare data from the sheet about electricity generated and supplied. 
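prepare_data's year cleanup, isolated as a runnable sketch (toy values); note that the archived code calls df.astype(...) in its last step without assigning the result, which is a no-op, so the cast below is reassigned:

    import pandas as pd

    df = pd.DataFrame({"year": ["Notes", "1987 (5)", "2000", None], "coal": ["", "61.3", "34.1", ""]})

    # Keep only rows whose year starts with four digits, then strip footnote
    # marks such as "1987 (5)" down to 1987.
    df = df.loc[df["year"].astype(str).str.contains(r"^\d{4}", regex=True, na=False)].reset_index(drop=True)
    df["year"] = df["year"].astype(str).str[0:4].astype(int)

    # Cast the remaining columns to float; the result must be assigned back.
    df = df.astype({column: float for column in df.columns if column != "year"})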
- efficiency = prepare_data( - df=efficiency, - expected_content={ - 33: ["Implied", "Efficiency"], - }, - columns={ - 0: "year", - 33: "implied_efficiency", - }, - ) - - # - # Save outputs. - # - # Create new dataset and reuse walden metadata. - ds = Dataset.create_empty(dest_dir) - ds.metadata = convert_walden_metadata(walden_ds) - ds.metadata.version = "2022-07-28" - - # Create tables using metadata from walden. - tb_fuel_input = Table( - fuel_input, - metadata=TableMeta( - short_name="fuel_input", - title="Fuel input for electricity generation", - description=walden_ds.description, - ), - ) - tb_supply = Table( - supply, - metadata=TableMeta( - short_name="supply", - title="Electricity supply, availability and consumption", - description=walden_ds.description, - ), - ) - tb_efficiency = Table( - efficiency, - metadata=TableMeta( - short_name="efficiency", - title="Electricity generated and supplied", - description=walden_ds.description, - ), - ) - - # Underscore all table columns. - tb_fuel_input = underscore_table(tb_fuel_input) - tb_supply = underscore_table(tb_supply) - - # Add tables to a dataset. - ds.add(tb_fuel_input) - ds.add(tb_supply) - ds.add(tb_efficiency) - - # Save the dataset. - ds.save() diff --git a/etl/steps/archive/meadow/un/2019/un_wpp.py b/etl/steps/archive/meadow/un/2019/un_wpp.py deleted file mode 100644 index fc8af657536..00000000000 --- a/etl/steps/archive/meadow/un/2019/un_wpp.py +++ /dev/null @@ -1,205 +0,0 @@ -#!/usr/bin/env python -# pyright: reportUnusedExpression=false -# coding: utf-8 -# %% [markdown] -# # UN World Population Prospects (2019) - -# %% -import tempfile -import zipfile - -import pandas as pd -from owid import walden -from owid.catalog import Dataset, Table - -from etl.steps.data import converters - -# %% [markdown] -# ## Find walden file - -# %% -walden_ds = walden.Catalog().find_one("wpp", "2019", "standard_projections") -walden_ds - - -# %% [markdown] -# ## Unzip the data - -# %% -temp_dir = tempfile.mkdtemp() - -zipfile.ZipFile(walden_ds.local_path).extractall(temp_dir) - -# %% -# !ls {temp_dir}/WPP2019 - -# %% [markdown] -# ## Total population - -# %% -df = pd.read_csv(f"{temp_dir}/WPP2019/WPP2019_TotalPopulationBySex.csv") - - -# %% -df.head() - - -# %% -df.columns = [ - "loc_id", - "location", - "var_id", - "variant", - "year", - "mid_period", - "population_male", - "population_female", - "population_total", - "population_density", -] - - -# %% -t1 = Table(df[["loc_id", "location"]].drop_duplicates().set_index("loc_id")) -t1.metadata.short_name = "location_codes" - - -# %% -t2 = Table(df[["var_id", "variant"]].drop_duplicates().set_index("var_id")) -t2.metadata.short_name = "variant_codes" - - -# %% -df.drop(columns=["loc_id", "var_id"], inplace=True) - - -# %% -for col in ["location", "variant"]: - df[col] = df[col].astype("category") - - -# %% -df.set_index(["variant", "location", "year"], inplace=True) - - -# %% -df - - -# %% -df.index.levels[0] # type: ignore - - -# %% -t3 = Table(df) -t3.metadata.short_name = "total_population" - -# %% [markdown] -# ## Fertility by age - -# %% -df = pd.read_csv(f"{temp_dir}/WPP2019/WPP2019_Fertility_by_Age.csv") - - -# %% -df.head() - - -# %% -df.drop(columns=["LocID", "VarID", "MidPeriod", "AgeGrpStart", "AgeGrpSpan"], inplace=True) - - -# %% -df.columns = [ - "location", - "variant", - "year_range", - "age_group", - "asfr", - "pasfr", - "births", -] - - -# %% -df.head() - - -# %% -for col in ["location", "variant", "year_range", "age_group"]: - df[col] = df[col].astype("category") - - -# %% 
-df.set_index(["variant", "location", "year_range", "age_group"], inplace=True) - - -# %% -t4 = Table(df) -t4.metadata.short_name = "fertility_by_age" - -# %% [markdown] -# ## Population by age and sex - -# %% -df = pd.read_csv(f"{temp_dir}/WPP2019/WPP2019_PopulationByAgeSex_Medium.csv") - - -# %% -df.head() - - -# %% -df.drop(columns=["LocID", "VarID", "MidPeriod", "AgeGrpStart", "AgeGrpSpan"], inplace=True) - - -# %% -df.columns = [ - "location", - "variant", - "year", - "age_group", - "population_male", - "population_female", - "population_total", -] - - -# %% -df.head() - - -# %% -for col in ["location", "variant", "age_group"]: - df[col] = df[col].astype("category") - - -# %% -df.set_index(["variant", "location", "year", "age_group"], inplace=True) - - -# %% -df.head() - - -# %% -t5 = Table(df) -t5.metadata.short_name = "population_by_age_sex" - - -# %% [markdown] -# ## Save the dataset to disk - - -# %% -def run(dest_dir: str) -> None: - ds = Dataset.create_empty(dest_dir) - ds.metadata = converters.convert_walden_metadata(walden_ds) - ds.metadata.namespace = "un" - ds.metadata.short_name = "un_wpp" - ds.add(t1) - ds.add(t2) - ds.add(t3) - ds.add(t4) - ds.add(t5) - ds.save() diff --git a/etl/steps/archive/meadow/un/2022-07-07/un_sdg.py b/etl/steps/archive/meadow/un/2022-07-07/un_sdg.py deleted file mode 100644 index f983713cad7..00000000000 --- a/etl/steps/archive/meadow/un/2022-07-07/un_sdg.py +++ /dev/null @@ -1,66 +0,0 @@ -import re -from pathlib import Path - -import pandas as pd -from owid.catalog import Dataset, Table, TableMeta -from owid.catalog.utils import underscore -from owid.walden import Catalog -from structlog import get_logger - -from etl.steps.data.converters import convert_walden_metadata - -BASE_URL = "https://unstats.un.org/sdgapi" -log = get_logger() - - -def run(dest_dir: str, query: str = "") -> None: - # retrieves raw data from walden - version = Path(__file__).parent.stem - fname = Path(__file__).stem - namespace = "un_sdg" - - walden_ds = Catalog().find_one(namespace=namespace, short_name=fname, version=version) - - log.info("un_sdg.start") - local_file = walden_ds.ensure_downloaded() - df = pd.read_feather(local_file) - - if query: - df = df.query(query) - - log.info("un_sdg.load_and_clean") - df = load_and_clean(df) - log.info("Size of dataframe", rows=df.shape[0], colums=df.shape[1]) - df.columns = [underscore(c) for c in df.columns] - df = df.reset_index() - ds = Dataset.create_empty(dest_dir) - - ds.metadata = convert_walden_metadata(walden_ds) - tb = Table(df) - tb.metadata = TableMeta( - short_name=Path(__file__).stem, - title=walden_ds.name, - description=walden_ds.description, - ) - ds.add(tb) - ds.save() - log.info("un_sdg.end") - - -def load_and_clean(original_df: pd.DataFrame) -> pd.DataFrame: - # Load and clean the data - log.info("Reading in original data...") - original_df = original_df.copy(deep=False) - - # removing values that aren't numeric e.g. 
Null and N values - original_df.dropna(subset=["Value"], inplace=True) - original_df.dropna(subset=["TimePeriod"], how="all", inplace=True) - original_df = original_df.loc[pd.to_numeric(original_df["Value"], errors="coerce").notnull()] - original_df.rename(columns={"GeoAreaName": "Country", "TimePeriod": "Year"}, inplace=True) - original_df = original_df.rename(columns=lambda k: re.sub(r"[\[\]]", "", k)) # type: ignore - return original_df - - -if __name__ == "__main__": - # test script for a single indicator with `python etl/steps/data/meadow/un_sdg/2022-05-26/un_sdg.py` - run("/tmp/un_sdg", query="Indicator == '1.1.1'") diff --git a/etl/steps/data/garden/agriculture/2023-10-04/share_of_agriculture_in_gdp.py b/etl/steps/data/garden/agriculture/2023-10-04/share_of_agriculture_in_gdp.py index a4968d65180..02d9f00c5ef 100644 --- a/etl/steps/data/garden/agriculture/2023-10-04/share_of_agriculture_in_gdp.py +++ b/etl/steps/data/garden/agriculture/2023-10-04/share_of_agriculture_in_gdp.py @@ -11,7 +11,7 @@ def run(dest_dir: str) -> None: # Load data from snapshot. # snap = paths.load_snapshot() - tb = snap.read().set_index(["country", "year"]) + tb = snap.read(safe_types=False).set_index(["country", "year"]) # # Save outputs. diff --git a/etl/steps/data/garden/animal_welfare/2023-09-05/bullfighting_laws.meta.yml b/etl/steps/data/garden/animal_welfare/2023-09-05/bullfighting_laws.meta.yml index 39a73daabd9..20bc15814d3 100644 --- a/etl/steps/data/garden/animal_welfare/2023-09-05/bullfighting_laws.meta.yml +++ b/etl/steps/data/garden/animal_welfare/2023-09-05/bullfighting_laws.meta.yml @@ -27,6 +27,6 @@ tables: note: "Countries where bullfighting is not banned may have sub-national bans. Partially banned means that the bull cannot be severely injured or killed during the event." subtitle: >- Bullfighting is a physical contest that involves a bullfighter attempting to subdue, immobilize, or kill a bull. - hasChartTab: false + chartTypes: [] hasMapTab: true tab: map diff --git a/etl/steps/data/garden/animal_welfare/2024-06-04/bullfighting_laws.meta.yml b/etl/steps/data/garden/animal_welfare/2024-06-04/bullfighting_laws.meta.yml index 39a73daabd9..20bc15814d3 100644 --- a/etl/steps/data/garden/animal_welfare/2024-06-04/bullfighting_laws.meta.yml +++ b/etl/steps/data/garden/animal_welfare/2024-06-04/bullfighting_laws.meta.yml @@ -27,6 +27,6 @@ tables: note: "Countries where bullfighting is not banned may have sub-national bans. Partially banned means that the bull cannot be severely injured or killed during the event." subtitle: >- Bullfighting is a physical contest that involves a bullfighter attempting to subdue, immobilize, or kill a bull. - hasChartTab: false + chartTypes: [] hasMapTab: true tab: map diff --git a/etl/steps/data/garden/animal_welfare/2024-09-13/fur_laws.py b/etl/steps/data/garden/animal_welfare/2024-09-13/fur_laws.py index 8c1e3633348..6cae3e4bffd 100644 --- a/etl/steps/data/garden/animal_welfare/2024-09-13/fur_laws.py +++ b/etl/steps/data/garden/animal_welfare/2024-09-13/fur_laws.py @@ -65,11 +65,11 @@ def run(dest_dir: str) -> None: # # Load meadow dataset and read its main table. ds_meadow = paths.load_dataset("fur_laws") - tb = ds_meadow.read_table("fur_laws") + tb = ds_meadow.read("fur_laws") # Load regions dataset and read its main table. ds_regions = paths.load_dataset("regions") - tb_regions = ds_regions.read_table("regions") + tb_regions = ds_regions.read("regions") # # Process data. 
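load_and_clean above keeps only values that parse as numbers; the core idiom, isolated on a toy column:

    import pandas as pd

    df = pd.DataFrame({"Value": ["3.2", "N", None, "7"], "TimePeriod": [2000, 2001, 2002, 2003]})

    # Drop missing values, then anything pd.to_numeric cannot parse (e.g. "N").
    df = df.dropna(subset=["Value"])
    df = df.loc[pd.to_numeric(df["Value"], errors="coerce").notnull()]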
diff --git a/etl/steps/data/garden/antibiotics/2024-10-09/gram.meta.yml b/etl/steps/data/garden/antibiotics/2024-10-09/gram.meta.yml index 7de8a98ab7f..418856201ac 100644 --- a/etl/steps/data/garden/antibiotics/2024-10-09/gram.meta.yml +++ b/etl/steps/data/garden/antibiotics/2024-10-09/gram.meta.yml @@ -17,15 +17,36 @@ tables: variables: antibiotic_consumption__ddd_1_000_day: title: Antibiotic consumption (DDD per 1,000 inhabitants per day) - description_short: Estimated defined daily doses (DDD) per 1,000 people per day. + description_short: Estimated [Defined Daily Doses](#dod:defined-daily-doses) per 1,000 people per day. unit: DDD per 1,000 people per day display: numDecimalPlaces: 1 + origins: + - title: Global Research on Antimicrobial Resistance (GRAM) + date_published: '2021-11-11' + producer: Browne AJ et al. (2021) + citation_full: "Browne AJ, Chipeta MG, Haines-Woodhouse G, et al. Global antibiotic consumption and usage in humans, 2000 to 2018: a spatial modelling study. Lancet Planetary Health 2021." + url_main: https://www.tropicalmedicine.ox.ac.uk/gram/research/visualisation-app-antibiotic-usage-and-consumption + date_accessed: '2024-10-09' lower_uncertainty_interval: title: Lower uncertainty interval unit: DDD per 1,000 people per day + origins: + - title: Global Research on Antimicrobial Resistance (GRAM) + date_published: '2021-11-11' + producer: Browne AJ et al. (2021) + citation_full: "Browne AJ, Chipeta MG, Haines-Woodhouse G, et al. Global antibiotic consumption and usage in humans, 2000 to 2018: a spatial modelling study. Lancet Planetary Health 2021." + url_main: https://www.tropicalmedicine.ox.ac.uk/gram/research/visualisation-app-antibiotic-usage-and-consumption + date_accessed: '2024-10-09' upper_uncertainty_interval: title: Upper uncertainty interval unit: DDD per 1,000 people per day + origins: + - title: Global Research on Antimicrobial Resistance (GRAM) + date_published: '2021-11-11' + producer: Browne AJ et al. (2021) + citation_full: "Browne AJ, Chipeta MG, Haines-Woodhouse G, et al. Global antibiotic consumption and usage in humans, 2000 to 2018: a spatial modelling study. Lancet Planetary Health 2021." + url_main: https://www.tropicalmedicine.ox.ac.uk/gram/research/visualisation-app-antibiotic-usage-and-consumption + date_accessed: '2024-10-09' diff --git a/etl/steps/data/garden/antibiotics/2024-10-09/gram.py b/etl/steps/data/garden/antibiotics/2024-10-09/gram.py index e080214eece..3d6609f55c2 100644 --- a/etl/steps/data/garden/antibiotics/2024-10-09/gram.py +++ b/etl/steps/data/garden/antibiotics/2024-10-09/gram.py @@ -1,10 +1,15 @@ """Load a meadow dataset and create a garden dataset.""" +from owid.catalog import Dataset, Table +from owid.catalog import processing as pr + from etl.data_helpers import geo +from etl.data_helpers.geo import add_population_to_table, list_members_of_region from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. paths = PathFinder(__file__) +REGIONS = ["Africa", "North America", "South America", "Asia", "Europe", "Oceania"] def run(dest_dir: str) -> None: @@ -13,7 +18,10 @@ def run(dest_dir: str) -> None: # # Load meadow dataset. ds_meadow = paths.load_dataset("gram") - + # Add population dataset + ds_population = paths.load_dataset("population") + # Add regions dataset + ds_regions = paths.load_dataset("regions") # Read table from meadow dataset. 
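The add_regional_totals helper added at the end of this file's diff back-calculates absolute doses from per-1,000 rates, sums them by region, and re-derives the rate; the arithmetic in isolation (toy numbers):

    import pandas as pd

    tb = pd.DataFrame(
        {
            "country": ["A", "B"],
            "year": [2018, 2018],
            "ddd_per_1000_per_day": [20.0, 10.0],
            "population": [1_000_000, 3_000_000],
        }
    )

    # Rate -> absolute doses, so that summing across countries is meaningful.
    tb["ddd_total"] = tb["ddd_per_1000_per_day"] / 1000 * tb["population"]

    # Sum absolutes per region-year, then convert back to a weighted rate.
    region = tb.groupby("year")[["ddd_total", "population"]].sum().reset_index()
    region["ddd_per_1000_per_day"] = region["ddd_total"] / region["population"] * 1000
    # -> (20 * 1e6 + 10 * 3e6) / 4e6 = 12.5 DDD per 1,000 people per day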
tb = ds_meadow["gram"].reset_index() @@ -23,6 +31,11 @@ def run(dest_dir: str) -> None: tb = geo.harmonize_countries( df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path ) + # Add population to the table + tb = add_population_to_table(tb, ds_population) + # Calculate total DDDs + tb = add_regional_totals(tb, ds_regions) + tb = tb.format(["country", "year"]) # @@ -35,3 +48,43 @@ def run(dest_dir: str) -> None: # Save changes in the new garden dataset. ds_garden.save() + + +def add_regional_totals(tb: Table, ds_regions: Dataset) -> Table: + """Add regional totals to the table.""" + # First back-calculate the total DDDs + tb["antibiotics_ddd"] = (tb["antibiotic_consumption__ddd_1_000_day"] / 1000) * tb["population"] + tb["lower_uncertainty_interval_ddd"] = (tb["lower_uncertainty_interval"] / 1000) * tb["population"] + tb["upper_uncertainty_interval_ddd"] = (tb["upper_uncertainty_interval"] / 1000) * tb["population"] + # Then calculate the regional totals + for region in REGIONS: + countries = list_members_of_region(region=region, ds_regions=ds_regions) + tb_region = tb.loc[tb["country"].isin(countries)] + tb_region = ( + tb_region.groupby(["year"])[ + ["population", "antibiotics_ddd", "lower_uncertainty_interval_ddd", "upper_uncertainty_interval_ddd"] + ] + .sum() + .reset_index() + ) + tb_region["antibiotic_consumption__ddd_1_000_day"] = ( + tb_region["antibiotics_ddd"] / tb_region["population"] * 1000 + ) + tb_region["lower_uncertainty_interval"] = ( + tb_region["lower_uncertainty_interval_ddd"] / tb_region["population"] * 1000 + ) + tb_region["upper_uncertainty_interval"] = ( + tb_region["upper_uncertainty_interval_ddd"] / tb_region["population"] * 1000 + ) + tb_region["country"] = region + + tb = pr.concat([tb, tb_region]) + tb = tb.drop( + columns=[ + "population", + "antibiotics_ddd", + "lower_uncertainty_interval_ddd", + "upper_uncertainty_interval_ddd", + ] + ) + return tb diff --git a/etl/steps/data/garden/antibiotics/2024-10-09/gram_level.meta.yml b/etl/steps/data/garden/antibiotics/2024-10-09/gram_level.meta.yml index 00c16cb547f..5ca37db3831 100644 --- a/etl/steps/data/garden/antibiotics/2024-10-09/gram_level.meta.yml +++ b/etl/steps/data/garden/antibiotics/2024-10-09/gram_level.meta.yml @@ -17,10 +17,17 @@ tables: variables: antibiotic_consumption__ddd_1_000_day: title: Antibiotic consumption of << atc_level_3_class >> (DDD per 1,000 people) - description_short: Estimated defined daily doses (DDD) of << atc_level_3_class >> per 1,000 people. + description_short: Estimated [Defined Daily Doses](#dod:defined-daily-doses) of << atc_level_3_class >> per 1,000 people. unit: DDD per 1,000 people per day display: numDecimalPlaces: 1 name: << atc_level_3_class >> + origins: + - title: Global Research on Antimicrobial Resistance (GRAM) - by antibiotic group + date_published: '2021-11-11' + producer: Browne AJ et al. (2021) + citation_full: "Browne AJ, Chipeta MG, Haines-Woodhouse G, et al. Global antibiotic consumption and usage in humans, 2000 to 2018: a spatial modelling study. Lancet Planetary Health 2021." 
+ url_main: https://www.tropicalmedicine.ox.ac.uk/gram/research/visualisation-app-antibiotic-usage-and-consumption + date_accessed: '2024-10-09' diff --git a/etl/steps/data/garden/antibiotics/2024-10-09/gram_level.py b/etl/steps/data/garden/antibiotics/2024-10-09/gram_level.py index bbab33052fe..fdedd02d373 100644 --- a/etl/steps/data/garden/antibiotics/2024-10-09/gram_level.py +++ b/etl/steps/data/garden/antibiotics/2024-10-09/gram_level.py @@ -1,10 +1,14 @@ """Load a meadow dataset and create a garden dataset.""" +from owid.catalog import Dataset, Table +from owid.catalog import processing as pr from etl.data_helpers import geo +from etl.data_helpers.geo import add_population_to_table, list_members_of_region from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. paths = PathFinder(__file__) +REGIONS = ["Africa", "North America", "South America", "Asia", "Europe", "Oceania"] def run(dest_dir: str) -> None: @@ -13,7 +17,10 @@ def run(dest_dir: str) -> None: # # Load meadow dataset. ds_meadow = paths.load_dataset("gram_level") - + # Add population dataset + ds_population = paths.load_dataset("population") + # Add regions dataset + ds_regions = paths.load_dataset("regions") # Read table from meadow dataset. tb = ds_meadow["gram_level"].reset_index() @@ -23,6 +30,10 @@ def run(dest_dir: str) -> None: tb = geo.harmonize_countries( df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path ) + # Add population to the table + tb = add_population_to_table(tb, ds_population) + # Calculate total DDDs + tb = add_regional_totals(tb, ds_regions) tb = tb.format(["country", "year", "atc_level_3_class"]) # @@ -35,3 +46,29 @@ def run(dest_dir: str) -> None: # Save changes in the new garden dataset. 
ds_garden.save() + + +def add_regional_totals(tb: Table, ds_regions: Dataset) -> Table: + """Add regional totals to the table.""" + # First back-calculate the total DDDs + tb["antibiotics_ddd"] = (tb["antibiotic_consumption__ddd_1_000_day"] / 1000) * tb["population"] + # Then calculate the regional totals + for region in REGIONS: + countries = list_members_of_region(region=region, ds_regions=ds_regions) + tb_region = tb.loc[tb["country"].isin(countries)] + tb_region = ( + tb_region.groupby(["year", "atc_level_3_class"])[["population", "antibiotics_ddd"]].sum().reset_index() + ) + tb_region["antibiotic_consumption__ddd_1_000_day"] = ( + tb_region["antibiotics_ddd"] / tb_region["population"] * 1000 + ) + tb_region["country"] = region + + tb = pr.concat([tb, tb_region]) + tb = tb.drop( + columns=[ + "population", + "antibiotics_ddd", + ] + ) + return tb diff --git a/etl/steps/data/garden/antibiotics/2024-10-18/who_glass.countries.json b/etl/steps/data/garden/antibiotics/2024-10-18/who_glass.countries.json new file mode 100644 index 00000000000..e3944dffd50 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-10-18/who_glass.countries.json @@ -0,0 +1,107 @@ +{ + "Afghanistan": "Afghanistan", + "Argentina": "Argentina", + "Australia": "Australia", + "Austria": "Austria", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Belgium": "Belgium", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Brazil": "Brazil", + "Brunei Darussalam": "Brunei", + "Burkina Faso": "Burkina Faso", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "China, Hong Kong SAR": "Hong Kong", + "Colombia": "Colombia", + "Croatia": "Croatia", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Egypt": "Egypt", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Finland": "Finland", + "France": "France", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Italy": "Italy", + "Japan": "Japan", + "Jordan": "Jordan", + "Kenya": "Kenya", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Lao People's Democratic Republic": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Liberia": "Liberia", + "Libya": "Libya", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Mauritius": "Mauritius", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nepal": "Nepal", + "Nigeria": "Nigeria", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Papua New Guinea": "Papua New Guinea", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Saudi Arabia": "Saudi Arabia", + "Singapore": "Singapore", + "Somalia": "Somalia", + "South Africa": "South Africa", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Thailand": 
"Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States of America": "United States", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Cte dIvoire": "Cote d'Ivoire", + "Netherlands (Kingdom of the)": "Netherlands", + "Trkiye": "Turkey", + "occupied Palestinian territory, including east Jerusalem": "Palestine" +} \ No newline at end of file diff --git a/etl/steps/data/garden/antibiotics/2024-10-18/who_glass.meta.yml b/etl/steps/data/garden/antibiotics/2024-10-18/who_glass.meta.yml new file mode 100644 index 00000000000..cac0c8cc3e4 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-10-18/who_glass.meta.yml @@ -0,0 +1,59 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Global Health + - Antibiotics + display: + name: << syndrome.capitalize() >> + syndrome: + <%- if syndrome == "BLOOD" %> + bloodstream + <%- elif syndrome == "STOOL" %> + gastrointestinal + <%- elif syndrome == "URINE" %> + urinary tract + <%- elif syndrome == "UROGENITAL" %> + gonorrohea + <% endif %> + + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + who_glass: + variables: + bcispermillion: + title: Samples tested and confirmed to be {definitions.syndrome} infections per million population + unit: "confirmed infections per million" + presentation: + title_public: Samples tested and confirmed to be {definitions.syndrome} infections per million population + totalspecimenisolates: + title: Total specimens collected of {definitions.syndrome} infections + unit: "specimens" + presentation: + title_public: Total specimen collected of {definitions.syndrome} infections + isolspermillion: + title: Samples tested and confirmed to be {definitions.syndrome} infections with antibiotic susceptibility test results per million population + unit: "confirmed infections per million" + presentation: + title_public: Samples tested and confirmed to be {definitions.syndrome} infections with antibiotic susceptibility test results per million population + totalspecimenisolateswithast: + title: Samples tested and confirmed to be {definitions.syndrome} infections with antibiotic susceptibility test results + unit: "confirmed infections" + presentation: + title_public: Samples tested and confirmed to be {definitions.syndrome} infections with antibiotic susceptibility test results + astresult: + title: Share of samples tested and confirmed to be {definitions.syndrome} infections with antibiotic susceptibility test results + unit: "%" + short_unit: "%" + display: + numDecimalPlaces: 1 + presentation: + title_public: Share of samples tested and confirmed to be {definitions.syndrome} infections with antibiotic susceptibility test results \ No newline at end of file diff --git a/etl/steps/data/garden/antibiotics/2024-10-18/who_glass.py b/etl/steps/data/garden/antibiotics/2024-10-18/who_glass.py new file mode 100644 index 00000000000..ced77166136 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-10-18/who_glass.py @@ -0,0 +1,35 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import 
PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("who_glass") + # Add population dataset. + # Read table from meadow dataset. + tb = ds_meadow["who_glass"].reset_index() + + # + # Process data. + # + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + tb = tb.format(["country", "year", "syndrome"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/garden/antibiotics/2024-10-18/who_glass_by_antibiotic.countries.json b/etl/steps/data/garden/antibiotics/2024-10-18/who_glass_by_antibiotic.countries.json new file mode 100644 index 00000000000..e3944dffd50 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-10-18/who_glass_by_antibiotic.countries.json @@ -0,0 +1,107 @@ +{ + "Afghanistan": "Afghanistan", + "Argentina": "Argentina", + "Australia": "Australia", + "Austria": "Austria", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Belgium": "Belgium", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Brazil": "Brazil", + "Brunei Darussalam": "Brunei", + "Burkina Faso": "Burkina Faso", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "China, Hong Kong SAR": "Hong Kong", + "Colombia": "Colombia", + "Croatia": "Croatia", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Egypt": "Egypt", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Finland": "Finland", + "France": "France", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Italy": "Italy", + "Japan": "Japan", + "Jordan": "Jordan", + "Kenya": "Kenya", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Lao People's Democratic Republic": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Liberia": "Liberia", + "Libya": "Libya", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Mauritius": "Mauritius", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nepal": "Nepal", + "Nigeria": "Nigeria", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Papua New Guinea": "Papua New Guinea", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Saudi Arabia": "Saudi Arabia", + "Singapore": "Singapore", + "Somalia": "Somalia", + "South Africa": "South Africa", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Thailand": "Thailand", + 
"Timor-Leste": "East Timor", + "Togo": "Togo", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States of America": "United States", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Cte dIvoire": "Cote d'Ivoire", + "Netherlands (Kingdom of the)": "Netherlands", + "Trkiye": "Turkey", + "occupied Palestinian territory, including east Jerusalem": "Palestine" +} \ No newline at end of file diff --git a/etl/steps/data/garden/antibiotics/2024-10-18/who_glass_by_antibiotic.meta.yml b/etl/steps/data/garden/antibiotics/2024-10-18/who_glass_by_antibiotic.meta.yml new file mode 100644 index 00000000000..f9d4d45bc1a --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-10-18/who_glass_by_antibiotic.meta.yml @@ -0,0 +1,40 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Antibiotics + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + bci_table: + variables: + bcis_per_million: + title: Rate of << syndrome.lower() >> infections where the pathogen is << pathogen >> + description_short: Bacteriologically confirmed << syndrome.lower() >> infections where the pathogen is << pathogen >> per million people. + unit: infections per million + total_bcis: + title: Total << syndrome.lower() >> infections where the pathogen is << pathogen >> + description_short: Total bacteriologically confirmed << syndrome.lower() >> infections where the pathogen is << pathogen >> + unit: infections + antibiotic_table: + variables: + bcis_with_ast_per_million: + title: Rate of << syndrome.lower() >> infections with susceptibility to << antibiotic.lower() >> where the pathogen is << pathogen >> + description_short: Bacteriologically confirmed << syndrome.lower() >> infections with susceptibility to << antibiotic.lower() >> where the pathogen is << pathogen >> per million people. + unit: infections per million + total_bcis_with_ast: + title: Total << syndrome.lower() >> infections with susceptibility to << antibiotic >> where the pathogen is << pathogen >> + description_short: Total bacteriologically confirmed << syndrome.lower() >> infections with susceptibility to << antibiotic >> where the pathogen is << pathogen >>. + unit: infections + share_bcis_with_ast: + title: Share of << syndrome.lower() >> infections with susceptibility to << antibiotic >> where the pathogen is << pathogen >> + description_short: Share of bacteriologically confirmed << syndrome.lower() >> infections with susceptibility to << antibiotic >> where the pathogen is << pathogen >> + unit: "%" + short_unit: "%" \ No newline at end of file diff --git a/etl/steps/data/garden/antibiotics/2024-10-18/who_glass_by_antibiotic.py b/etl/steps/data/garden/antibiotics/2024-10-18/who_glass_by_antibiotic.py new file mode 100644 index 00000000000..7d270084e69 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-10-18/who_glass_by_antibiotic.py @@ -0,0 +1,52 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("who_glass_by_antibiotic") + + # Read table from meadow dataset. + tb = ds_meadow["who_glass_by_antibiotic"].reset_index() + + # + # Process data. + # + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + # Split the table into two, one where antibiotic is disregarded (it creates unnecessary duplicates for bcis_per_million and total_bcis) + tb_bci = ( + tb[["country", "year", "syndrome", "pathogen", "bcis_per_million", "total_bcis"]] + .drop_duplicates() + .format(["country", "year", "syndrome", "pathogen"], short_name="bci_table") + ) + tb_anti = tb[ + [ + "country", + "year", + "syndrome", + "pathogen", + "antibiotic", + "bcis_with_ast_per_million", + "total_bcis_with_ast", + "share_bcis_with_ast", + ] + ].format(["country", "year", "syndrome", "pathogen", "antibiotic"], short_name="antibiotic_table") + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb_bci, tb_anti], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/garden/antibiotics/2024-11-12/antimicrobial_usage.countries.json b/etl/steps/data/garden/antibiotics/2024-11-12/antimicrobial_usage.countries.json new file mode 100644 index 00000000000..e7da9cd7bfc --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-11-12/antimicrobial_usage.countries.json @@ -0,0 +1,69 @@ +{ + "Armenia": "Armenia", + "Austria": "Austria", + "Bangladesh": "Bangladesh", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Burkina Faso": "Burkina Faso", + "Canada": "Canada", + "China, Hong Kong SAR": "Hong Kong", + "Colombia": "Colombia", + "Croatia": "Croatia", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "Denmark": "Denmark", + "Egypt": "Egypt", + "Estonia": "Estonia", + "Ethiopia": "Ethiopia", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Georgia": "Georgia", + "Germany": "Germany", + "Hungary": "Hungary", + "Iceland": "Iceland", + "Iran (Islamic Republic of)": "Iran", + "Ireland": "Ireland", + "Italy": "Italy", + "Jordan": "Jordan", + "Kenya": "Kenya", + "Kuwait": "Kuwait", + "Lao People's Democratic Republic": "Laos", + "Latvia": "Latvia", + "Lithuania": "Lithuania", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Montenegro": "Montenegro", + "Nepal": "Nepal", + "Norway": "Norway", + "Oman": "Oman", + "Papua New Guinea": "Papua New Guinea", + "Peru": "Peru", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Saudi Arabia": "Saudi Arabia", + "Slovenia": "Slovenia", + "South Africa": "South Africa", + "Spain": "Spain", + "Sudan": "Sudan", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Tajikistan": "Tajikistan", + "Timor-Leste": "East Timor", + "Tunisia": "Tunisia", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "C\ufffdte d'Ivoire": "Cote d'Ivoire", + "occupied Palestinian territory, including east Jerusalem": "Palestine" +} diff --git 
a/etl/steps/data/garden/antibiotics/2024-11-12/antimicrobial_usage.meta.yml b/etl/steps/data/garden/antibiotics/2024-11-12/antimicrobial_usage.meta.yml new file mode 100644 index 00000000000..0dae03f5732 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-11-12/antimicrobial_usage.meta.yml @@ -0,0 +1,133 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Global Health + aware_description: <% if awarelabel == "Access" %> + Access antibiotics have activity against a wide range of common pathogens and show lower resistance potential than antibiotics in the other groups. + <% elif awarelabel == "Watch" %> + Watch antibiotics have higher resistance potential and include most of the highest priority agents among the Critically Important Antimicrobials for Human Medicine and/or antibiotics that are at relatively high risk of bacterial resistance. + <% elif awarelabel == "Reserve" %> + Reserve antibiotics should be reserved for treatment of confirmed or suspected infections due to multi-drug-resistant organisms. Reserve group antibiotics should be treated as “last resort” options. + <% elif awarelabel == "Not classified/recommended" %> + The use of the Not classified/Not recommended antibiotics is not evidence-based, nor recommended in high-quality international guidelines. WHO does not recommend the use of these antibiotics in clinical practice. + <% endif %> + routeofadministration: <% if routeofadministration == "O" %> + orally administered + <% elif routeofadministration == "P" %> + parenterally administered + <% elif routeofadministration == "R" %> + rectally administered + <% elif routeofadministration == "I" %> + inhaled + <% endif %> + antimicrobialclass: + <% if antimicrobialclass == "Antibacterials (ATC J01, A07AA, P01AB, ATC J04A)" %> + antibiotics including antituberculosis drugs + <% elif antimicrobialclass == "Antimalarials (ATC P01B)" %> + antimalarials + <% elif antimicrobialclass == "Antimycotics and antifungals for systemic use (J02, D01B)" %> + antifungals + <% elif antimicrobialclass == "Antivirals for systemic use (ATC J05)" %> + antivirals + <% elif antimicrobialclass == "Drugs for the treatment of tuberculosis (ATC J04A)" %> + antituberculosis drugs + <% endif %> + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + class: + variables: + ddd: + title: Defined daily doses of {definitions.routeofadministration} << antimicrobialclass.lower() >> - << atc4name.lower() >> used + description_short: Total [defined daily doses](#dod:defined-daily-doses) of antimicrobials used in a given year. + unit: defined daily doses + display: + numDecimalPlaces: 0 + did: + title: Defined daily doses per 1000 inhabitants per day of {definitions.routeofadministration} << antimicrobialclass >> - << atc4name.lower() >> used + description_short: Total [defined daily doses](#dod:defined-daily-doses) of antimicrobials used per 1000 inhabitants per day. + unit: defined daily doses per 1000 inhabitants per day + display: + numDecimalPlaces: 1 + class_aggregated: + variables: + ddd_anti_malarials: + title: Defined daily doses of antimalarials used + description_short: Total [defined daily doses](#dod:defined-daily-doses) of antimalarials used in a given year.
+ unit: defined daily doses + display: + numDecimalPlaces: 0 + ddd_antibacterials_and_antituberculosis: + title: Defined daily doses of antibiotics and antituberculosis drugs used + description_short: Total [defined daily doses](#dod:defined-daily-doses) of antibiotics and antituberculosis drugs used in a given year. + unit: defined daily doses + display: + numDecimalPlaces: 0 + ddd_antifungals: + title: Defined daily doses of antifungals used + description_short: Total [defined daily doses](#dod:defined-daily-doses) of antifungals used in a given year. + unit: defined daily doses + display: + numDecimalPlaces: 0 + ddd_antituberculosis: + title: Defined daily doses of antituberculosis drugs used + description_short: Total [defined daily doses](#dod:defined-daily-doses) of antituberculosis drugs used in a given year. + unit: defined daily doses + display: + numDecimalPlaces: 0 + ddd_antivirals: + title: Defined daily doses of antivirals used + description_short: Total [defined daily doses](#dod:defined-daily-doses) of antivirals used in a given year. + unit: defined daily doses + display: + numDecimalPlaces: 0 + did_anti_malarials: + title: Defined daily doses of antimalarials used per 1,000 inhabitants per day + description_short: Total [defined daily doses](#dod:defined-daily-doses) of antimalarials used in a given year per 1,000 inhabitants per day. + unit: defined daily doses per 1,000 inhabitants per day + display: + numDecimalPlaces: 1 + did_antibacterials_and_antituberculosis: + title: Defined daily doses of antibiotics and antituberculosis drugs used per 1,000 inhabitants per day + description_short: Total [defined daily doses](#dod:defined-daily-doses) of antibiotics and antituberculosis drugs used in a given year per 1,000 inhabitants per day. + unit: defined daily doses per 1,000 inhabitants per day + display: + numDecimalPlaces: 1 + did_antifungals: + title: Defined daily doses of antifungals used per 1,000 inhabitants per day + description_short: Total [defined daily doses](#dod:defined-daily-doses) of antifungals used in a given year per 1,000 inhabitants per day. + unit: defined daily doses per 1,000 inhabitants per day + display: + numDecimalPlaces: 1 + did_antituberculosis: + title: Defined daily doses of antituberculosis drugs used per 1,000 inhabitants per day + description_short: Total [defined daily doses](#dod:defined-daily-doses) of antituberculosis drugs used in a given year per 1,000 inhabitants per day. + unit: defined daily doses per 1,000 inhabitants per day + display: + numDecimalPlaces: 1 + did_antivirals: + title: Defined daily doses of antivirals used per 1,000 inhabitants per day + description_short: Total [defined daily doses](#dod:defined-daily-doses) of antivirals used in a given year per 1,000 inhabitants per day. + unit: defined daily doses per 1,000 inhabitants per day + display: + numDecimalPlaces: 1 + aware: + variables: + ddd: + title: Defined daily doses of << awarelabel >> antibiotics used + description_short: "Total [defined daily doses](#dod:defined-daily-doses) of AWaRe category: << awarelabel >> antibiotics used in a given year. {definitions.aware_description}" + unit: defined daily doses + display: + numDecimalPlaces: 0 + did: + title: Defined daily doses per 1000 inhabitants per day of << awarelabel >> antibiotics used + description_short: "Total [defined daily doses](#dod:defined-daily-doses) of AWaRe category: << awarelabel >> antibiotics used per 1000 inhabitants per day.
{definitions.aware_description}" + unit: defined daily doses per 1000 inhabitants per day + display: + numDecimalPlaces: 1 diff --git a/etl/steps/data/garden/antibiotics/2024-11-12/antimicrobial_usage.py b/etl/steps/data/garden/antibiotics/2024-11-12/antimicrobial_usage.py new file mode 100644 index 00000000000..2191393054b --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-11-12/antimicrobial_usage.py @@ -0,0 +1,189 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from owid.catalog import Table +from owid.catalog import processing as pr + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("antimicrobial_usage") + + # Read table from meadow dataset. + tb_class = ds_meadow["class"].reset_index() + tb_aware = ds_meadow["aware"].reset_index() + # + # Process data. + # + tb_class = geo.harmonize_countries(df=tb_class, countries_file=paths.country_mapping_path) + tb_aware = geo.harmonize_countries(df=tb_aware, countries_file=paths.country_mapping_path) + + # Tidy notes column + tb_class = tidy_notes(tb_class) + # Aggregate by antimicrobial class + tb_class_agg, tb_notes = aggregate_antimicrobial_classes(tb_class) + # Save the origins of the aggregated table to insert back in later + # Drop columns that are not needed in the garden dataset. + tb_class = tb_class.drop( + columns=["whoregioncode", "whoregionname", "countryiso3", "incomeworldbankjune", "atc4", "notes"] + ) + tb_aware = tb_aware.drop(columns=["whoregioncode", "whoregionname", "incomeworldbankjune", "aware", "notes"]) + + tb_class = tb_class.format(["country", "year", "antimicrobialclass", "atc4name", "routeofadministration"]) + tb_aware = tb_aware.format(["country", "year", "awarelabel"]) + tb_class_agg = pivot_aggregated_table(tb_class_agg, tb_notes) + tb_class_agg = tb_class_agg.format(["country", "year"], short_name="class_aggregated") + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=[tb_class, tb_aware, tb_class_agg], + check_variables_metadata=True, + default_metadata=ds_meadow.metadata, + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() + + +def pivot_aggregated_table(tb_class_agg: Table, tb_notes: Table) -> Table: + """ + Pivot the aggregated table to have a column for each antimicrobial class, then add the description_processing metadata + """ + + tb_notes_dict = { + "Antibacterials (ATC J01, A07AA, P01AB)": "antibacterials", + "Antimalarials (ATC P01B)": "anti_malarials", + "Antimycotics and antifungals for systemic use (J02, D01B)": "antifungals", + "Antivirals for systemic use (ATC J05)": "antivirals", + "Drugs for the treatment of tuberculosis (ATC J04A)": "antituberculosis", + "Antibacterials (ATC J01, A07AA, P01AB, ATC J04A)": "antibacterials_and_antituberculosis", + } + tb_notes["category"] = tb_notes["antimicrobialclass"].map(tb_notes_dict) + tb_class_agg = tb_class_agg.copy(deep=True) + tb_class_agg["antimicrobialclass"] = tb_class_agg["antimicrobialclass"].replace(tb_notes_dict) + tb_class_agg = tb_class_agg.pivot( + index=["country", "year"], columns="antimicrobialclass", values=["ddd", "did"], join_column_levels_with="_" + ) + tb_class_agg = tb_class_agg.reset_index(drop=True) + + for key in tb_notes_dict.values(): + if f"ddd_{key}" in tb_class_agg.columns: + tb_class_agg[f"ddd_{key}"].metadata.description_key = tb_notes["description_processing"][ + tb_notes["category"] == key + ] + if f"did_{key}" in tb_class_agg.columns: + tb_class_agg[f"did_{key}"].metadata.description_key = tb_notes["description_processing"][ + tb_notes["category"] == key + ] + return tb_class_agg + + +def aggregate_antimicrobial_classes(tb: Table) -> tuple[Table, Table]: + """ + Aggregate by antimicrobial class: combine antibacterials and antituberculosis drugs into one class, while also keeping antituberculosis separately. Returns the aggregated table and the formatted notes table. + """ + tb = tb.copy(deep=True) + # Convert the column to strings (if not already done) + tb["antimicrobialclass"] = tb["antimicrobialclass"].astype("string") + + # Create a completely independent copy of antituberculosis rows and reset its index + msk = tb["antimicrobialclass"] == "Drugs for the treatment of tuberculosis (ATC J04A)" + tb_anti_tb = tb[msk].reset_index(drop=True) + assert len(tb_anti_tb["antimicrobialclass"].unique()) == 1 + + # Modify antimicrobialclass in tb + tb["antimicrobialclass"] = tb["antimicrobialclass"].replace( + { + "Drugs for the treatment of tuberculosis (ATC J04A)": "Antibacterials (ATC J01, A07AA, P01AB, ATC J04A)", + "Antibacterials (ATC J01, A07AA, P01AB)": "Antibacterials (ATC J01, A07AA, P01AB, ATC J04A)", + }, + ) + expected_class_values = { + "Antibacterials (ATC J01, A07AA, P01AB, ATC J04A)", + "Antimalarials (ATC P01B)", + "Antimycotics and antifungals for systemic use (J02, D01B)", + "Antivirals for systemic use (ATC J05)", + } + actual_values = set(tb["antimicrobialclass"].unique()) + assert actual_values == expected_class_values + # Format the notes tables before it's removed + tb_notes = tb[["country", "year", "antimicrobialclass", "notes"]].dropna(subset=["notes"]) + tb_notes = format_notes(tb_notes) + + # Aggregate the data + tb = tb.groupby(["country", "year", "antimicrobialclass"], dropna=False)[["ddd", "did"]].sum().reset_index() + assert len(tb["antimicrobialclass"].unique()) == 4 + # Add the antituberculosis data back to tb + tb_anti_tb = ( + tb_anti_tb.groupby(["country", "year", "antimicrobialclass"], dropna=False)[["ddd", "did"]].sum().reset_index() + ) + tb_combined = pr.concat([tb, tb_anti_tb]) + + # Integrity check only: raises if country-year-antimicrobialclass has duplicates (result intentionally unused). + tb_combined.set_index(["country", "year", "antimicrobialclass"], verify_integrity=True) + + return tb_combined, tb_notes + + +def format_notes(tb_notes: Table) -> Table: + """
+ Format notes column + """ + for note in tb_notes["notes"].unique(): + msk = tb_notes["notes"] == note + tb_note = tb_notes[msk] + countries = tb_note["country"].unique() + countries_formatted = combine_countries(countries) + description_processing_string = f"For {countries_formatted}: {note}" + tb_notes.loc[msk, "description_processing"] = description_processing_string + # Create one description_processing entry for each antimicrobial class. + tb_desc = ( + tb_notes.dropna(subset=["description_processing"]) # Remove NaNs + .groupby(["antimicrobialclass"])["description_processing"] + .apply(lambda x: list(set(x))) # Combine unique values into a list + .reset_index() + ) + + return tb_desc + + +def tidy_notes(tb_class: Table) -> Table: + """ + Tidy notes column - improve the syntax and fix spelling errors + """ + notes_dict = { + "Only consumption in the community reported": "only antimicrobial consumption in the community is reported.", + "For antimycotics and antifungals: only J02 reported": "for antimycotics and antifungals, only antimycotics for systemic use (ATC code J02) are reported.", + "For antibiotics: only J01 and P01AB reported": "for antibiotics, only antibiotics for systemic use (ATC code J01) and nitroimidazole derivatives (ATC code P01AB) are reported.", + "For antibiotics: only J01 reported": "for antibiotics, only antibiotics for systemic use (ATC code J01) are reported.", + "For antifungals: only use in the hospital reported": "for antifungals, only those used in hospitals are reported.", + "Data incomplete since not collected from all sources of data": "data is incomplete since it's not collected from all sources.", + "Only consumption in the public sector reported and this is estimated to reppresent less than 90% of the antimicrobial used in the country ": "only consumption in the public sector reported and this is estimated to represent less than 90% of total antimicrobial usage.", + "Data incomplete: not all antibiotics reported systematically": "data is incomplete, not all antibiotics reported systematically.", + "For antituberculosis medicines: data are incomplete": "data are incomplete for antituberculosis medicines.", + } + tb_class["notes"] = tb_class["notes"].replace(notes_dict) + return tb_class + + +def combine_countries(countries): + # Combine country names into a single human-readable string. + # NOTE: `countries` may be a numpy array, whose truth value is ambiguous, so check its length instead. + countries = list(countries) + if len(countries) == 0: + return "" + elif len(countries) == 1: + return countries[0] + elif len(countries) == 2: + return " and ".join(countries) + else: + return ", ".join(countries[:-1]) + " and " + countries[-1] diff --git a/etl/steps/data/garden/antibiotics/2024-11-15/testing_coverage.countries.json b/etl/steps/data/garden/antibiotics/2024-11-15/testing_coverage.countries.json new file mode 100644 index 00000000000..c42f33d5cd2 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-11-15/testing_coverage.countries.json @@ -0,0 +1,9 @@ +{ + "African Region": "African Region (WHO)", + "All": "World", + "Eastern Mediterranean Region": "Eastern Mediterranean (WHO)", + "European Region": "European Region (WHO)", + "Region of the Americas": "Region of the Americas (WHO)", + "South-East Asia Region": "South-East Asia Region (WHO)", + "Western Pacific Region": "Western Pacific Region (WHO)" +} \ No newline at end of file diff --git a/etl/steps/data/garden/antibiotics/2024-11-15/testing_coverage.meta.yml b/etl/steps/data/garden/antibiotics/2024-11-15/testing_coverage.meta.yml new file mode 100644 index 00000000000..da24af0d82f --- /dev/null +++
b/etl/steps/data/garden/antibiotics/2024-11-15/testing_coverage.meta.yml @@ -0,0 +1,207 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Antibiotics + description_key_ast_bloodstream: &description_key_ast_bloodstream + - Acinetobacter spp. infections are tested for their susceptibility to carbapenem antibiotics. + - Salmonella spp. infections are tested for their susceptibility to fluoroquinolone antibiotics. + - Staphylococcus aureus infections are tested for their susceptibility to methicillin antibiotics. + - Escherichia coli and Klebsiella pneumoniae infections are tested for their susceptibility to third-generation cephalosporin antibiotics and carbapenem antibiotics. + - Streptococcus pneumoniae infections are tested for their susceptibility to penicillin antibiotics. + description_key_ast_stool: &description_key_ast_stool + - Salmonella spp. infections are tested for their susceptibility to fluoroquinolone antibiotics. + - Shigella spp. infections are tested for their susceptibility to third-generation cephalosporin antibiotics. + description_key_ast_urinary_tract: &description_key_ast_urinary_tract + - Escherichia coli infections are tested for their susceptibility to fluoroquinolone antibiotics, sulfonamides and trimethoprim antibiotics and third-generation cephalosporin antibiotics. + - Klebsiella pneumoniae infections are tested for their susceptibility to fluoroquinolone antibiotics, sulfonamides and trimethoprim antibiotics and third-generation cephalosporin antibiotics. + description_key_ast_gonorrhea: &description_key_ast_gonorrhea + - Neisseria gonorrhoeae infections are tested for their susceptibility to macrolide antibiotics and third-generation cephalosporin antibiotics. + + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + testing_coverage: + variables: + ctas_with_reported_bcis_bloodstream: + title: Number of countries reporting confirmed bloodstream infections + description_short: Number of countries reporting bacteriologically confirmed bloodstream infections to WHO. + unit: countries + display: + name: Bloodstream infections + ctas_with_reported_bcis_gonorrhea: + title: Number of countries reporting confirmed gonorrhea infections + description_short: Number of countries reporting bacteriologically confirmed gonorrhea infections to WHO. + unit: countries + display: + name: Gonorrhea infections + ctas_with_reported_bcis_stool: + title: Number of countries reporting confirmed gastrointestinal infections + description_short: Number of countries reporting bacteriologically confirmed gastrointestinal infections to WHO. + unit: countries + display: + name: Gastrointestinal infections + ctas_with_reported_bcis_urinary_tract: + title: Number of countries reporting confirmed urinary tract infections + description_short: Number of countries reporting bacteriologically confirmed urinary tract infections to WHO. + unit: countries + display: + name: Urinary tract infections + ctas_with_reported_bcis_with_ast__gt__80_bcis_bloodstream: + title: Number of countries testing >80% of confirmed bloodstream infections for antibiotic susceptibility + description_short: Number of countries testing >80% of bacteriologically confirmed bloodstream infections for antibiotic susceptibility.
+ description_key: *description_key_ast_bloodstream + unit: countries + display: + name: Bloodstream infections + ctas_with_reported_bcis_with_ast__gt__80_bcis_gonorrhea: + title: Number of countries testing >80% of confirmed gonorrhea infections for antibiotic susceptibility + description_short: Number of countries testing >80% of bacteriologically confirmed gonorrhea infections for antibiotic susceptibility. + description_key: *description_key_ast_gonorrhea + unit: countries + display: + name: Gonorrhea infections + ctas_with_reported_bcis_with_ast__gt__80_bcis_stool: + title: Number of countries testing >80% of confirmed gastrointestinal infections for antibiotic susceptibility + description_short: Number of countries testing >80% of bacteriologically confirmed gastrointestinal infections for antibiotic susceptibility. + description_key: *description_key_ast_stool + unit: countries + display: + name: Gastrointestinal infections + ctas_with_reported_bcis_with_ast__gt__80_bcis_urinary_tract: + title: Number of countries testing >80% of confirmed urinary tract infections for antibiotic susceptibility + description_short: Number of countries testing >80% of bacteriologically confirmed urinary tract infections for antibiotic susceptibility. + description_key: *description_key_ast_urinary_tract + unit: countries + display: + name: Urinary tract infections + total_bcis_bloodstream: + title: Total confirmed bloodstream infections + description_short: Total bacteriologically confirmed bloodstream infections. + unit: infections + display: + name: Bloodstream infections + total_bcis_gonorrhea: + title: Total confirmed gonorrhea infections + description_short: Total bacteriologically confirmed gonorrhea infections. + unit: infections + display: + name: Gonorrhea infections + total_bcis_stool: + title: Total confirmed gastrointestinal infections + description_short: Total bacteriologically confirmed gastrointestinal infections. + unit: infections + display: + name: Gastrointestinal infections + total_bcis_urinary_tract: + title: Total confirmed urinary tract infections + description_short: Total bacteriologically confirmed urinary tract infections. + unit: infections + display: + name: Urinary tract infections + total_bcis_with_ast_bloodstream: + title: Total confirmed bloodstream infections tested for antibiotic susceptibility + description_short: Total bacteriologically confirmed bloodstream infections tested for antibiotic susceptibility. + description_key: *description_key_ast_bloodstream + unit: infections + display: + name: Bloodstream infections + total_bcis_with_ast_gonorrhea: + title: Total confirmed gonorrhea infections tested for antibiotic susceptibility + description_short: Total bacteriologically confirmed gonorrhea infections tested for antibiotic susceptibility. + description_key: *description_key_ast_gonorrhea + unit: infections + display: + name: Gonorrhea infections + total_bcis_with_ast_stool: + title: Total confirmed gastrointestinal infections tested for antibiotic susceptibility + description_short: Total bacteriologically confirmed gastrointestinal infections tested for antibiotic susceptibility.
+ description_key: *description_key_ast_stool + unit: infections + display: + name: Gastrointestinal infections + total_bcis_with_ast_urinary_tract: + title: Total confirmed urinary tract infections tested for antibiotic susceptibility + description_short: Total bacteriologically confirmed urinary tract infections tested for antibiotic susceptibility. + description_key: *description_key_ast_urinary_tract + unit: infections + display: + name: Urinary tract infections + share_ctas_with_reported_bcis_bloodstream: + title: Share of countries reporting confirmed bloodstream infections + description_short: Share of countries reporting bacteriologically confirmed bloodstream infections to WHO. + unit: "%" + short_unit: "%" + display: + name: Bloodstream infections + share_ctas_with_reported_bcis_gonorrhea: + title: Share of countries reporting confirmed gonorrhea infections + description_short: Share of countries reporting bacteriologically confirmed gonorrhea infections to WHO. + unit: "%" + short_unit: "%" + display: + name: Gonorrhea infections + share_ctas_with_reported_bcis_stool: + title: Share of countries reporting confirmed gastrointestinal infections + description_short: Share of countries reporting bacteriologically confirmed gastrointestinal infections to WHO. + unit: "%" + short_unit: "%" + display: + name: Gastrointestinal infections + share_ctas_with_reported_bcis_urinary_tract: + title: Share of countries reporting confirmed urinary tract infections + description_short: Share of countries reporting bacteriologically confirmed urinary tract infections to WHO. + unit: "%" + short_unit: "%" + display: + name: Urinary tract infections + share_ctas_with_reported_bcis_with_ast__gt__80_bcis_bloodstream: + title: Share of countries testing >80% of confirmed bloodstream infections for antibiotic susceptibility + description_short: Share of countries testing >80% of bacteriologically confirmed bloodstream infections for antibiotic susceptibility. + description_key: *description_key_ast_bloodstream + unit: "%" + short_unit: "%" + display: + name: Bloodstream infections + share_ctas_with_reported_bcis_with_ast__gt__80_bcis_gonorrhea: + title: Share of countries testing >80% of confirmed gonorrhea infections for antibiotic susceptibility + description_short: Share of countries testing >80% of bacteriologically confirmed gonorrhea infections for antibiotic susceptibility. + description_key: *description_key_ast_gonorrhea + unit: "%" + short_unit: "%" + display: + name: Gonorrhea infections + share_ctas_with_reported_bcis_with_ast__gt__80_bcis_stool: + title: Share of countries testing >80% of confirmed gastrointestinal infections for antibiotic susceptibility + description_short: Share of countries testing >80% of bacteriologically confirmed gastrointestinal infections for antibiotic susceptibility. + description_key: *description_key_ast_stool + unit: "%" + short_unit: "%" + display: + name: Gastrointestinal infections + share_ctas_with_reported_bcis_with_ast__gt__80_bcis_urinary_tract: + title: Share of countries testing >80% of confirmed urinary tract infections for antibiotic susceptibility + description_short: Share of countries testing >80% of bacteriologically confirmed urinary tract infections for antibiotic susceptibility.
+ description_key: *description_key_ast_urinary_tract + unit: "%" + short_unit: "%" + display: + name: Urinary tract infections + specimen: + variables: + total_bcis_with_ast: + title: Confirmed infections tested for antibiotic susceptibility + description_short: Total bacteriologically confirmed infections tested for antibiotic susceptibility. + unit: infections + infections_not_tested_for_susceptibility: + title: Confirmed infections not tested for antibiotic susceptibility + description_short: Total bacteriologically confirmed infections that have not been tested for antibiotic susceptibility. + unit: infections + diff --git a/etl/steps/data/garden/antibiotics/2024-11-15/testing_coverage.py b/etl/steps/data/garden/antibiotics/2024-11-15/testing_coverage.py new file mode 100644 index 00000000000..591d82b7d91 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-11-15/testing_coverage.py @@ -0,0 +1,132 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from owid.catalog import Table + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) +# Number of countries in each WHO region. +WHO_REGION_MEMBERS = { + "African Region (WHO)": 47, + "World": 194, + "Eastern Mediterranean (WHO)": 22, + "European Region (WHO)": 53, + "Region of the Americas (WHO)": 35, + "South-East Asia Region (WHO)": 11, + "Western Pacific Region (WHO)": 27, +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("testing_coverage") + + # Read table from meadow dataset. + tb = ds_meadow["testing_coverage"].reset_index() + + # + # Process data. + # + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + tb = format_specimen(tb) + tb = tb.drop(columns=["min", "q1", "median", "q3", "max"]) + # A table where the specimen column is the country, to make stacked bar chart. + tb_specimen = calculate_number_infections_not_tested_for_susceptibility(tb) + + # Pivot the table to have one row per country and year. + tb = tb.pivot( + index=["country", "year"], + columns="specimen", + values=[ + "ctas_with_reported_bcis", + "ctas_with_reported_bcis_with_ast__gt__80_bcis", + "total_bcis", + "total_bcis_with_ast", + ], + join_column_levels_with="_", + ) + # Add the number of countries in each WHO region to calculate the share of countries that are reporting data. + tb = add_number_of_countries_in_each_region(tb) + # Calculate the share of countries in each WHO region that are reporting data. + tb = calculate_share_of_countries(tb) + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb, tb_specimen], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def format_specimen(tb: Table) -> Table: + """ + Format the specimen column.
+ """ + specimen_dict = {"BLOOD": "bloodstream", "STOOL": "stool", "URINE": "urinary_tract", "UROGENITAL": "gonorrhea"} + tb["specimen"] = tb["specimen"].astype(str) + tb["specimen"] = tb["specimen"].replace(specimen_dict) + assert tb["specimen"].isin(specimen_dict.values()).all() + + return tb + + +def add_number_of_countries_in_each_region(tb: Table) -> Table: + """ + Adding number of countries in each WHO region in order to calculate the share that are reporting data. + """ + tb["number_of_countries_in_region"] = tb["country"].map(WHO_REGION_MEMBERS) + tb["number_of_countries_in_region"] = tb["number_of_countries_in_region"].astype("Int64") + assert tb["number_of_countries_in_region"].notnull().all(), "Missing WHO region! Check spelling." + + return tb + + +def calculate_share_of_countries(tb: Table) -> Table: + """ + Calculate the share of countries in each WHO region that are reporting data. + """ + columns_with_number_of_countries = tb.columns[tb.columns.str.startswith("ctas")] + for column in columns_with_number_of_countries: + new_column = "share_" + column + tb[new_column] = (tb[column] / tb["number_of_countries_in_region"]) * 100 + + tb = tb.drop(columns="number_of_countries_in_region") + return tb + + +def calculate_number_infections_not_tested_for_susceptibility(tb: Table) -> Table: + """ + Calculate the number of infections not tested for susceptibility to make stacked bar chart. + """ + tb = tb[tb["country"] == "World"] + tb["infections_not_tested_for_susceptibility"] = tb["total_bcis"] - tb["total_bcis_with_ast"] + tb = tb.drop( + columns=[ + "country", + "ctas_with_reported_bcis", + "ctas_with_reported_bcis_with_ast__gt__80_bcis", + "total_bcis", + ] + ) + tb = tb.rename(columns={"specimen": "country"}) + tb["country"] = tb["country"].replace( + { + "bloodstream": "Bloodstream", + "stool": "Stool", + "urinary_tract": "Urinary tract", + "gonorrhea": "Gonorrhea", + } + ) + tb = tb.format(["country", "year"], short_name="specimen") + + return tb diff --git a/etl/steps/data/garden/antibiotics/2024-11-20/microbe.countries.json b/etl/steps/data/garden/antibiotics/2024-11-20/microbe.countries.json new file mode 100644 index 00000000000..a7f8eced782 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-11-20/microbe.countries.json @@ -0,0 +1,3 @@ +{ + "Global": "World" +} \ No newline at end of file diff --git a/etl/steps/data/garden/antibiotics/2024-11-20/microbe.meta.yml b/etl/steps/data/garden/antibiotics/2024-11-20/microbe.meta.yml new file mode 100644 index 00000000000..7a32139565a --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-11-20/microbe.meta.yml @@ -0,0 +1,62 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Global Health + display: + numSignificantFigures: 3 + pathogen_type: <%- if pathogen_type == "Fungi" -%> + fungus + <%- elif pathogen_type == "Viruses" -%> + virus + <%- else -%> + << pathogen_type.lower() >> + <%- endif -%> + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + microbe: + variables: + value: + title: Neonatal deaths from << pathogen >> infections + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from << pathogen >> infections. << pathogen >> is a {definitions.pathogen_type}. 
+ display: + roundingMode: significantFigures + numSignificantFigures: 3 + name: << pathogen >> + upper: + title: Upper bound of neonatal deaths from << pathogen >> infections + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from << pathogen >> infections. << pathogen >> is a {definitions.pathogen_type}. + display: + roundingMode: significantFigures + numSignificantFigures: 3 + name: << pathogen >> + lower: + title: Lower bound of neonatal deaths from << pathogen >> infections + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from << pathogen >> infections. << pathogen >> is a {definitions.pathogen_type}. + display: + roundingMode: significantFigures + numSignificantFigures: 3 + name: << pathogen >> + pathogen_entity: + variables: + value: + title: Global neonatal deaths from bloodstream infections, by pathogen + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from bloodstream infections. + upper: + title: Upper bound of global neonatal deaths from bloodstream infections, by pathogen + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from bloodstream infections. + lower: + title: Lower bound of global neonatal deaths from bloodstream infections, by pathogen + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from bloodstream infections. diff --git a/etl/steps/data/garden/antibiotics/2024-11-20/microbe.py b/etl/steps/data/garden/antibiotics/2024-11-20/microbe.py new file mode 100644 index 00000000000..fd25d05a3ff --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-11-20/microbe.py @@ -0,0 +1,39 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("microbe") + + # Read table from meadow dataset. + tb = ds_meadow.read("microbe") + + # + # Process data. + # + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + tb = tb.drop(columns=["age", "sex", "measure", "metric", "counterfactual", "infectious_syndrome"]) + # Create a table where the pathogen is the entity + tb_pathogen = tb.drop(columns=["country", "pathogen_type"]).rename(columns={"pathogen": "country"}) + tb = tb.format(["country", "year", "pathogen_type", "pathogen"]) + tb_pathogen = tb_pathogen.format(["country", "year"], short_name="pathogen_entity") + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb, tb_pathogen], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() diff --git a/etl/steps/data/garden/antibiotics/2024-11-20/pathogen_bloodstream.countries.json b/etl/steps/data/garden/antibiotics/2024-11-20/pathogen_bloodstream.countries.json new file mode 100644 index 00000000000..a7f8eced782 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-11-20/pathogen_bloodstream.countries.json @@ -0,0 +1,3 @@ +{ + "Global": "World" +} \ No newline at end of file diff --git a/etl/steps/data/garden/antibiotics/2024-11-20/pathogen_bloodstream.meta.yml b/etl/steps/data/garden/antibiotics/2024-11-20/pathogen_bloodstream.meta.yml new file mode 100644 index 00000000000..fd08ad170a9 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-11-20/pathogen_bloodstream.meta.yml @@ -0,0 +1,40 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Antibiotics + display: + numSignificantFigures: 3 + name: << pathogen >> + + pathogen_type: + <% if pathogen_type == "Fungi" %> + fungus + <% elif pathogen_type == "Viruses" %> + virus + <% else %> + << pathogen_type.lower() >> + <% endif %> + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + pathogen_bloodstream: + variables: + value: + title: Neonatal deaths from << pathogen >> infections + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from << pathogen >> infections. << pathogen >> is a {definitions.pathogen_type}. + upper: + title: Upper bound of neonatal deaths from << pathogen >> infections + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from << pathogen >> infections. << pathogen >> is a {definitions.pathogen_type}. + lower: + title: Lower bound of neonatal deaths from << pathogen >> infections + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from << pathogen >> infections. << pathogen >> is a {definitions.pathogen_type}. \ No newline at end of file diff --git a/etl/steps/data/garden/antibiotics/2024-11-20/pathogen_bloodstream.py b/etl/steps/data/garden/antibiotics/2024-11-20/pathogen_bloodstream.py new file mode 100644 index 00000000000..3ccb1ae40bf --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-11-20/pathogen_bloodstream.py @@ -0,0 +1,37 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("pathogen_bloodstream") + + # Read table from meadow dataset. + tb = ds_meadow.read("pathogen_bloodstream") + + # + # Process data. + # + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + tb = tb.drop(columns=["age", "sex", "measure", "metric", "counterfactual", "infectious_syndrome"]) + tb = tb.format(["country", "year", "pathogen_type", "pathogen"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() diff --git a/etl/steps/data/garden/antibiotics/2024-12-02/microbe_amr.countries.json b/etl/steps/data/garden/antibiotics/2024-12-02/microbe_amr.countries.json new file mode 100644 index 00000000000..a7f8eced782 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-02/microbe_amr.countries.json @@ -0,0 +1,3 @@ +{ + "Global": "World" +} \ No newline at end of file diff --git a/etl/steps/data/garden/antibiotics/2024-12-02/microbe_amr.meta.yml b/etl/steps/data/garden/antibiotics/2024-12-02/microbe_amr.meta.yml new file mode 100644 index 00000000000..0d28fc93f14 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-02/microbe_amr.meta.yml @@ -0,0 +1,44 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Antibiotics + description_key: + - The pathogen and antimicrobial pairs that were tested for resistance can be found on pages 51-52 in the [Supplementary Materials (part 1)](https://www.thelancet.com/cms/10.1016/S0140-6736(24)01867-1/attachment/005e2559-204f-46d7-b2e3-27b2091209aa/mmc1.pdf) of the original publication. + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + microbe_amr: + variables: + amr_attributable_deaths: + title: Global neonatal deaths from infections attributed to AMR, by syndrome + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from infections that are attributed to antimicrobial resistance. + presentation: + title_public: Global neonatal deaths from infections attributed to AMR + display: + roundingMode: significantFigures + numSignificantFigures: 3 + non_amr_attributable_deaths: + title: Global neonatal deaths from infections not attributed to AMR, by syndrome + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from infections not attributed to antimicrobial resistance. + presentation: + title_public: Global neonatal deaths from infections not attributed to AMR + display: + roundingMode: significantFigures + numSignificantFigures: 3 + total_deaths: + title: Total global neonatal deaths from infections + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from infections. + presentation: + title_public: Total global neonatal deaths from infections + display: + roundingMode: significantFigures + numSignificantFigures: 3 diff --git a/etl/steps/data/garden/antibiotics/2024-12-02/microbe_amr.py b/etl/steps/data/garden/antibiotics/2024-12-02/microbe_amr.py new file mode 100644 index 00000000000..62db655bce1 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-02/microbe_amr.py @@ -0,0 +1,98 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from owid.catalog import Table + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("microbe_amr") + ds_total = paths.load_dataset("total_syndrome") + # Read table from meadow dataset. + tb = ds_meadow.read("microbe_amr") + tb_total = ds_total.read("total_syndrome") + # + # Process data.
+ # + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + tb_total = geo.harmonize_countries(df=tb_total, countries_file=paths.country_mapping_path) + + # We want three variables, total, amr attributable and amr non-attributable + + tb_amr = ( + tb[tb["counterfactual"] == "Attributable"] + .rename(columns={"value": "amr_attributable_deaths"}, errors="raise") + .drop(columns=["lower", "upper"]) + ) + tb_total = tb_total.rename(columns={"value": "total_deaths"}, errors="raise").drop(columns=["lower", "upper"]) + + tb = tb_amr.merge(tb_total, on=["country", "year", "infectious_syndrome"], how="inner") + tb["non_amr_attributable_deaths"] = tb["total_deaths"] - tb["amr_attributable_deaths"] + # Rename syndromes to be shorter for use in stacked bar charts + tb = rename_syndromes(tb) + + # Reformatting the data so it can be used in stacked bar charts + tb = ( + tb.drop(columns=["country"]).rename(columns={"infectious_syndrome": "country"}).drop(columns=["counterfactual"]) + ) + + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def rename_syndromes(tb: Table) -> Table: + """ + Rename syndromes to be shorter for use in stacked bar charts. + Ensure all infectious syndromes are replaced. + """ + name_dict = { + "Bloodstream infections": "Bloodstream infections", + "Lower respiratory infections": "Lower respiratory infections", + "Diarrhea": "Diarrhea", + "Meningitis": "Meningitis", + "Infections of the skin and subcutaneous systems": "Skin infections", + "Urinary tract infections and pyelonephritis": "Kidney and urinary tract infections", + "Peritoneal and intra-abdominal infections": "Abdominal infections", + "Tuberculosis": "Tuberculosis", + "Endocarditis": "Endocarditis", + "Typhoid fever, paratyphoid fever, and invasive non-typhoidal Salmonella": "Typhoid, paratyphoid, and iNTS", + "Infections of bones, joints, and related organs": "Bone and joint infections", + "Other unspecified site infections": "Other infections", + "Other parasitic infections": "Other parasitic infections", + "Oral infections": "Oral infections", + "Myelitis, meningoencephalitis, and other central nervous system infections": "Central nervous system infections", + "Upper respiratory infections": "Upper respiratory infections", + "Hepatitis": "Hepatitis", + "Eye infections": "Eye infections", + "Encephalitis": "Encephalitis", + "Carditis, myocarditis, and pericarditis": "Heart inflammation", + "Sexually transmitted infections": "Sexually transmitted infections", + } + + # Find unmatched syndromes + unmatched_syndromes = set(tb["infectious_syndrome"].unique()) - set(name_dict.keys()) + if unmatched_syndromes: + raise ValueError(f"The following syndromes were not found in the name dictionary: {unmatched_syndromes}") + + # Replace syndromes + tb["infectious_syndrome"] = tb["infectious_syndrome"].replace(name_dict) + + return tb diff --git a/etl/steps/data/garden/antibiotics/2024-12-02/microbe_neonatal_amr.countries.json b/etl/steps/data/garden/antibiotics/2024-12-02/microbe_neonatal_amr.countries.json new file mode 100644 index 00000000000..a7f8eced782 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-02/microbe_neonatal_amr.countries.json @@ -0,0 +1,3 @@ +{ + "Global": "World" +} \ No newline at end of 
file diff --git a/etl/steps/data/garden/antibiotics/2024-12-02/microbe_neonatal_amr.meta.yml b/etl/steps/data/garden/antibiotics/2024-12-02/microbe_neonatal_amr.meta.yml new file mode 100644 index 00000000000..20fca81f6b3 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-02/microbe_neonatal_amr.meta.yml @@ -0,0 +1,42 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Antibiotics + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + microbe_neonatal_amr: + variables: + amr_attributable_deaths: + title: Global neonatal deaths from bloodstream infections attributed to AMR, by pathogen + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from bloodstream infections that are attributed to antimicrobial resistance. + presentation: + title_public: Global neonatal deaths from bloodstream infections attributed to AMR + display: + roundingMode: significantFigures + numSignificantFigures: 3 + non_amr_attributable_deaths: + title: Global neonatal deaths from bloodstream infections not attributed to AMR, by pathogen + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from bloodstream infections not attributed to antimicrobial resistance. + presentation: + title_public: Global neonatal deaths from bloodstream infections not attributed to AMR + display: + roundingMode: significantFigures + numSignificantFigures: 3 + total_deaths: + title: Total global neonatal deaths from bloodstream infections + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from bloodstream infections. + presentation: + title_public: Total global neonatal deaths from bloodstream infections + display: + roundingMode: significantFigures + numSignificantFigures: 3 diff --git a/etl/steps/data/garden/antibiotics/2024-12-02/microbe_neonatal_amr.py b/etl/steps/data/garden/antibiotics/2024-12-02/microbe_neonatal_amr.py new file mode 100644 index 00000000000..aab5b70b399 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-02/microbe_neonatal_amr.py @@ -0,0 +1,59 @@ +"""Load a meadow dataset and create a garden dataset.""" +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("microbe_neonatal_amr") + ds_total = paths.load_dataset("microbe") + # Read table from meadow dataset. + tb = ds_meadow.read("microbe_neonatal_amr") + tb_total = ds_total.read("microbe") + # + # Process data. 
+ # + # We want three variables, total, amr attributable and amr non-attributable + + tb_amr = tb.rename(columns={"value": "amr_attributable_deaths"}, errors="raise").drop(columns=["lower", "upper"]) + tb_total = tb_total.rename(columns={"value": "total_deaths"}, errors="raise").drop( + columns=[ + "lower", + "upper", + "age", + "sex", + "measure", + "metric", + "pathogen_type", + "infectious_syndrome", + "counterfactual", + ] + ) + tb_total = tb_total[tb_total["year"] == 2021] + + tb = tb_amr.merge(tb_total, on=["country", "year", "pathogen"], how="inner") + tb["non_amr_attributable_deaths"] = tb["total_deaths"] - tb["amr_attributable_deaths"] + + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + # Reformatting the data so it can be used in stacked bar charts + tb = tb.drop(columns=["country"]).rename(columns={"pathogen": "country"}) + + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream.countries.json b/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream.countries.json new file mode 100644 index 00000000000..a7f8eced782 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream.countries.json @@ -0,0 +1,3 @@ +{ + "Global": "World" +} \ No newline at end of file diff --git a/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream.meta.yml b/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream.meta.yml new file mode 100644 index 00000000000..04044bb693f --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream.meta.yml @@ -0,0 +1,46 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Antibiotics + pathogen_type: <% if pathogen_type == "Fungi" %>fungus<% elif pathogen_type == "Viruses" %>virus<% else %><< pathogen_type.lower() >><% endif %> + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + total_pathogen_bloodstream: + variables: + value: + title: Total deaths from << pathogen >> bloodstream infections + unit: deaths + description_short: Estimated number of deaths from << pathogen >> bloodstream infections. << pathogen >> is a {definitions.pathogen_type}. + presentation: + title_public: Total deaths from << pathogen >> bloodstream infections + display: + roundingMode: significantFigures + numSignificantFigures: 3 + name: << pathogen >> + upper: + title: Upper bound of total deaths from << pathogen >> bloodstream infections + unit: deaths + description_short: Estimated number of deaths from << pathogen >> bloodstream infections. << pathogen >> is a {definitions.pathogen_type}. + presentation: + title_public: Upper bound of total deaths from << pathogen >> bloodstream infections + display: + roundingMode: significantFigures + numSignificantFigures: 3 + name: << pathogen >> + lower: + title: Lower bound of total deaths from << pathogen >> bloodstream infections + unit: deaths + description_short: Estimated number of deaths from << pathogen >> bloodstream infections. << pathogen >> is a {definitions.pathogen_type}.
+ presentation: + title_public: Lower bound of total deaths from << pathogen >> bloodstream infections + display: + roundingMode: significantFigures + numSignificantFigures: 3 + name: << pathogen >> diff --git a/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream.py b/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream.py new file mode 100644 index 00000000000..ec0b1de3f35 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream.py @@ -0,0 +1,35 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("total_pathogen_bloodstream") + + # Read table from meadow dataset. + tb = ds_meadow.read("total_pathogen_bloodstream") + + # + # Process data. + # + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + tb = tb.format(["country", "year", "pathogen", "pathogen_type"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.countries.json b/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.countries.json new file mode 100644 index 00000000000..a7f8eced782 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.countries.json @@ -0,0 +1,3 @@ +{ + "Global": "World" +} \ No newline at end of file diff --git a/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.meta.yml b/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.meta.yml new file mode 100644 index 00000000000..189b974b11a --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.meta.yml @@ -0,0 +1,42 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Antibiotics + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + total_pathogen_bloodstream_amr: + variables: + amr_attributable_deaths: + title: Total deaths from bloodstream infections attributed to AMR, by pathogen + unit: deaths + description_short: Estimated number of deaths from bloodstream infections that are attributed to antimicrobial resistance. + presentation: + title_public: Total deaths from bloodstream infections attributed to AMR, by pathogen + display: + roundingMode: significantFigures + numSignificantFigures: 3 + non_amr_attributable_deaths: + title: Total global deaths from bloodstream infections not attributed to AMR, by pathogen + unit: deaths + description_short: Estimated number of deaths from bloodstream infections that are not attributed to antimicrobial resistance. 
+ presentation: + title_public: Total global deaths from bloodstream infections not attributed to AMR, by pathogen + display: + roundingMode: significantFigures + numSignificantFigures: 3 + total_deaths: + title: Total global deaths from bloodstream infections + unit: deaths + description_short: Estimated number of deaths from bloodstream infections. + presentation: + title_public: Total global deaths from bloodstream infections + display: + roundingMode: significantFigures + numSignificantFigures: 3 diff --git a/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.py b/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.py new file mode 100644 index 00000000000..6c09aa1c119 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.py @@ -0,0 +1,47 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("total_pathogen_bloodstream_amr") + ds_total = paths.load_dataset("total_pathogen_bloodstream") + # Read table from meadow dataset. + tb = ( + ds_meadow.read("total_pathogen_bloodstream_amr") + .drop(columns=["upper", "lower"]) + .rename(columns={"value": "amr_attributable_deaths"}) + ) + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + tb_total = ( + ds_total.read("total_pathogen_bloodstream") + .drop(columns=["upper", "lower"]) + .rename(columns={"value": "total_deaths"}) + ) + + tb = tb.merge(tb_total, on=["country", "year", "pathogen", "pathogen_type"], how="inner") + tb["non_amr_attributable_deaths"] = tb["total_deaths"] - tb["amr_attributable_deaths"] + # Process data. + tb = tb.drop(columns=["country", "pathogen_type"]).rename(columns={"pathogen": "country"}) + + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() diff --git a/etl/steps/data/garden/antibiotics/2024-12-03/glass_enrolment.countries.json b/etl/steps/data/garden/antibiotics/2024-12-03/glass_enrolment.countries.json new file mode 100644 index 00000000000..ad9139b95a6 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-03/glass_enrolment.countries.json @@ -0,0 +1,199 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia (Plurinational State of)": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "C\u00f4te d\u2019Ivoire": "Cote d'Ivoire", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "New Zealand": "New Zealand", + "Nicaragua": 
"Nicaragua", + "Nigeria": "Nigeria", + "Niue": "Niue", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Palestine": "Palestine", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkmenistan": "Turkmenistan", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "Viet Nam": "Vietnam", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Comoros ": "Comoros", + "Congo ": "Congo", + "Democratic People\u2019s Republic of Korea": "North Korea", + "Hong Kong SAR (China)": "Hong Kong", + "Iran (Islamic Republic)": "Iran", + "Lao People\u2019s Democratic Republic": "Laos", + "Netherlands ": "Netherlands", + "Niger ": "Niger", + "Philippines ": "Philippines", + "Serbia ": "Serbia", + "Sudan ": "Sudan", + "T\u00fcrkiye": "Turkey" +} \ No newline at end of file diff --git a/etl/steps/data/garden/antibiotics/2024-12-03/glass_enrolment.meta.yml b/etl/steps/data/garden/antibiotics/2024-12-03/glass_enrolment.meta.yml new file mode 100644 index 00000000000..d830f8aaebc --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-03/glass_enrolment.meta.yml @@ -0,0 +1,20 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Antibiotics + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + glass_enrolment: + variables: + enrolment: + title: Which countries have enrolled in the Global Antimicrobial Resistance Surveillance System (GLASS)? + description_short: "Global Antimicrobial Resistance Surveillance System participation status of each country." 
+      unit: ""
+      short_unit: ""
diff --git a/etl/steps/data/garden/antibiotics/2024-12-03/glass_enrolment.py b/etl/steps/data/garden/antibiotics/2024-12-03/glass_enrolment.py
new file mode 100644
index 00000000000..06c4fc342e6
--- /dev/null
+++ b/etl/steps/data/garden/antibiotics/2024-12-03/glass_enrolment.py
@@ -0,0 +1,72 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+import numpy as np
+import pandas as pd
+from owid.catalog import Table
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("glass_enrolment")
+
+    # Read table from meadow dataset.
+    tb = ds_meadow.read("glass_enrolment")
+
+    #
+    # Process data.
+    #
+    tb = geo.harmonize_countries(tb, countries_file=paths.country_mapping_path)
+    origins = tb["amc"].metadata.origins
+
+    # Keep only the columns needed to derive the enrolment status.
+    tb = tb[["country", "year", "amr", "amc"]]
+    # Check there are no unexpected values; the columns should contain only Y and NA.
+    assert len(tb["amr"].unique()) == 2, "amr column should have only two unique values"
+    assert len(tb["amc"].unique()) == 2, "amc column should have only two unique values"
+    tb = combine_data(tb)
+    tb = tb.drop(columns=["amr", "amc"])
+    tb["enrolment"].metadata.origins = origins
+
+    tb = tb.format(["country", "year"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
+
+
+def combine_data(tb: Table) -> Table:
+    """Combine the amr and amc columns into a single enrolment column."""
+    # Define conditions.
+    tb["amr"] = tb["amr"].fillna("")
+    tb["amc"] = tb["amc"].fillna("")
+    conditions = [
+        (tb["amr"] == "Y") & (tb["amc"] == "Y"),  # Both AMR and AMC
+        (tb["amr"] == "Y") & (tb["amc"] != "Y"),  # AMR only
+        (tb["amr"] != "Y") & (tb["amc"] == "Y"),  # AMC only
+        (tb["amr"] != "Y") & (tb["amc"] != "Y"),  # Neither
+    ]
+
+    # Define corresponding outputs.
+    choices = ["Both", "AMR only", "AMC only", "Neither"]
+
+    # Apply row-wise conditions.
+    tb["enrolment"] = np.select(conditions, choices, default=pd.NA)
+    assert all(tb["enrolment"].notna()), "There should be no missing values in the enrolment column"
+
+    return tb
diff --git a/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens.countries.json b/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens.countries.json
new file mode 100644
index 00000000000..a7f8eced782
--- /dev/null
+++ b/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens.countries.json
@@ -0,0 +1,3 @@
+{
+  "Global": "World"
+}
\ No newline at end of file
diff --git a/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens.meta.yml b/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens.meta.yml
new file mode 100644
index 00000000000..e29b06ed86b
--- /dev/null
+++ b/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens.meta.yml
@@ -0,0 +1,46 @@
+# NOTE: To learn more about the fields, hover over their names.
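The `pathogen_type` definition that opens this metadata file (like the one in `total_pathogen_bloodstream.meta.yml` above) uses Jinja tags with the `<% %>`/`<< >>` delimiters seen throughout these files to turn the source's plural category names into singular nouns. A minimal sketch of how such a template resolves, using plain `jinja2` configured with those delimiters (the ETL's actual rendering pipeline is not shown here, so treat this as an approximation):

from jinja2 import Environment

# Mirror the <% %> / << >> delimiters used in the metadata files.
env = Environment(
    block_start_string="<%",
    block_end_string="%>",
    variable_start_string="<<",
    variable_end_string=">>",
)
template = env.from_string(
    '<% if pathogen_type == "Fungi" %>fungus'
    '<% elif pathogen_type == "Viruses" %>virus'
    "<% else %><< pathogen_type.lower() >><% endif %>"
)
for pathogen_type in ["Fungi", "Viruses", "Bacteria"]:
    print(pathogen_type, "->", template.render(pathogen_type=pathogen_type))
# Fungi -> fungus, Viruses -> virus, Bacteria -> bacteria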
+definitions:
+  common:
+    presentation:
+      topic_tags:
+        - Antibiotics
+  pathogen_type: <% if pathogen_type == "Fungi" %>fungus<% elif pathogen_type == "Viruses" %>virus<% else %><< pathogen_type.lower() >><% endif %>
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+dataset:
+  update_period_days: 365
+
+tables:
+  microbe_total_pathogens:
+    variables:
+      value:
+        title: Total deaths from << pathogen >> infections
+        unit: deaths
+        description_short: Estimated number of deaths from << pathogen >> infections. << pathogen >> is a {definitions.pathogen_type}.
+        presentation:
+          title_public: Total deaths from << pathogen >> infections
+        display:
+          roundingMode: significantFigures
+          numSignificantFigures: 3
+          name: << pathogen >>
+      upper:
+        title: Upper bound of total deaths from << pathogen >> infections
+        unit: deaths
+        description_short: Estimated number of deaths from << pathogen >> infections. << pathogen >> is a {definitions.pathogen_type}.
+        presentation:
+          title_public: Upper bound of total deaths from << pathogen >> infections
+        display:
+          roundingMode: significantFigures
+          numSignificantFigures: 3
+          name: << pathogen >>
+      lower:
+        title: Lower bound of total deaths from << pathogen >> infections
+        unit: deaths
+        description_short: Estimated number of deaths from << pathogen >> infections. << pathogen >> is a {definitions.pathogen_type}.
+        presentation:
+          title_public: Lower bound of total deaths from << pathogen >> infections
+        display:
+          roundingMode: significantFigures
+          numSignificantFigures: 3
+          name: << pathogen >>
diff --git a/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens.py b/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens.py
new file mode 100644
index 00000000000..b6883c23419
--- /dev/null
+++ b/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens.py
@@ -0,0 +1,35 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("microbe_total_pathogens")
+
+    # Read table from meadow dataset.
+    tb = ds_meadow.read("microbe_total_pathogens")
+
+    #
+    # Process data.
+    #
+    tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)
+    tb = tb.format(["country", "year", "pathogen", "pathogen_type"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+    )
+
+    # Save changes in the new garden dataset.
+ ds_garden.save() diff --git a/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens_amr.countries.json b/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens_amr.countries.json new file mode 100644 index 00000000000..a7f8eced782 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens_amr.countries.json @@ -0,0 +1,3 @@ +{ + "Global": "World" +} \ No newline at end of file diff --git a/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens_amr.meta.yml b/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens_amr.meta.yml new file mode 100644 index 00000000000..2edb137309c --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens_amr.meta.yml @@ -0,0 +1,42 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Antibiotics + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + microbe_total_pathogens_amr: + variables: + amr_attributable_deaths: + title: Total deaths from infections attributed to AMR, by pathogen + unit: deaths + description_short: Estimated number of deaths from infections that are attributed to antimicrobial resistance. + presentation: + title_public: Total deaths from infections attributed to AMR, by pathogen + display: + roundingMode: significantFigures + numSignificantFigures: 3 + non_amr_attributable_deaths: + title: Total global deaths from infections not attributed to AMR, by pathogen + unit: deaths + description_short: Estimated number of deaths from infections that are not attributed to antimicrobial resistance. + presentation: + title_public: Total global deaths from infections not attributed to AMR, by pathogen + display: + roundingMode: significantFigures + numSignificantFigures: 3 + total_deaths: + title: Total global deaths from infections + unit: deaths + description_short: Estimated number of deaths from infections. + presentation: + title_public: Total global deaths from infections + display: + roundingMode: significantFigures + numSignificantFigures: 3 diff --git a/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens_amr.py b/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens_amr.py new file mode 100644 index 00000000000..0c71e6b28f7 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-04/microbe_total_pathogens_amr.py @@ -0,0 +1,53 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("microbe_total_pathogens_amr") + ds_total = paths.load_dataset("microbe_total_pathogens") + + # Read table from meadow dataset. + tb = ( + ds_meadow.read("microbe_total_pathogens_amr") + .drop(columns=["upper", "lower"]) + .rename(columns={"value": "amr_attributable_deaths"}) + ) + tb_total = ( + ds_total.read("microbe_total_pathogens") + .drop(columns=["upper", "lower"]) + .rename(columns={"value": "total_deaths"}) + ) + # + # Process data. 
+ # + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + ) + + tb = tb.merge(tb_total, on=["country", "year", "pathogen", "pathogen_type"], how="inner") + tb["non_amr_attributable_deaths"] = tb["total_deaths"] - tb["amr_attributable_deaths"] + # Process data. + tb = tb.drop(columns=["country", "pathogen_type"]).rename(columns={"pathogen": "country"}) + + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/garden/antibiotics/2024-12-05/microbe_neonatal_total_amr.countries.json b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_neonatal_total_amr.countries.json new file mode 100644 index 00000000000..a7f8eced782 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_neonatal_total_amr.countries.json @@ -0,0 +1,3 @@ +{ + "Global": "World" +} \ No newline at end of file diff --git a/etl/steps/data/garden/antibiotics/2024-12-05/microbe_neonatal_total_amr.meta.yml b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_neonatal_total_amr.meta.yml new file mode 100644 index 00000000000..c9de44d96cd --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_neonatal_total_amr.meta.yml @@ -0,0 +1,42 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Antibiotics + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + microbe_neonatal_total_amr: + variables: + amr_attributable_deaths: + title: Neonatal deaths from infections attributed to AMR, by pathogen + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from infections that are attributed to antimicrobial resistance. + presentation: + title_public: Neonatal deaths from infections attributed to AMR, by pathogen + display: + roundingMode: significantFigures + numSignificantFigures: 3 + non_amr_attributable_deaths: + title: Neonatal global deaths from infections not attributed to AMR, by pathogen + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from infections that are not attributed to antimicrobial resistance. + presentation: + title_public: Neonatal global deaths from infections not attributed to AMR, by pathogen + display: + roundingMode: significantFigures + numSignificantFigures: 3 + total_deaths: + title: Neonatal global deaths from infections + unit: deaths + description_short: Estimated number of [neonates](#dod:neonatal) – newborns under 28 days of age –  who die each year from infections. 
+ presentation: + title_public: Neonatal global deaths from infections + display: + roundingMode: significantFigures + numSignificantFigures: 3 diff --git a/etl/steps/data/garden/antibiotics/2024-12-05/microbe_neonatal_total_amr.py b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_neonatal_total_amr.py new file mode 100644 index 00000000000..0bbaa192ae6 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_neonatal_total_amr.py @@ -0,0 +1,45 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("microbe_neonatal_total_amr") + ds_total = paths.load_dataset("microbe") + + # Read table from meadow dataset. + tb = ( + ds_meadow.read("microbe_neonatal_total_amr") + .drop(columns=["upper", "lower"]) + .rename(columns={"value": "amr_attributable_deaths"}) + ) + tb_total = ds_total.read("microbe").drop(columns=["upper", "lower"]).rename(columns={"value": "total_deaths"}) + + # + # Process data. + # + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + tb = tb.merge(tb_total, on=["country", "year", "pathogen", "pathogen_type"], how="inner") + tb["non_amr_attributable_deaths"] = tb["total_deaths"] - tb["amr_attributable_deaths"] + tb = tb.drop(columns=["country", "pathogen_type"]).rename(columns={"pathogen": "country"}) + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.countries.json b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.countries.json new file mode 100644 index 00000000000..a7f8eced782 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.countries.json @@ -0,0 +1,3 @@ +{ + "Global": "World" +} \ No newline at end of file diff --git a/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.meta.yml b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.meta.yml new file mode 100644 index 00000000000..37ceb3b7b84 --- /dev/null +++ b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.meta.yml @@ -0,0 +1,45 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Antibiotics + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + microbe_total_deaths_by_syndrome: + variables: + value: + title: Total deaths from << infectious_syndrome.lower() >> + unit: deaths + description_short: Estimated number of deaths from << infectious_syndrome.lower() >>. 
+        presentation:
+          title_public: Total deaths from << infectious_syndrome.lower() >>
+        display:
+          roundingMode: significantFigures
+          numSignificantFigures: 3
+          name: << infectious_syndrome >>
+      lower:
+        title: Lower bound of deaths from << infectious_syndrome.lower() >>
+        unit: deaths
+        description_short: Estimated number of deaths from << infectious_syndrome.lower() >>.
+        presentation:
+          title_public: Lower bound of deaths from << infectious_syndrome.lower() >>
+        display:
+          roundingMode: significantFigures
+          numSignificantFigures: 3
+          name: << infectious_syndrome >>
+      upper:
+        title: Upper bound of deaths from << infectious_syndrome.lower() >>
+        unit: deaths
+        description_short: Estimated number of deaths from << infectious_syndrome.lower() >>.
+        presentation:
+          title_public: Upper bound of deaths from << infectious_syndrome.lower() >>
+        display:
+          roundingMode: significantFigures
+          numSignificantFigures: 3
+          name: << infectious_syndrome >>
diff --git a/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.py b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.py
new file mode 100644
index 00000000000..1659dd7480e
--- /dev/null
+++ b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.py
@@ -0,0 +1,35 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("microbe_total_deaths_by_syndrome")
+
+    # Read table from meadow dataset.
+    tb = ds_meadow.read("microbe_total_deaths_by_syndrome")
+
+    #
+    # Process data.
+    #
+    tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)
+    tb = tb.format(["country", "year", "infectious_syndrome"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
diff --git a/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.countries.json b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.countries.json
new file mode 100644
index 00000000000..40711a8e695
--- /dev/null
+++ b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.countries.json
@@ -0,0 +1,3 @@
+{
+  "Global": "World"
+}
diff --git a/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.meta.yml b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.meta.yml
new file mode 100644
index 00000000000..55fdd39bf53
--- /dev/null
+++ b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.meta.yml
@@ -0,0 +1,42 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  common:
+    presentation:
+      topic_tags:
+        - Antibiotics
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+dataset:
+  update_period_days: 365
+
+tables:
+  microbe_total_deaths_by_syndrome_amr:
+    variables:
+      amr_attributable_deaths:
+        title: Total deaths from infections attributed to AMR, by syndrome
+        unit: deaths
+        description_short: Estimated number of deaths each year from infections that are attributed to antimicrobial resistance.
+        presentation:
+          title_public: Total deaths from infections attributed to AMR, by syndrome
+        display:
+          roundingMode: significantFigures
+          numSignificantFigures: 3
+      non_amr_attributable_deaths:
+        title: Total deaths from infections not attributed to AMR, by syndrome
+        unit: deaths
+        description_short: Estimated number of deaths each year from infections that are not attributed to antimicrobial resistance.
+        presentation:
+          title_public: Total deaths from infections not attributed to AMR, by syndrome
+        display:
+          roundingMode: significantFigures
+          numSignificantFigures: 3
+      total_deaths:
+        title: Total deaths from infections, by syndrome
+        unit: deaths
+        description_short: Estimated number of deaths each year from infections.
+        presentation:
+          title_public: Total deaths from infections, by syndrome
+        display:
+          roundingMode: significantFigures
+          numSignificantFigures: 3
diff --git a/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.py b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.py
new file mode 100644
index 00000000000..ef3048c297c
--- /dev/null
+++ b/etl/steps/data/garden/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.py
@@ -0,0 +1,93 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+from owid.catalog import Table
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("microbe_total_deaths_by_syndrome_amr")
+    ds_total = paths.load_dataset("microbe_total_deaths_by_syndrome")
+
+    # Read table from meadow dataset.
+    tb = (
+        ds_meadow.read("microbe_total_deaths_by_syndrome_amr")
+        .drop(columns=["upper", "lower"])
+        .rename(columns={"value": "amr_attributable_deaths"})
+    )
+
+    tb_total = (
+        ds_total.read("microbe_total_deaths_by_syndrome")
+        .drop(columns=["upper", "lower"])
+        .rename(columns={"value": "total_deaths"})
+    )
+    #
+    # Process data.
+    #
+    tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)
+
+    tb = tb.merge(tb_total, on=["country", "year", "infectious_syndrome"], how="inner")
+
+    tb = rename_syndromes(tb)
+    tb["non_amr_attributable_deaths"] = tb["total_deaths"] - tb["amr_attributable_deaths"]
+    tb = tb.drop(columns=["country"]).rename(columns={"infectious_syndrome": "country"})
+    tb = tb.format(["country", "year"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
+
+
+def rename_syndromes(tb: Table) -> Table:
+    """
+    Rename syndromes to be shorter for use in stacked bar charts.
+    Ensure all infectious syndromes are replaced.
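+    Raises a ValueError listing any syndromes in the data that are missing from the mapping, so new source categories are caught early.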
+ """ + name_dict = { + "Bloodstream infections": "Bloodstream infections", + "Lower respiratory infections": "Lower respiratory infections", + "Diarrhea": "Diarrhea", + "Meningitis": "Meningitis", + "Infections of the skin and subcutaneous systems": "Skin infections", + "Urinary tract infections and pyelonephritis": "Kidney and urinary tract infections", + "Peritoneal and intra-abdominal infections": "Abdominal infections", + "Tuberculosis": "Tuberculosis", + "Endocarditis": "Endocarditis", + "Typhoid fever, paratyphoid fever, and invasive non-typhoidal Salmonella": "Typhoid, paratyphoid, and iNTS", + "Infections of bones, joints, and related organs": "Bone and joint infections", + "Other unspecified site infections": "Other infections", + "Other parasitic infections": "Other parasitic infections", + "Oral infections": "Oral infections", + "Myelitis, meningoencephalitis, and other central nervous system infections": "Central nervous system infections", + "Upper respiratory infections": "Upper respiratory infections", + "Hepatitis": "Hepatitis", + "Eye infections": "Eye infections", + "Encephalitis": "Encephalitis", + "Carditis, myocarditis, and pericarditis": "Heart inflammation", + "Sexually transmitted infections": "Sexually transmitted infections", + } + + # Find unmatched syndromes + unmatched_syndromes = set(tb["infectious_syndrome"].unique()) - set(name_dict.keys()) + if unmatched_syndromes: + raise ValueError(f"The following syndromes were not found in the name dictionary: {unmatched_syndromes}") + + # Replace syndromes + tb["infectious_syndrome"] = tb["infectious_syndrome"].replace(name_dict) + + return tb diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py index 0fc64c8658b..0f048b3d1a4 100644 --- a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py +++ b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py @@ -49,8 +49,8 @@ def run(dest_dir: str) -> None: # Process data. # tb: Table = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) - tb["released_national_strategy_on_ai"] = tb["released_national_strategy_on_ai"].replace( - {0: "In development", 1: "Released"} + tb["released_national_strategy_on_ai"] = ( + tb["released_national_strategy_on_ai"].astype("string").replace({"0": "In development", "1": "Released"}) ) df_merged = pd.merge(countries_national_ai, tb, on=["country", "year"], how="outer") df_merged.sort_values(by=["year"], inplace=True) diff --git a/etl/steps/data/garden/artificial_intelligence/2024-02-05/chess.py b/etl/steps/data/garden/artificial_intelligence/2024-02-05/chess.py index cd28ee54c9f..782d55e52da 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-02-05/chess.py +++ b/etl/steps/data/garden/artificial_intelligence/2024-02-05/chess.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("chess.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
diff --git a/etl/steps/data/garden/artificial_intelligence/2024-02-15/epoch_llms.py b/etl/steps/data/garden/artificial_intelligence/2024-02-15/epoch_llms.py index 95e712208a4..ab12e32d1a0 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-02-15/epoch_llms.py +++ b/etl/steps/data/garden/artificial_intelligence/2024-02-15/epoch_llms.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("epoch_llms.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) tb["training_computation_petaflop"] = tb["Approx Compute (FLOP)"] / 1e15 tb = tb.drop("Approx Compute (FLOP)", axis=1) tb["MMLU avg"] *= 100 diff --git a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_adoption.py b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_adoption.py index 1f0dad66ee1..8b203a8fa7a 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_adoption.py +++ b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_adoption.py @@ -11,7 +11,7 @@ def run(dest_dir: str) -> None: # - # Load inputs. + # Load inputs # # Retrieve snapshots for 2024 and 2023 data. snap_2024 = paths.load_snapshot("ai_adoption.csv") @@ -22,14 +22,14 @@ def run(dest_dir: str) -> None: tb_2023 = snap_2023.read() # - # Process data. + # Process data # tb_2023["% of Respondents"] *= 100 tb_2023 = tb_2023.rename(columns={"Geographic Area": "country", "% of Respondents": "pct_of_respondents"}) # Select the rows where 'year' is 2021 as 2022 is already in 2024 AI index data tb_2021 = tb_2023[tb_2023["Year"] == 2021].copy() - tb_2024["% of respondents"] = tb_2024["% of respondents"].str.replace("%", "") + tb_2024["% of respondents"] = tb_2024["% of respondents"].str.replace("%", "").astype(float) tb_2024 = tb_2024.rename(columns={"Geographic Area": "country", "% of respondents": "pct_of_respondents"}) tb_2021.rename(columns={"Geographic Area": "country"}, inplace=True) diff --git a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_bills.py b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_bills.py index d860f1203e5..cc7cc67893e 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_bills.py +++ b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_bills.py @@ -15,7 +15,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("ai_bills.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_conferences.py b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_conferences.py index d40f3501779..e62445c57bf 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_conferences.py +++ b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_conferences.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("ai_conferences.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_incidents.py b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_incidents.py index ba3d993c010..7f905159d3c 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_incidents.py +++ b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_incidents.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("ai_incidents.csv") # Load data from snapshot. 
- tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_investment.py b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_investment.py index f21d31f921d..293a4730873 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_investment.py +++ b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_investment.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("ai_investment.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # Read US consumer prices table from garden dataset. ds_us_cpi = paths.load_dataset("us_consumer_prices") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_jobs.py b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_jobs.py index fd6d09fe6f6..c018788b426 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_jobs.py +++ b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_jobs.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("ai_jobs.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_phds.py b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_phds.py index a2ef629ef26..b7b9b6481f4 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_phds.py +++ b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_phds.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("ai_phds.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_robots.py b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_robots.py index 5c45df490f9..6a90efd91af 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_robots.py +++ b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_robots.py @@ -15,7 +15,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("ai_robots.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_strategies.py b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_strategies.py index eaaeccbfe64..74f78b73247 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_strategies.py +++ b/etl/steps/data/garden/artificial_intelligence/2024-06-28/ai_strategies.py @@ -18,7 +18,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("ai_strategies.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/garden/artificial_intelligence/2024-07-16/cset.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-07-16/cset.meta.yml index 7cd567eab30..2faebe8865c 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-07-16/cset.meta.yml +++ b/etl/steps/data/garden/artificial_intelligence/2024-07-16/cset.meta.yml @@ -9,13 +9,13 @@ definitions: numDecimalPlaces: 0 description_processing_investment: |- - - Reporting a time series of AI investments in nominal prices (i.e., without adjusting for inflation) means it makes little sense to compare observations across time; it is therefore not very useful. 
To make comparisons across time possible, one has to take into account that prices change (e.g., there is inflation). - - It is not obvious how to adjust this time series for inflation, and we debated it at some length within our team. - - It would be straightforward to adjust the time series for price changes if we knew the prices of the specific goods and services that these investments purchased. This would make it possible to calculate a volume measure of AI investments, and it would tell us how much these investments bought. But such a metric is not available. While a comprehensive price index is not available, we know that the cost for some crucial AI technology has fallen rapidly in price. - - In the absence of a comprehensive price index that captures the price of AI-specific goods and services, one has to rely on one of the available metrics for the price of a bundle of goods and services. In the end we decided to use the US Consumer Price Index (CPI). - - The US CPI does not provide us with a volume measure of AI goods and services, but it does capture the opportunity costs of these investments. The inflation adjustment of this time series of AI investments therefore lets us understand the size of these investments relative to whatever else these sums of money could have purchased. + - Reporting a time series of AI investments in nominal prices would make it difficult to compare observations across time. To make these comparisons possible, one has to take into account that prices change (inflation). + - It is not obvious how to adjust this time series for inflation, and our team discussed the best solutions at our disposal. + - It would be straightforward to adjust the time series for price changes if we knew the prices of the specific goods and services purchased through these investments. This would make it possible to calculate a volume measure of AI investments and tell us how much these investments bought. But such a metric is not available. While a comprehensive price index is not available, we know that the cost of some crucial AI technology has fallen rapidly in price. + - In the absence of a comprehensive price index that captures the price of AI-specific goods and services, one has to rely on one of the available metrics for the price of a bundle of goods and services. Ultimately, we decided to use the US Consumer Price Index (CPI). + - The US CPI does not provide us with a volume measure of AI goods and services, but it does capture the opportunity costs of these investments. The inflation adjustment of this time series of AI investments, therefore, lets us understand the size of these investments relative to whatever else these sums of money could have purchased. - description_short_investment: Only includes private-market investment flows, such as venture capital; excludes all investment in publicly traded companies, such as the "Big Tech" firms. This data is expressed in US dollars, adjusted for inflation. + description_short_investment: Only includes private-market investment such as venture capital; excludes all investment in publicly traded companies, such as "Big Tech" firms. This data is expressed in US dollars, adjusted for inflation. description_short_articles: English- and Chinese-language scholarly publications related to the development and application of AI. This includes journal articles, conference papers, repository publications (such as arXiv), books, and theses. 
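The CPI adjustment described in `description_processing_investment` above amounts to deflating each nominal figure by the ratio of base-year CPI to that year's CPI. A minimal sketch with made-up numbers (illustrative only; the real step pulls the CPI series from the `us_consumer_prices` garden dataset, as seen in `ai_investment.py` above):

import pandas as pd

# Illustrative values only; not the actual CPI or investment figures.
investment = pd.DataFrame({"year": [2020, 2023], "nominal_usd": [40e9, 60e9]})
cpi = pd.Series({2020: 258.8, 2023: 304.7})
base_year = 2023

# Deflate to constant base-year dollars: nominal * (CPI_base / CPI_year).
investment["real_usd"] = investment["nominal_usd"] * cpi[base_year] / investment["year"].map(cpi)
print(investment)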
@@ -35,12 +35,12 @@ definitions: World aggregate does not include data for Micronesia, Tonga, Samoa, Kiribati, Fiji, Papua New Guinea, Palau, Tuvalu, Bermuda, Armenia, Belarus, Georgia, Gibraltar, Jersey, Kosovo, Moldova, Isle of Man, Iceland, Albania, Andorra, Bosnia and Herzegovina, Malta, Montenegro, San Marino, North Macedonia, Liechtenstein, Monaco, Vatican City, Guernsey, Afghanistan, Kyrgyzstan, Bahrain, Laos, Bangladesh, Lebanon, Bhutan, Maldives, Cambodia, Syria, Tajikistan, Cyprus, Mongolia, North Korea, Myanmar, Timor-Leste, Nepal, Turkmenistan, Pakistan, Palestine, Iraq, United Arab Emirates, Uzbekistan, Kazakhstan, Qatar, Vietnam, Yemen, Kuwait, Algeria, Cape Verde, Equatorial Guinea, Swaziland, Namibia, Central African Republic (the), Angola, Ethiopia, Niger, Benin, Gabon, Nigeria, Botswana, Gambia, Rwanda, Burkina Faso, Ghana, São Tomé and Príncipe, Burundi, Guinea, Senegal, Guinea-Bissau, Seychelles, Cameroon, Sierra Leone, Lesotho, Somalia, Chad, Liberia, Libya, South Sudan, Congo, Madagascar, Sudan, Côte d'Ivoire, Malawi, Togo, Mali, Djibouti, Mauritania, Uganda, Egypt, Mauritius, Tanzania, Zambia, Eritrea, Mozambique, Zimbabwe, Comoros, Antigua and Barbuda, Bolivia, Suriname, Nicaragua, Dominican Republic, Bahamas, Ecuador, Paraguay, Barbados, Saint Vincent and the Grenadines, El Salvador, Belize, Grenada, Saint Kitts and Nevis, Guatemala, Guyana, Haiti, Honduras, Trinidad and Tobago, Jamaica, Venezuela, Puerto Rico, Cayman Islands (the), Turks and Caicos Islands, Saint Lucia, and Dominica. description_key_investment: &description_key_investment |- - - The dataset only covers private-market investment flows, such as venture capital. It excludes non-equity financing, such as debt and grants, and omits publicly traded companies, including major Big Tech firms (e.g., Amazon, Microsoft, Meta). As a result, significant investments from public companies, corporate R&D, government funding, and broader infrastructure costs (like data centers and hardware) are not captured, limiting the dataset’s coverage of global AI investments. + - The data likely underestimates total global AI investment, as it only captures certain types of private equity transactions, excluding other significant channels and categories of AI-related spending. + - The dataset only covers private-market investment such as venture capital. It excludes non-equity financing, such as debt and grants, and publicly traded companies, including major Big Tech firms. As a result, significant investments from public companies, corporate R&D, government funding, and broader infrastructure costs (like data centers and hardware) are not captured, limiting the data's coverage of global AI investments. + - The data's "World" aggregate reflects the total investment represented in the data, but may not represent global AI efforts comprehensively, especially in countries not included in the data. - Companies are classified as AI-related based on keyword and industry tags, potentially including firms not traditionally seen as AI-focused while missing others due to definitional differences. - Many investment values are undisclosed, so the source relies on median values from similar transactions, introducing some uncertainty. Additionally, investment origin is attributed to company headquarters, which may overlook cross-border structures or varied investor origins. 
- - One-time events like large acquisitions can skew yearly figures, and macroeconomic conditions (e.g., interest rates, market sentiment) may impact trends independently of AI-related dynamics. - - The dataset’s "World" aggregate reflects the total investment represented but does not encompass global AI efforts comprehensively, especially in countries not included in the data. - - The dataset likely underestimates the total global AI investment, as it only captures certain types of private equity transactions, excluding other significant channels and categories of AI-related spending. + - One-time events, such as large acquisitions, can distort yearly figures, while broader economic factors like interest rates and market sentiment can influence investment trends independently of AI-specific developments. # Learn more about the available fields: # http://docs.owid.io/projects/etl/architecture/metadata/reference/ diff --git a/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch.meta.yml new file mode 100644 index 00000000000..f8d02877235 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch.meta.yml @@ -0,0 +1,98 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 31 + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch: + variables: + domain: + title: Domain + unit: '' + short_unit: '' + description_short: Refers to the specific area, application, or field in which an AI system is designed to operate. + description_processing: |- + In cases where multiple domains were associated with a system, we consolidated these entries under the label "Multiple domains". We also identified domains associated with fewer than 20 notable systems and grouped these under the category 'Other'. + display: + zeroDay: '1949-01-01' + yearIsDay: true + + organization_categorization: + title: Researcher affiliation + unit: '' + short_unit: '' + description_short: Describes the sector where the authors of an AI system have their primary affiliations. + description_from_producer: |- + Systems are categorized as “Industry” if their authors are affiliated with private sector organizations, “Academia” if the authors are affiliated with universities or academic institutions, or “Industry - Academia Collaboration” when at least 30% of the authors are from each. + + parameters: + title: Number of parameters + unit: '' + description_short: Total number of learnable variables or weights that the model contains. Parameters are adjusted during the training process to optimize the model's performance. + description_key: + - Parameters are internal variables that machine learning models adjust during their training process to improve their ability to make accurate predictions. They act as the model's "knobs" that are fine-tuned based on the provided data. In deep learning, a subset of artificial intelligence (AI), parameters primarily consist of the weights assigned to the connections between the small processing units called neurons. Picture a vast network of interconnected neurons where the strength of each connection represents a parameter. 
+ + - The total number of parameters in a model is influenced by various factors. The model's structure and the number of “layers” of neurons play a significant role. Generally, more complex models with additional layers tend to have a higher number of parameters. Special components of specific deep learning architectures can further contribute to the overall parameter count. + + - Understanding the number of parameters in a model is crucial to design effective models. More parameters can help the model understand complex data patterns, potentially leading to higher accuracy. However, there's a fine balance to strike. If a model has too many parameters, it risks memorizing the specific examples in its training data rather than learning their underlying patterns. Consequently, it may perform poorly when presented with new, unseen data. Achieving the right balance of parameters is a critical consideration in model development. + + - In recent times, the AI community has witnessed the emergence of what are often referred to as "giant models." These models boast an astounding number of parameters, reaching into the billions or even trillions. While these huge models have achieved remarkable performance, they have a significant computational cost. Effectively managing and training such large-scale models has become a prominent and active area of research and discussion within the AI field. + + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + + training_dataset_size__datapoints: + title: Training dataset size + unit: 'datapoints' + description_short: The number of examples provided to train an AI model. Typically, more data results in a more comprehensive understanding by the model. + description_key: + - Training data size refers to the volume of data employed to train an artificial intelligence (AI) model effectively. It's a representation of the number of examples that the model learns from during its training process. It is a fundamental measure of the scope of the data used in the model's learning phase. + + - To grasp the concept of training data size, imagine teaching a friend the art of distinguishing different types of birds. In this analogy, each bird picture presented to your friend corresponds to an individual piece of training data. If you showed them 100 unique bird photos, then the training data size in this scenario would be quantified as 100. + + - Training data size is an essential indicator in AI and machine learning. First and foremost, it directly impacts the depth of learning achieved by the model. The more extensive the dataset, the more profound and comprehensive the model's understanding of the subject matter becomes. Additionally, a large training data size contributes significantly to improved recognition capabilities. By exposing the model to a diverse array of examples, it becomes adept at identifying subtle nuances, much like how it becomes skilled at distinguishing various bird species through exposure to a large variety of bird images. + + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + + training_computation_petaflop: + title: Training computation (petaFLOP) + unit: 'petaFLOP' + description_short: Computation is measured in total petaFLOP, which is 10¹⁵ [floating-point operations](#dod:flop) estimated from AI literature, albeit with some uncertainty. + description_key: + - In the context of artificial intelligence (AI), training computation is predominantly measured using floating-point operations or “FLOP”. 
One FLOP represents a single arithmetic operation involving floating-point numbers, such as addition, subtraction, multiplication, or division. To adapt to the vast computational demands of AI systems, the measurement unit of petaFLOP is commonly used. One petaFLOP stands as a staggering one quadrillion FLOPs, underscoring the magnitude of computational operations within AI. + + - Modern AI systems are rooted in machine learning and deep learning techniques. These methodologies are notorious for their computational intensity, involving complex mathematical processes and algorithms. During the training phase, AI models process large volumes of data, while continuously adapting and refining their parameters to optimize performance, rendering the training process computationally intensive. + + - Many factors influence the magnitude of training computation within AI systems. Notably, the size of the dataset employed for training significantly impacts the computational load. Larger datasets necessitate more processing power. The complexity of the model's architecture also plays a pivotal role; more intricate models lead to more computations. Parallel processing, involving the simultaneous use of multiple processors, also has a substantial effect. Beyond these factors, specific design choices and other variables further contribute to the complexity and scale of training computation within AI. + + description_processing: Training computation was converted from its original measurement in FLOPs (floating-point operations) to a more manageable unit known as petaFLOPs. This conversion is performed by dividing the original training compute value by 1e15, which represents one quadrillion (10^15). The purpose of this conversion is to provide a more human-readable and practical representation of the immense computational efforts involved in training AI systems. By expressing the training computation in petaFLOPs, it becomes easier to grasp the scale and magnitude of the computational resources required for training these systems, especially when dealing with large datasets and complex architectures. + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + presentation: + grapher_config: + title: Training computation + + publication_date: + title: Publication date + unit: '' + description_short: The date when the AI system was first published. + description_from_producer: The publication, announcement, or release date of the model, in YYYY-MM-DD format. If the year and month are known but the day is unknown, the day is filled in as YYYY-MM-15. If the year is known but the month and day are unknown, the month and day are filled in as YYYY-07-01. + + + diff --git a/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch.py b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch.py new file mode 100644 index 00000000000..764785f5ced --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch.py @@ -0,0 +1,144 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch.start") + + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("epoch") + + # Read table from meadow dataset. + tb = ds_meadow["epoch"] + tb = tb.reset_index() + + # + # Process data. 
+ # + # Filter notable systems by selecting rows where 'notability_criteria' is not nan + tb = tb[tb["notability_criteria"].notna()].reset_index(drop=True) + tb = tb.drop("notability_criteria", axis=1) + + # Convert relevant columns to string type + columns = ["system", "domain", "organization_categorization"] + tb[columns] = tb[columns].astype(str) + + def simplify_entry(entry): + """ + Simplifies an entry of organization categories which can include many entries of Industry, Academia, etc. + Removes duplicates, ensures all words except the first one start with a lower case letter, and joins the categories with ", " and " and " before the last one. + """ + # Check for "nan" + if entry == "nan": + return "Not specified" + + # Split the entry into categories, convert to set to remove duplicates + categories = sorted(set(entry.split(","))) + + # Make sure all words except the first one start with a lower case letter + categories = [categories[0]] + [category.lower() for category in categories[1:]] + + # Join the categories with ", " and " and " before the last one + if len(categories) > 1: + simplified_entry = ", ".join(categories[:-1]) + " and " + categories[-1] + " collaboration" + else: + simplified_entry = categories[0] + + return simplified_entry + + tb["organization_categorization"] = tb["organization_categorization"].apply(simplify_entry) + + # Get the unique values in the organization_categorization column and compare them to expected affiliations + unique_values = set(tb["organization_categorization"]) + expected_values = { + "Industry", + "Academia", + "Government", + "Academia and industry collaboration", + "Academia and research collective collaboration", + "Industry and research collective collaboration", + "Academia, industry and research collective collaboration", + "Government and industry collaboration", + "Research collective", + "Academia, government and industry collaboration", + "Academia and government collaboration", + "Academia, government, industry and research collective collaboration", + "Not specified", + } + assert unique_values == expected_values, "Unexpected affiliations in organization_categorization column" + + # Replace affiliation of researchers with fewer than 20 systems with 'Other' + affiliation_counts = tb["organization_categorization"].value_counts() + + tb["organization_categorization"] = tb["organization_categorization"].where( + tb["organization_categorization"].map(affiliation_counts) >= 20, "Other" + ) + # Get the organizations that were reclassified to 'Other' + reclassified_organizations = affiliation_counts[affiliation_counts < 20].index.tolist() + + paths.log.info( + f"Affiliations of researchers with less than 20 notable systems that were reclassified to 'Other': {', '.join(reclassified_organizations)}" + ) + + # Replace "nan" with "Not specified" in each column to avoid issues when calculating the sum of notable systems + columns = ["organization_categorization", "domain", "organization"] + tb[columns] = tb[columns].replace("nan", "Not specified") + + # Check for multiple entries in 'domain' separated by comma + multiple_domains = tb["domain"].str.contains(",") + # Replace entries in 'domain' that contain a comma with 'Multiple domains' + tb.loc[multiple_domains, "domain"] = "Multiple domains" + + # Replace domains with fewer than 20 systems with 'Other' + domain_counts = tb["domain"].value_counts() + + tb["domain"] = tb["domain"].where(tb["domain"].map(domain_counts) >= 20, "Other") + # Get the domains that were reclassified to 'Other' +
reclassified_domains = domain_counts[domain_counts < 20].index.tolist() + + paths.log.info( + f"Domains with less than 20 notable systems that were reclassified to 'Other': {', '.join(reclassified_domains)}" + ) + # Convert FLOP to petaFLOP (the raw FLOP column is dropped further below) + tb["training_computation_petaflop"] = tb["training_compute__flop"] / 1e15 + + # Convert publication date to datetime objects + tb["publication_date"] = pd.to_datetime(tb["publication_date"]) + + # Calculate 'days_since_1949' + tb["days_since_1949"] = (tb["publication_date"] - pd.to_datetime("1949-01-01")).dt.days.astype("Int64") + tb = tb.dropna(subset=["days_since_1949"]) + + tb = tb.reset_index(drop=True) + + assert not tb[["system", "days_since_1949"]].isnull().any().any(), "Index columns should not have NaN values" + + # Drop columns that are not needed + tb = tb.drop( + ["training_compute__flop", "organization", "authors", "country__from_organization"], + axis=1, + ) + tb = tb.format(["days_since_1949", "system"]) + + # Add metadata to the publication date column + tb["publication_date"].metadata.origins = tb["domain"].metadata.origins + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata) + + # Save changes in the new garden dataset. + ds_garden.save() + + paths.log.info("epoch.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation.meta.yml new file mode 100644 index 00000000000..1bf9422d84a --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation.meta.yml @@ -0,0 +1,35 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + desc_update: The 2024 data is incomplete and was last updated {date_accessed}. + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + description_short: Describes the sector where the authors of a notable AI system have their primary affiliations. {definitions.desc_update} + description_from_producer: |- + The distinction is documented in [Academia and Industry](https://docs.google.com/document/d/1wyJmDOWDEKItg0QhO5cpsNAgHq4aHOxQQZnTfzm34gI/edit). + Systems are categorized as “Industry” if their authors are affiliated with private sector organizations, “Academia” if the authors are affiliated with universities or academic institutions, or “Industry - Academia Collaboration” when at least 30% of the authors are from each.
+ Possible values: Industry, Research Collective, Academia, Industry - Academia Collaboration (Industry leaning), Industry - Academia Collaboration (Academia leaning), Non-profit + unit: 'AI systems' + short_unit: '' + display: + numDecimalPlaces: 0 + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 31 + title: Notable AI systems by researcher affiliation + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch_aggregates_affiliation: + variables: + yearly_count: + title: Annual number of AI systems by researcher affiliation + + cumulative_count: + title: Cumulative number of AI systems by researcher affiliation diff --git a/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation.py b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation.py new file mode 100644 index 00000000000..7bcbf76a4d8 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation.py @@ -0,0 +1,75 @@ +"""Generate aggregated table for total yearly and cumulative number of notable AI systems in each category of researcher affiliation.""" + +import datetime as dt + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch_aggregates_affiliation.start") + + # + # Load inputs. + # + # Load the garden dataset without aggregations. + ds_garden = paths.load_dataset("epoch") + + # Read table from garden dataset. + tb = ds_garden["epoch"] + tb = tb.reset_index() + + # + # Process data. + # + # Store the origins metadata for later use + origins = tb["organization_categorization"].metadata.origins + + # Define the columns that are not needed + unused_columns = [ + "days_since_1949", + "parameters", + "training_dataset_size__datapoints", + "domain", + "training_computation_petaflop", + ] + # Drop the unused columns + tb = tb.drop(unused_columns, axis=1) + + # Ensure 'publication_date' column type is datetime64 + assert tb["publication_date"].dtype == "datetime64[ns]", "publication_date column is not of type datetime64" + + # Extract the year from the 'publication_date' column + tb["year"] = tb["publication_date"].dt.year + + # Group by year and researcher affiliation and count the number of systems + tb_agg = tb.groupby(["year", "organization_categorization"], observed=False).size().reset_index(name="yearly_count") + + # Calculate the cumulative count + tb_agg["cumulative_count"] = tb_agg.groupby("organization_categorization", observed=False)["yearly_count"].cumsum() + + # Add the origins metadata to the columns + for col in ["yearly_count", "cumulative_count"]: + tb_agg[col].metadata.origins = origins + + # Set the short_name metadata of the table + tb_agg.metadata.short_name = paths.short_name + + # Set the index to year and researcher affiliation + tb_agg = tb_agg.format(["year", "organization_categorization"]) + + date_acessed = tb_agg.yearly_count.m.origins[0].date_accessed + + # + # Save outputs.
+ # + ds_garden = create_dataset( + dest_dir, + tables=[tb_agg], + yaml_params={"date_accessed": dt.datetime.strptime(date_acessed, "%Y-%m-%d").strftime("%d %B %Y")}, + ) + ds_garden.save() + + paths.log.info("epoch_aggregates_affiliation.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_aggregates_domain.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_aggregates_domain.meta.yml new file mode 100644 index 00000000000..b1cde5bb5d5 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_aggregates_domain.meta.yml @@ -0,0 +1,53 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + desc_update: The 2024 data is incomplete and was last updated {date_accessed}. + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + description_short: Describes the specific area, application, or field in which an AI system is designed to operate. An AI system can operate in more than one domain, thus contributing to the count for multiple domains. {definitions.desc_update} + description_key: + - Game systems are specifically designed for games and excel in understanding and strategizing gameplay. For instance, AlphaGo, developed by DeepMind, defeated the world champion in the game of Go. Such systems use complex algorithms to compete effectively, even against skilled human players. + + - Language systems are tailored to process language, focusing on understanding, translating, and interacting with human languages. Examples include chatbots, machine translation tools like Google Translate, and sentiment analysis algorithms that can detect emotions in text. + + - Multimodal systems are artificial intelligence frameworks that integrate and interpret more than one type of data input, such as text, images, and audio. ChatGPT-4 is an example of a multimodal system, as it has the capability to process and generate responses based on both textual and visual inputs. + + - Vision systems focus on processing visual information, playing a pivotal role in image recognition and related areas. For example, Facebook's photo tagging system uses vision AI to identify faces. + + - Speech systems are dedicated to handling spoken language, serving as the backbone of voice assistants and similar applications. They recognize, interpret, and generate spoken language to interact with users. + + - Recommendation systems offer suggestions based on user preferences, prominently seen in online shopping and media streaming. For instance, Netflix's movie suggestions or Amazon's product recommendations are powered by algorithms that analyze users' preferences and past behaviors. + + - Audio systems process and generate sound, with applications in music composition, signal processing, and sound recognition. + + - Biology systems analyze biological data and simulate biological processes, aiding in drug discovery and genetic research. + + - Image generation systems create visual content from text descriptions or other inputs, used in graphic design and content creation. + + - Robotics systems combine AI with mechanical engineering to create autonomous robots for various industries. + + - Video systems analyze and generate video content, aiding in editing, surveillance, and content creation. + description_processing: The count of notable AI systems per domain is derived by tallying the instances of machine learning models classified under each domain category. 
It's important to note that a single machine learning model can fall under multiple domains. The classification into domains is determined by the specific area, application, or field that the AI system is primarily designed to operate within. System domains with fewer than 10 systems are grouped under "Other." + description_from_producer: A foreign key field categorizing the system’s domain of machine learning. This field links to the [ML Domains table](https://airtable.com/appDFXXgaG1xLtXGL/shrhzolGiQCVnwOY5/tbleYEsZORsiYRVTM), and domains are selected from the options in that table. + unit: 'AI systems' + short_unit: '' + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 31 + title: Notable AI systems by domain type + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch_aggregates_domain: + variables: + yearly_count: + title: Annual number of AI systems by domain + + cumulative_count: + title: Cumulative number of AI systems by domain diff --git a/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_aggregates_domain.py b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_aggregates_domain.py new file mode 100644 index 00000000000..86f50241a6d --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_aggregates_domain.py @@ -0,0 +1,107 @@ +"""Generate aggregated table for total yearly and cumulative number of notable AI systems for each domain.""" + +import datetime as dt + +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch_aggregates_domain.start") + + # + # Load inputs. + # + # Load the ds_meadow dataset. + ds_meadow = paths.load_dataset("epoch") + + # Read table from meadow dataset. + tb = ds_meadow["epoch"] + tb = tb.reset_index() + + # + # Process data.
+ # + + # Store the origins metadata for later use + origins = tb["domain"].metadata.origins + + # Select the rows where the 'notability_criteria' column is not null (only consider notable systems) + tb = tb[tb["notability_criteria"].notna()].reset_index(drop=True) + + # Define the columns that are not needed + unused_columns = [ + "authors", + "country__from_organization", + "organization", + "organization_categorization", + "parameters", + "training_compute__flop", + "training_dataset_size__datapoints", + "notability_criteria", + ] + # Drop the unused columns + tb = tb.drop(unused_columns, axis=1) + + # Convert the 'publication_date' column to datetime format and extract the year + tb["publication_date"] = pd.to_datetime(tb["publication_date"]) + tb["year"] = tb["publication_date"].dt.year + + # Split the column to be aggregated by comma (several domains can exist in each cell) + tb["domain"] = tb["domain"].str.split(",") + + # Explode the table to create separate rows for each domain + tb_exploded = tb.explode("domain") + + # Drop duplicates where the year, system and domain are the same + tb_unique = tb_exploded.drop_duplicates(subset=["year", "system", "domain"]) + + # Replace domains with fewer than 10 systems with 'Other' + domain_counts = tb_unique["domain"].value_counts() + + tb_unique["domain"] = tb_unique["domain"].where(tb_unique["domain"].map(domain_counts) >= 10, "Other") + # Get the domains that were reclassified to 'Other' + reclassified_domains = domain_counts[domain_counts < 10].index.tolist() + + paths.log.info( + f"Domains with less than 10 notable systems that were reclassified to 'Other': {', '.join(reclassified_domains)}" + ) + # Convert the column to category type so that the missing values will be considered as 0 + tb_unique["domain"] = tb_unique["domain"].astype("category") + + # Group by year and domain and count the number of systems (consider all categories which will assume 0 for missing values) + tb_agg = tb_unique.groupby(["year", "domain"], observed=False).size().reset_index(name="yearly_count") + + # Calculate the cumulative count (consider all categories which will assume 0 for missing values) + tb_agg["cumulative_count"] = tb_agg.groupby("domain", observed=False)["yearly_count"].cumsum() + + # Add the origins metadata to the columns + for col in ["yearly_count", "cumulative_count"]: + tb_agg[col].metadata.origins = origins + + # Set the short_name metadata of the table + tb_agg.metadata.short_name = paths.short_name + # Set the index to year and domain + tb_agg = tb_agg.format(["year", "domain"]) + + date_acessed = tb_agg.yearly_count.m.origins[0].date_accessed + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=[tb_agg], + yaml_params={"date_accessed": dt.datetime.strptime(date_acessed, "%Y-%m-%d").strftime("%d %B %Y")}, + ) + + # Save changes in the new garden dataset.
+ ds_garden.save() + + paths.log.info("epoch_aggregates_domain.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive.meta.yml new file mode 100644 index 00000000000..1c00a1fb21f --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive.meta.yml @@ -0,0 +1,91 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + grapher_config: + note: Confirmed large-scale AI models are those where the training compute exceeds 10²³ floating-point operations (FLOP). +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 31 + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch_compute_intensive: + variables: + domain: + title: Domain + unit: '' + short_unit: '' + description_short: Refers to the specific area, application, or field in which an AI system is designed to operate. + display: + zeroDay: '1949-01-01' + yearIsDay: true + + + parameters: + title: Number of parameters + unit: '' + description_short: Total number of learnable variables or weights that the model contains. Parameters are adjusted during the training process to optimize the model's performance. + description_key: + - Parameters are internal variables that machine learning models adjust during their training process to improve their ability to make accurate predictions. They act as the model's "knobs" that are fine-tuned based on the provided data. In deep learning, a subset of artificial intelligence (AI), parameters primarily consist of the weights assigned to the connections between the small processing units called neurons. Picture a vast network of interconnected neurons where the strength of each connection represents a parameter. + + - The total number of parameters in a model is influenced by various factors. The model's structure and the number of “layers” of neurons play a significant role. Generally, more complex models with additional layers tend to have a higher number of parameters. Special components of specific deep learning architectures can further contribute to the overall parameter count. + + - Understanding the number of parameters in a model is crucial to design effective models. More parameters can help the model understand complex data patterns, potentially leading to higher accuracy. However, there's a fine balance to strike. If a model has too many parameters, it risks memorizing the specific examples in its training data rather than learning their underlying patterns. Consequently, it may perform poorly when presented with new, unseen data. Achieving the right balance of parameters is a critical consideration in model development. + + - In recent times, the AI community has witnessed the emergence of what are often referred to as "giant models." These models boast an astounding number of parameters, reaching into the billions or even trillions. While these huge models have achieved remarkable performance, they have a significant computational cost. Effectively managing and training such large-scale models has become a prominent and active area of research and discussion within the AI field. 
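+      # Editorial note (not from the producer): the display settings below encode time as days since
+      # 1949-01-01 (zeroDay, with yearIsDay: true), so each model is plotted at its publication date.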
+ + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + + training_dataset_size__datapoints: + title: Training dataset size + unit: 'datapoints' + description_short: The number of examples provided to train an AI model. Typically, more data results in a more comprehensive understanding by the model. + description_key: + - Training data size refers to the volume of data employed to train an artificial intelligence (AI) model effectively. It's a representation of the number of examples that the model learns from during its training process. It is a fundamental measure of the scope of the data used in the model's learning phase. + + - To grasp the concept of training data size, imagine teaching a friend the art of distinguishing different types of birds. In this analogy, each bird picture presented to your friend corresponds to an individual piece of training data. If you showed them 100 unique bird photos, then the training data size in this scenario would be quantified as 100. + + - Training data size is an essential indicator in AI and machine learning. First and foremost, it directly impacts the depth of learning achieved by the model. The more extensive the dataset, the more profound and comprehensive the model's understanding of the subject matter becomes. Additionally, a large training data size contributes significantly to improved recognition capabilities. By exposing the model to a diverse array of examples, it becomes adept at identifying subtle nuances, much like how it becomes skilled at distinguishing various bird species through exposure to a large variety of bird images. + + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + + training_computation_petaflop: + title: Training computation (petaFLOP) + unit: 'petaFLOP' + description_short: Computation is measured in total petaFLOP, which is 10¹⁵ [floating-point operations](#dod:flop) estimated from AI literature, albeit with some uncertainty. + description_key: + - In the context of artificial intelligence (AI), training computation is predominantly measured using floating-point operations or “FLOP”. One FLOP represents a single arithmetic operation involving floating-point numbers, such as addition, subtraction, multiplication, or division. To adapt to the vast computational demands of AI systems, the measurement unit of petaFLOP is commonly used. One petaFLOP stands as a staggering one quadrillion FLOPs, underscoring the magnitude of computational operations within AI. + + - Modern AI systems are rooted in machine learning and deep learning techniques. These methodologies are notorious for their computational intensity, involving complex mathematical processes and algorithms. During the training phase, AI models process large volumes of data, while continuously adapting and refining their parameters to optimize performance, rendering the training process computationally intensive. + + - Many factors influence the magnitude of training computation within AI systems. Notably, the size of the dataset employed for training significantly impacts the computational load. Larger datasets necessitate more processing power. The complexity of the model's architecture also plays a pivotal role; more intricate models lead to more computations. Parallel processing, involving the simultaneous use of multiple processors, also has a substantial effect. Beyond these factors, specific design choices and other variables further contribute to the complexity and scale of training computation within AI. 
+ + description_processing: Training computation was converted from its original measurement in FLOPs (floating-point operations) to a more manageable unit known as petaFLOPs. This conversion is performed by dividing the original training compute value by 1e15, which represents one quadrillion (10^15). The purpose of this conversion is to provide a more human-readable and practical representation of the immense computational efforts involved in training AI systems. By expressing the training computation in petaFLOPs, it becomes easier to grasp the scale and magnitude of the computational resources required for training these systems, especially when dealing with large datasets and complex architectures. + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + presentation: + grapher_config: + title: Training computation + + publication_date: + title: Publication date + unit: '' + description_short: The date when the AI system was first published. + description_from_producer: The publication, announcement, or release date of the model, in YYYY-MM-DD format. If the year and month are known but the day is unknown, the day is filled in as YYYY-MM-15. If the year is known but the month and day are unknown, the month and day are filled in as YYYY-07-01. + + + diff --git a/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive.py b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive.py new file mode 100644 index 00000000000..4eb3048784b --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive.py @@ -0,0 +1,60 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch_compute_intensive.start") + + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("epoch_compute_intensive") + + # Read table from meadow dataset. + tb = ds_meadow["epoch_compute_intensive"] + tb = tb.reset_index() + + # + # Process data. + # + # Convert FLOP to petaFLOP (the raw FLOP column is dropped below) + tb["training_computation_petaflop"] = tb["training_compute__flop"] / 1e15 + + # Convert publication date to datetime objects + tb["publication_date"] = pd.to_datetime(tb["publication_date"]) + + # Calculate 'days_since_1949' + tb["days_since_1949"] = (tb["publication_date"] - pd.to_datetime("1949-01-01")).dt.days.astype("Int64") + tb = tb.dropna(subset=["days_since_1949"]) + + tb = tb.reset_index(drop=True) + + assert not tb[["system", "days_since_1949"]].isnull().any().any(), "Index columns should not have NaN values" + + # Drop columns that are not needed + tb = tb.drop( + ["training_compute__flop", "organization", "authors", "country__from_organization"], + axis=1, + ) + tb = tb.format(["days_since_1949", "system"]) + + # Add metadata to the publication date column + tb["publication_date"].metadata.origins = tb["domain"].metadata.origins + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata) + + # Save changes in the new garden dataset.
+ ds_garden.save() + + paths.log.info("epoch_compute_intensive.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries.countries.json b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries.countries.json new file mode 100644 index 00000000000..ddfda66807a --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries.countries.json @@ -0,0 +1,18 @@ +{ + "Canada": "Canada", + "China": "China", + "Germany": "Germany", + "Israel": "Israel", + "Singapore": "Singapore", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United States of America": "United States", + "Korea (Republic of)": "South Korea", + "Multinational": "Multinational", + "Russia": "Russia", + "Japan": "Japan", + "France": "France", + "Finland": "Finland", + "Total": "Total", + "Hong Kong": "Hong Kong" +} \ No newline at end of file diff --git a/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries.meta.yml new file mode 100644 index 00000000000..3f97637a89b --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries.meta.yml @@ -0,0 +1,31 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + desc_update: The 2024 data is incomplete and was last updated {date_accessed}. + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + grapher_config: + note: Confirmed large-scale AI models are those where the training compute exceeds 10²³ floating-point operations (FLOP). + + unit: 'AI systems' + short_unit: '' + description_short: Refers to the location of the primary organization with which the authors of large-scale AI systems are affiliated. {definitions.desc_update} + description_processing: The number of large-scale AI systems by country is determined by tallying the number of machine learning models that are associated with the geographical location of the researchers' affiliated institutions. It's important to note that a single model can have multiple authors, each potentially affiliated with different institutions, thus contributing to the count for multiple countries.
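+# Illustrative example (not from the producer): a model whose authors are affiliated with organizations
+# in both the United States and the United Kingdom adds one system to each country's count, so
+# per-country counts can sum to more than the "Total" series.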
+# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ + +dataset: + update_period_days: 31 + title: Large-scale AI systems by country +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch_compute_intensive_countries: + variables: + yearly_count: + title: Annual number of large-scale AI systems by country + + cumulative_count: + title: Cumulative number of large-scale AI systems by country diff --git a/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries.py b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries.py new file mode 100644 index 00000000000..69bc951a631 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries.py @@ -0,0 +1,67 @@ +"""Generate aggregated table for total yearly and cumulative number of compute intensive AI systems in each country.""" + +import datetime as dt + +import shared as sh + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch_compute_intensive_countries.start") + + # + # Load inputs. + # + # Load the ds_meadow dataset. + ds_meadow = paths.load_dataset("epoch_compute_intensive") + + # Read table from meadow dataset. + tb = ds_meadow["epoch_compute_intensive"] + tb = tb.reset_index() + + # + # Process data. + # + # Define the columns that are not needed + unused_columns = [ + "domain", + "authors", + "organization", + "parameters", + "training_compute__flop", + "training_dataset_size__datapoints", + ] + + # Aggregate the data by country + tb_agg = sh.calculate_aggregates(tb, "country__from_organization", paths.short_name, unused_columns) + + # Rename the 'country__from_organization' column to 'country' + tb_agg = tb_agg.rename(columns={"country__from_organization": "country"}) + + # Harmonize the country names + tb_agg = geo.harmonize_countries(df=tb_agg, countries_file=paths.country_mapping_path) + + # Set the index to year and country + tb_agg = tb_agg.format(["year", "country"]) + + date_acessed = tb_agg.yearly_count.m.origins[0].date_accessed + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=[tb_agg], + yaml_params={"date_accessed": dt.datetime.strptime(date_acessed, "%Y-%m-%d").strftime("%d %B %Y")}, + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + paths.log.info("epoch_compute_intensive_countries.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain.meta.yml new file mode 100644 index 00000000000..5120b01e1e7 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain.meta.yml @@ -0,0 +1,48 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + desc_update: The 2024 data is incomplete and was last updated {date_accessed}. 
+ common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + grapher_config: + note: Confirmed large-scale AI models are those where the training compute exceeds 10²³ floating-point operations (FLOP). + description_short: Describes the specific area, application, or field in which a large-scale AI model is designed to operate. {definitions.desc_update} + description_key: + - Game systems are specifically designed for games and excel in understanding and strategizing gameplay. For instance, AlphaGo, developed by DeepMind, defeated the world champion in the game of Go. Such systems use complex algorithms to compete effectively, even against skilled human players. + + - Language systems are tailored to process language, focusing on understanding, translating, and interacting with human languages. Examples include chatbots, machine translation tools like Google Translate, and sentiment analysis algorithms that can detect emotions in text. + + - Multimodal systems are artificial intelligence frameworks that integrate and interpret more than one type of data input, such as text, images, and audio. ChatGPT-4 is an example of a multimodal system, as it has the capability to process and generate responses based on both textual and visual inputs. + + - Vision systems focus on processing visual information, playing a pivotal role in image recognition and related areas. For example, Facebook's photo tagging system uses vision AI to identify faces. + + - Speech systems are dedicated to handling spoken language, serving as the backbone of voice assistants and similar applications. They recognize, interpret, and generate spoken language to interact with users. + + - Biology systems analyze biological data and simulate biological processes, aiding in drug discovery and genetic research. + + - Image generation systems create visual content from text descriptions or other inputs, used in graphic design and content creation. + + description_processing: The count of large-scale AI models per domain is derived by tallying the instances of machine learning models classified under each domain category. It's important to note that a single machine learning model can fall under multiple domains. The classification into domains is determined by the specific area, application, or field that the AI system is primarily designed to operate within. + description_from_producer: A foreign key field categorizing the system’s domain of machine learning. This field links to the [ML Domains table](https://airtable.com/appDFXXgaG1xLtXGL/shrhzolGiQCVnwOY5/tbleYEsZORsiYRVTM), and domains are selected from the options in that table.
+ unit: 'AI systems' + short_unit: '' + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 31 + title: Large-scale AI systems by domain type + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch_compute_intensive_domain: + variables: + yearly_count: + title: Annual number of large-scale AI models by domain + + cumulative_count: + title: Cumulative number of large-scale AI models by domain diff --git a/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain.py b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain.py new file mode 100644 index 00000000000..e832677a43d --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain.py @@ -0,0 +1,60 @@ +"""Generate aggregated table for total yearly and cumulative number of compute intensive AI systems for each domain.""" + +import datetime as dt + +import shared as sh + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch_compute_intensive_domain.start") + + # + # Load inputs. + # + # Load the ds_meadow dataset. + ds_meadow = paths.load_dataset("epoch_compute_intensive") + + # Read table from meadow dataset. + tb = ds_meadow["epoch_compute_intensive"] + tb = tb.reset_index() + + # + # Process data. + # + # Define the columns that are not needed + unused_columns = [ + "authors", + "country__from_organization", + "organization", + "parameters", + "training_compute__flop", + "training_dataset_size__datapoints", + ] + + # Aggregate the data by domain + tb_agg = sh.calculate_aggregates(tb, "domain", paths.short_name, unused_columns) + + # Set the index to year and domain + tb_agg = tb_agg.format(["year", "domain"]) + + date_acessed = tb_agg.yearly_count.m.origins[0].date_accessed + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=[tb_agg], + yaml_params={"date_accessed": dt.datetime.strptime(date_acessed, "%Y-%m-%d").strftime("%d %B %Y")}, + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + paths.log.info("epoch_compute_intensive_domain.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_regressions.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_regressions.meta.yml new file mode 100644 index 00000000000..8bffd4fdf09 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_regressions.meta.yml @@ -0,0 +1,12 @@ + +definitions: + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + description_processing: |- + We performed a regression analysis, fitting exponential models to the data for both the pre-deep learning (before 2010) and deep learning eras (after 2010), using the code provided by researchers from Epoch. 
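+# Method note (it matches epoch_regressions.py below): the fit is linear in log space,
+# log10(y) = a + b*t with t in fractional years, so the implied growth factor per year is 10^b.
+# For example, a slope of b = 0.6 corresponds to roughly a 4x increase per year.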
+dataset: + title: Parameter, Compute and Data Trends in Machine Learning - Regressions + diff --git a/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_regressions.py b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_regressions.py new file mode 100644 index 00000000000..6d30589f245 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-11-03/epoch_regressions.py @@ -0,0 +1,145 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import numpy as np +import owid.catalog.processing as pr +import pandas as pd +from owid.catalog import Table +from sklearn.linear_model import LinearRegression + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) +# Constants for defining the time periods +DL_ERA_START = 2010 +START_DATE = 1950 +END_DATE = 2025.2 + + +def run(dest_dir: str) -> None: + paths.log.info("epoch.start") + + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("epoch") + + # Read table from meadow dataset. + tb = ds_meadow["epoch"].reset_index() + + # Run regression analysis and concatenate results + tb_trend = run_regression(tb) + tb = tb.drop("frac_year", axis=1) + tb = pr.concat([tb_trend, tb]) + + # Format the table + tb = tb.format(["days_since_1949", "system"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata) + + # Save changes in the new garden dataset. + ds_garden.save() + + paths.log.info("epoch.end") + + +def fit_exponential(models, metric): + """Fit an exponential model to the given metric data. Code provided by Epoch AI team.""" + x = models["frac_year"].values.reshape(-1, 1) + y = models[metric] + + # Filter out non-positive values + positive_mask = y > 0 + x = x[positive_mask] + y = y[positive_mask] + + # Apply log10 transformation + y = np.log10(y) + + # Filter out infinite and extremely large values + finite_mask = np.isfinite(y) & (y < np.finfo(np.float32).max) + x = x[finite_mask] + y = y[finite_mask] + + # Fit linear regression model + reg = LinearRegression().fit(x, y) + return reg.intercept_, reg.coef_[0] + + +def run_regression(tb): + """Run regression analysis on the given table and return the updated table.""" + # Add fractional year for sorting and processing + publication_dates = tb["publication_date"] + tb.loc[:, "frac_year"] = ( + publication_dates.dt.year + (publication_dates.dt.month - 1) / 12 + (publication_dates.dt.day - 1) / 365 + ) + tb = tb.sort_values(by="frac_year") + + # Define periods dynamically + periods = { + f"{START_DATE}–{DL_ERA_START}": (tb["frac_year"] < DL_ERA_START), + f"{DL_ERA_START}–{int(END_DATE)}": ((tb["frac_year"] >= DL_ERA_START) & (tb["frac_year"] < END_DATE)), + } + # Define year grids dynamically + year_grids = { + f"{START_DATE}–{DL_ERA_START}": np.array([START_DATE, DL_ERA_START]), + f"{DL_ERA_START}–{int(END_DATE)}": np.array([DL_ERA_START, END_DATE]), + } + + metrics = ["training_computation_petaflop", "parameters", "training_dataset_size__datapoints"] + new_tables = [] + + for metric in metrics: + # Filter out models without the metric information + tb_metric = tb[pd.notnull(tb[metric])] + dfs = [] + + for period_name, condition in periods.items(): + # Subset data for the current period + period_data = tb_metric[condition] + + # Fit exponential model + fit = fit_exponential(period_data, metric) + oom_per_year = 
fit[1] + info = f"{10**oom_per_year:.1f}x/year" + + # Log the results + paths.log.info(f"{period_name} ({metric}): {info}") + + # Calculate the regression line for the current period + year_grid = year_grids[period_name] + line = 10 ** (fit[0] + year_grid * fit[1]) + + # Create DataFrame for the current period + df = pd.DataFrame( + { + "days_since_1949": [ + period_data["days_since_1949"].min(), + period_data["days_since_1949"].max(), + ], + f"{metric}": [line[0], line[-1]], + "system": [f"{info} between {period_name}"] * 2, + } + ) + dfs.append(df) + + # Combine the DataFrames for all periods for the current metric + df_combined = pd.concat(dfs, ignore_index=True) + new_tables.append(df_combined) + + # Merge all the new DataFrames + tb_new = new_tables[0] + for tb_m in new_tables[1:]: + tb_new = pd.merge(tb_new, tb_m, on=["system", "days_since_1949"], how="outer") + + # Convert to OWID Table and add metadata + tb_new = Table(tb_new, short_name=paths.short_name) + for column in tb_new.columns: + tb_new[column].metadata.origins = tb["publication_date"].metadata.origins + + return tb_new diff --git a/etl/steps/data/garden/artificial_intelligence/2024-11-03/shared.py b/etl/steps/data/garden/artificial_intelligence/2024-11-03/shared.py new file mode 100644 index 00000000000..f9ac6876d20 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-11-03/shared.py @@ -0,0 +1,74 @@ +from typing import List + +import owid.catalog.processing as pr +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def calculate_aggregates(tb: Table, agg_column: str, short_name: str, unused_columns: List[str]) -> Table: + """ + This function calculates aggregates for a given column in a Table. It is used to calculate the total yearly and cumulative number of notable AI systems for each domain or country. + + Parameters: + tb (Table): The input Table. + agg_column (str): The column to aggregate on. + short_name (str): The short name to set for the table. + unused_columns (List[str]): The list of columns to drop from the table. + + Returns: + Table: The output Table with calculated aggregates. 
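+
+    Example (illustrative values): a system published in 2021 with domain "Language,Vision"
+    contributes one count to "Language", one to "Vision", and one to the "Total" row for 2021.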
+ """ + + # Store the origins metadata for later use + origins = tb[agg_column].metadata.origins + + # Drop the unused columns + tb = tb.drop(unused_columns, axis=1) + + # Convert the 'publication_date' column to datetime format and extract the year + tb["publication_date"] = pd.to_datetime(tb["publication_date"]) + tb["year"] = tb["publication_date"].dt.year + + # Convert the column to category type so that the missing values will be considered as 0 + tb[agg_column] = tb[agg_column].astype("category") + + # Group total yearly counts and calculate cumulative count for total number of systems + tb_total = tb.groupby(["year"]).size().reset_index(name="yearly_count") + total_counts = tb_total.groupby("year")["yearly_count"].sum().reset_index() + total_counts[agg_column] = "Total" + total_counts["cumulative_count"] = total_counts["yearly_count"].cumsum() + + # Split the column to be aggregated by comma (several countries/domains can exist in each cell) + tb[agg_column] = tb[agg_column].str.split(",") + + # Explode the table to create separate rows for each country or domain + tb_exploded = tb.explode(agg_column) + + # Convert the column to category type so that the missing values will be considered as 0 + tb_exploded[agg_column] = tb_exploded[agg_column].astype("category") + + # Drop duplicates where the year, system and country/domain are the same + tb_unique = tb_exploded.drop_duplicates(subset=["year", "system", agg_column]) + + # Group by year and country/domain and count the number of systems (consider all categories which will assume 0 for missing values) + tb_agg = tb_unique.groupby(["year", agg_column], observed=False).size().reset_index(name="yearly_count") + + # Calculate the cumulative count (consider all categories which will assume 0 for missing values) + tb_agg["cumulative_count"] = tb_agg.groupby(agg_column, observed=False)["yearly_count"].cumsum() + + # Combine aggregated data with total counts + tb_agg = pr.concat([tb_agg, total_counts], ignore_index=True) + + # Add the origins metadata to the columns + for col in ["yearly_count", "cumulative_count"]: + tb_agg[col].metadata.origins = origins + + # Set the short_name metadata of the table + tb_agg.metadata.short_name = short_name + + return tb_agg diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch.meta.yml new file mode 100644 index 00000000000..c4764e0418e --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch.meta.yml @@ -0,0 +1,98 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 31 + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch: + variables: + domain: + title: Domain + unit: '' + short_unit: '' + description_short: Refers to the specific area, application, or field in which an AI model is designed to operate. + description_processing: |- + In cases where multiple domains were associated with a model, we consolidated these entries under the label "Multiple domains". We also identified domains associated with fewer than 20 notable systems and grouped these under the category 'Other'. 
+ display: + zeroDay: '1949-01-01' + yearIsDay: true + + organization_categorization: + title: Researcher affiliation + unit: '' + short_unit: '' + description_short: Describes the sector where the authors of an AI model have their primary affiliations. + description_from_producer: |- + Systems are categorized as “Industry” if their authors are affiliated with private sector organizations, “Academia” if the authors are affiliated with universities or academic institutions, or “Industry - Academia Collaboration” when at least 30% of the authors are from each. + + parameters: + title: Number of parameters + unit: '' + description_short: Total number of learnable variables or weights that the model contains. Parameters are adjusted during the training process to optimize the model's performance. + description_key: + - Parameters are internal variables that machine learning models adjust during their training process to improve their ability to make accurate predictions. They act as the model's "knobs" that are fine-tuned based on the provided data. In deep learning, a subset of artificial intelligence (AI), parameters primarily consist of the weights assigned to the connections between the small processing units called neurons. Picture a vast network of interconnected neurons where the strength of each connection represents a parameter. + + - The total number of parameters in a model is influenced by various factors. The model's structure and the number of “layers” of neurons play a significant role. Generally, more complex models with additional layers tend to have a higher number of parameters. Special components of specific deep learning architectures can further contribute to the overall parameter count. + + - Understanding the number of parameters in a model is crucial to design effective models. More parameters can help the model understand complex data patterns, potentially leading to higher accuracy. However, there's a fine balance to strike. If a model has too many parameters, it risks memorizing the specific examples in its training data rather than learning their underlying patterns. Consequently, it may perform poorly when presented with new, unseen data. Achieving the right balance of parameters is a critical consideration in model development. + + - In recent times, the AI community has witnessed the emergence of what are often referred to as "giant models." These models boast an astounding number of parameters, reaching into the billions or even trillions. While these huge models have achieved remarkable performance, they have a significant computational cost. Effectively managing and training such large-scale models has become a prominent and active area of research and discussion within the AI field. + + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + + training_dataset_size__datapoints: + title: Training dataset size + unit: 'datapoints' + description_short: The number of examples provided to train an AI model. Typically, more data results in a more comprehensive understanding by the model. + description_key: + - Training data size refers to the volume of data employed to train an artificial intelligence (AI) model effectively. It's a representation of the number of examples that the model learns from during its training process. It is a fundamental measure of the scope of the data used in the model's learning phase. + + - To grasp the concept of training data size, imagine teaching a friend the art of distinguishing different types of birds. 
In this analogy, each bird picture presented to your friend corresponds to an individual piece of training data. If you showed them 100 unique bird photos, then the training data size in this scenario would be quantified as 100. + + - Training data size is an essential indicator in AI and machine learning. First and foremost, it directly impacts the depth of learning achieved by the model. The more extensive the dataset, the more profound and comprehensive the model's understanding of the subject matter becomes. Additionally, a large training data size contributes significantly to improved recognition capabilities. By exposing the model to a diverse array of examples, it becomes adept at identifying subtle nuances, much like how it becomes skilled at distinguishing various bird species through exposure to a large variety of bird images. + + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + + training_computation_petaflop: + title: Training computation (petaFLOP) + unit: 'petaFLOP' + description_short: Computation is measured in total petaFLOP, which is 10¹⁵ [floating-point operations](#dod:flop) estimated from AI literature, albeit with some uncertainty. + description_key: + - In the context of artificial intelligence (AI), training computation is predominantly measured using floating-point operations or “FLOP”. One FLOP represents a single arithmetic operation involving floating-point numbers, such as addition, subtraction, multiplication, or division. To adapt to the vast computational demands of AI systems, the measurement unit of petaFLOP is commonly used. One petaFLOP stands as a staggering one quadrillion FLOPs, underscoring the magnitude of computational operations within AI. + + - Modern AI systems are rooted in machine learning and deep learning techniques. These methodologies are notorious for their computational intensity, involving complex mathematical processes and algorithms. During the training phase, AI models process large volumes of data, while continuously adapting and refining their parameters to optimize performance, rendering the training process computationally intensive. + + - Many factors influence the magnitude of training computation within AI systems. Notably, the size of the dataset employed for training significantly impacts the computational load. Larger datasets necessitate more processing power. The complexity of the model's architecture also plays a pivotal role; more intricate models lead to more computations. Parallel processing, involving the simultaneous use of multiple processors, also has a substantial effect. Beyond these factors, specific design choices and other variables further contribute to the complexity and scale of training computation within AI. + + description_processing: Training computation was converted from its original measurement in FLOPs (floating-point operations) to a more manageable unit known as petaFLOPs. This conversion is performed by dividing the original training compute value by 1e15, which represents one quadrillion (10^15). The purpose of this conversion is to provide a more human-readable and practical representation of the immense computational efforts involved in training AI systems. By expressing the training computation in petaFLOPs, it becomes easier to grasp the scale and magnitude of the computational resources required for training these systems, especially when dealing with large datasets and complex architectures. 
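+      # Worked example (hypothetical value): a training run of 3.1e24 FLOP is stored as
+      # 3.1e24 / 1e15 = 3.1e9 petaFLOP.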
+        display:
+          numDecimalPlaces: 0
+          zeroDay: '1949-01-01'
+          yearIsDay: true
+        presentation:
+          grapher_config:
+            title: Training computation
+
+      publication_date:
+        title: Publication date
+        unit: ''
+        description_short: The date when the AI model was first published.
+        description_from_producer: The publication, announcement, or release date of the model, in YYYY-MM-DD format. If the year and month are known but the day is unknown, the day is filled in as YYYY-MM-15. If the year is known but the month and day are unknown, the month and day are filled in as YYYY-07-01.
+
+
+
diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch.py b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch.py
new file mode 100644
index 00000000000..1f489c23c58
--- /dev/null
+++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch.py
@@ -0,0 +1,144 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+import pandas as pd
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    paths.log.info("epoch.start")
+
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("epoch")
+
+    # Read table from meadow dataset.
+    tb = ds_meadow["epoch"]
+    tb = tb.reset_index()
+
+    #
+    # Process data.
+    #
+    # Filter notable systems by selecting rows where 'notability_criteria' is not NaN.
+    tb = tb[tb["notability_criteria"].notna()].reset_index(drop=True)
+    tb = tb.drop("notability_criteria", axis=1)
+
+    # Convert relevant columns to string type.
+    columns = ["model", "domain", "organization_categorization"]
+    tb[columns] = tb[columns].astype(str)
+
+    def simplify_entry(entry):
+        """
+        Simplify an entry of organization categories, which can include many entries of Industry, Academia, etc.
+        Removes duplicates, ensures all words except the first one start with a lower-case letter, and joins the categories with ", " and " and " before the last one.
+ """ + # Check for "nan" + if entry == "nan": + return "Not specified" + + # Split the entry into categories, convert to set to remove duplicates + categories = sorted(set(entry.split(","))) + + # Make sure all words except the first one start with a lower case letter + categories = [categories[0]] + [category.lower() for category in categories[1:]] + + # Join the categories with ", " and " and " before the last one + if len(categories) > 1: + simplified_entry = ", ".join(categories[:-1]) + " and " + categories[-1] + " collaboration" + else: + simplified_entry = categories[0] + + return simplified_entry + + tb["organization_categorization"] = tb["organization_categorization"].apply(simplify_entry) + + # Get the unique values in the organization_categorization column and compare them to expected affiliations + unique_values = set(tb["organization_categorization"]) + expected_values = { + "Industry", + "Academia", + "Government", + "Academia and industry collaboration", + "Academia and research collective collaboration", + "Industry and research collective collaboration", + "Academia, industry and research collective collaboration", + "Government and industry collaboration", + "Research collective", + "Academia, government and industry collaboration", + "Academia and government collaboration", + "Academia, government, industry and research collective collaboration", + "Not specified", + } + assert unique_values == expected_values, "Unexpected affiliations in organization_categorization column" + + # Replace affiliation of researchers with less than 20 systems with 'Other' + affiliation_counts = tb["organization_categorization"].value_counts() + + tb["organization_categorization"] = tb["organization_categorization"].where( + tb["organization_categorization"].map(affiliation_counts) >= 20, "Other" + ) + # Get the organizations that were reclassified to 'Other' + reclassified_organizations = affiliation_counts[affiliation_counts < 20].index.tolist() + + paths.log.info( + f"Affiliations of researchers with less than 20 notable systems that were reclassified to 'Other': {', '.join(reclassified_organizations)}" + ) + + # Replace nans with Unspecified in each column to avoid issues when calculating sume of notable systems + columns = ["organization_categorization", "domain", "organization"] + tb[columns] = tb[columns].replace("nan", "Not specified") + + # Check for multiple entries in 'domain' separated by comma + multiple_domains = tb["domain"].str.contains(",") + # Replace entries in 'domain' that contain a comma with 'Multiple Domains' + tb.loc[multiple_domains, "domain"] = "Multiple domains" + + # Replace domains with less than 20 systems with 'Other' + domain_counts = tb["domain"].value_counts() + + tb["domain"] = tb["domain"].where(tb["domain"].map(domain_counts) >= 20, "Other") + # Get the domains that were reclassified to 'Other' + reclassified_domains = domain_counts[domain_counts < 20].index.tolist() + + paths.log.info( + f"Domains with less than 20 notable systems that were reclassified to 'Other': {', '.join(reclassified_domains)}" + ) + # Convert FLOP to petaFLOP and remove the column with FLOPs (along with training time in hours) + tb["training_computation_petaflop"] = tb["training_compute__flop"] / 1e15 + + # Convert publication date to a datetime objects + tb["publication_date"] = pd.to_datetime(tb["publication_date"]) + + # Calculate 'days_since_1949' + tb["days_since_1949"] = (tb["publication_date"] - pd.to_datetime("1949-01-01")).dt.days.astype("Int64") + tb = 
+
+    tb = tb.reset_index(drop=True)
+
+    assert not tb[["model", "days_since_1949"]].isnull().any().any(), "Index columns should not have NaN values"
+
+    # Drop columns that are not needed
+    tb = tb.drop(
+        ["training_compute__flop", "organization", "authors", "country__from_organization"],
+        axis=1,
+    )
+    tb = tb.format(["days_since_1949", "model"])
+
+    # Add metadata to the publication date column
+    tb["publication_date"].metadata.origins = tb["domain"].metadata.origins
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
+
+    paths.log.info("epoch.end")
diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.meta.yml
new file mode 100644
index 00000000000..1bf9422d84a
--- /dev/null
+++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.meta.yml
@@ -0,0 +1,35 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  desc_update: The 2024 data is incomplete and was last updated {date_accessed}.
+  common:
+    processing_level: major
+    presentation:
+      topic_tags:
+        - Artificial Intelligence
+    description_short: Describes the sector where the authors of a notable AI system have their primary affiliations. {definitions.desc_update}
+    description_from_producer: |-
+      The distinction is documented in [Academia and Industry](https://docs.google.com/document/d/1wyJmDOWDEKItg0QhO5cpsNAgHq4aHOxQQZnTfzm34gI/edit).
+      Systems are categorized as “Industry” if their authors are affiliated with private sector organizations, “Academia” if the authors are affiliated with universities or academic institutions, or “Industry - Academia Collaboration” when at least 30% of the authors are from each.
+      Possible values: Industry, Research Collective, Academia, Industry - Academia Collaboration (Industry leaning), Industry - Academia Collaboration (Academia leaning), Non-profit
+    unit: 'AI systems'
+    short_unit: ''
+    display:
+      numDecimalPlaces: 0
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/
+dataset:
+  update_period_days: 31
+  title: Notable AI systems by researcher affiliation
+
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/
+tables:
+  epoch_aggregates_affiliation:
+    variables:
+      yearly_count:
+        title: Annual number of AI systems by researcher affiliation
+
+      cumulative_count:
+        title: Cumulative number of AI systems by researcher affiliation
diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.py b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.py
new file mode 100644
index 00000000000..7bcbf76a4d8
--- /dev/null
+++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.py
@@ -0,0 +1,75 @@
+"""Generate aggregated table for total yearly and cumulative number of notable AI systems in each category of researcher affiliation."""
+
+import datetime as dt
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    paths.log.info("epoch_aggregates_affiliation.start")
+
+    #
+    # Load inputs.
+    #
+    # Load the garden dataset without aggregations.
+    ds_garden = paths.load_dataset("epoch")
+
+    # Read table from garden dataset.
+    tb = ds_garden["epoch"]
+    tb = tb.reset_index()
+
+    #
+    # Process data.
+    #
+    # Store the origins metadata for later use
+    origins = tb["organization_categorization"].metadata.origins
+
+    # Define the columns that are not needed
+    unused_columns = [
+        "days_since_1949",
+        "parameters",
+        "training_dataset_size__datapoints",
+        "domain",
+        "training_computation_petaflop",
+    ]
+    # Drop the unused columns
+    tb = tb.drop(unused_columns, axis=1)
+
+    # Ensure 'publication_date' column type is datetime64
+    assert tb["publication_date"].dtype == "datetime64[ns]", "publication_date column is not of type datetime64"
+
+    # Extract the year from the 'publication_date' column
+    tb["year"] = tb["publication_date"].dt.year
+
+    # Group by year and affiliation and count the number of systems
+    tb_agg = tb.groupby(["year", "organization_categorization"], observed=False).size().reset_index(name="yearly_count")
+
+    # Calculate the cumulative count
+    tb_agg["cumulative_count"] = tb_agg.groupby("organization_categorization", observed=False)["yearly_count"].cumsum()
+
+    # Add the origins metadata to the columns
+    for col in ["yearly_count", "cumulative_count"]:
+        tb_agg[col].metadata.origins = origins
+
+    # Set the short_name metadata of the table
+    tb_agg.metadata.short_name = paths.short_name
+
+    # Set the index to year and affiliation
+    tb_agg = tb_agg.format(["year", "organization_categorization"])
+
+    date_accessed = tb_agg.yearly_count.m.origins[0].date_accessed
+
+    #
+    # Save outputs.
+    #
+    ds_garden = create_dataset(
+        dest_dir,
+        tables=[tb_agg],
+        yaml_params={"date_accessed": dt.datetime.strptime(date_accessed, "%Y-%m-%d").strftime("%d %B %Y")},
+    )
+    ds_garden.save()
+
+    paths.log.info("epoch_aggregates_affiliation.end")
diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_domain.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_domain.meta.yml
new file mode 100644
index 00000000000..b1cde5bb5d5
--- /dev/null
+++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_domain.meta.yml
@@ -0,0 +1,53 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  desc_update: The 2024 data is incomplete and was last updated {date_accessed}.
+  common:
+    processing_level: major
+    presentation:
+      topic_tags:
+        - Artificial Intelligence
+    description_short: Describes the specific area, application, or field in which an AI system is designed to operate. An AI system can operate in more than one domain, thus contributing to the count for multiple domains. {definitions.desc_update}
+    description_key:
+      - Game systems are specifically designed for games and excel in understanding and strategizing gameplay. For instance, AlphaGo, developed by DeepMind, defeated the world champion in the game of Go. Such systems use complex algorithms to compete effectively, even against skilled human players.
+
+      - Language systems are tailored to process language, focusing on understanding, translating, and interacting with human languages. Examples include chatbots, machine translation tools like Google Translate, and sentiment analysis algorithms that can detect emotions in text.
+
+      - Multimodal systems are artificial intelligence frameworks that integrate and interpret more than one type of data input, such as text, images, and audio. ChatGPT-4 is an example of a multimodal system, as it has the capability to process and generate responses based on both textual and visual inputs.
+
+      - Vision systems focus on processing visual information, playing a pivotal role in image recognition and related areas. For example, Facebook's photo tagging system uses vision AI to identify faces.
+
+      - Speech systems are dedicated to handling spoken language, serving as the backbone of voice assistants and similar applications. They recognize, interpret, and generate spoken language to interact with users.
+
+      - Recommendation systems offer suggestions based on user preferences, prominently seen in online shopping and media streaming. For instance, Netflix's movie suggestions or Amazon's product recommendations are powered by algorithms that analyze users' preferences and past behaviors.
+
+      - Audio systems process and generate sound, with applications in music composition, signal processing, and sound recognition.
+
+      - Biology systems analyze biological data and simulate biological processes, aiding in drug discovery and genetic research.
+
+      - Image generation systems create visual content from text descriptions or other inputs, used in graphic design and content creation.
+
+      - Robotics systems combine AI with mechanical engineering to create autonomous robots for various industries.
+
+      - Video systems analyze and generate video content, aiding in editing, surveillance, and content creation.
+    description_processing: The count of notable AI systems per domain is derived by tallying the instances of machine learning models classified under each domain category. It's important to note that a single machine learning model can fall under multiple domains. The classification into domains is determined by the specific area, application, or field that the AI system is primarily designed to operate within. System domains with fewer than 10 systems are grouped under "Other."
+    description_from_producer: A foreign key field categorizing the system’s domain of machine learning. This field links to the [ML Domains table](https://airtable.com/appDFXXgaG1xLtXGL/shrhzolGiQCVnwOY5/tbleYEsZORsiYRVTM), and domains are selected from the options in that table.
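The tallying described in `description_processing` above boils down to splitting comma-separated domain labels, de-duplicating per model, and counting per year, which the step below implements. A minimal pandas sketch with invented rows (not Epoch data):

```python
import pandas as pd

# Invented example rows; a model can list several domains in one cell.
tb = pd.DataFrame(
    {
        "model": ["A", "B", "C"],
        "year": [2020, 2021, 2021],
        "domain": ["Language", "Language,Vision", "Vision"],
    }
)

# One row per (model, domain) pair, so multi-domain models count once in each domain.
exploded = tb.assign(domain=tb["domain"].str.split(",")).explode("domain")
exploded = exploded.drop_duplicates(subset=["year", "model", "domain"])

yearly = exploded.groupby(["year", "domain"]).size().reset_index(name="yearly_count")
yearly["cumulative_count"] = yearly.groupby("domain")["yearly_count"].cumsum()
```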
+    unit: 'AI systems'
+    short_unit: ''
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/
+dataset:
+  update_period_days: 31
+  title: Notable AI systems by domain type
+
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/
+tables:
+  epoch_aggregates_domain:
+    variables:
+      yearly_count:
+        title: Annual number of AI systems by domain
+
+      cumulative_count:
+        title: Cumulative number of AI systems by domain
diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_domain.py b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_domain.py
new file mode 100644
index 00000000000..2a4e84e2673
--- /dev/null
+++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_domain.py
@@ -0,0 +1,107 @@
+"""Generate aggregated table for total yearly and cumulative number of notable AI systems for each domain."""
+
+import datetime as dt
+
+import pandas as pd
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    paths.log.info("epoch_aggregates_domain.start")
+
+    #
+    # Load inputs.
+    #
+    # Load the meadow dataset.
+    ds_meadow = paths.load_dataset("epoch")
+
+    # Read table from meadow dataset.
+    tb = ds_meadow["epoch"]
+    tb = tb.reset_index()
+
+    #
+    # Process data.
+    #
+
+    # Store the origins metadata for later use
+    origins = tb["domain"].metadata.origins
+
+    # Select the rows where the 'notability_criteria' column is not null (only consider notable systems)
+    tb = tb[tb["notability_criteria"].notna()].reset_index(drop=True)
+
+    # Define the columns that are not needed
+    unused_columns = [
+        "authors",
+        "country__from_organization",
+        "organization",
+        "organization_categorization",
+        "parameters",
+        "training_compute__flop",
+        "training_dataset_size__datapoints",
+        "notability_criteria",
+    ]
+    # Drop the unused columns
+    tb = tb.drop(unused_columns, axis=1)
+
+    # Convert the 'publication_date' column to datetime format and extract the year
+    tb["publication_date"] = pd.to_datetime(tb["publication_date"])
+    tb["year"] = tb["publication_date"].dt.year
+
+    # Split the column to be aggregated by comma (several domains can exist in each cell)
+    tb["domain"] = tb["domain"].str.split(",")
+
+    # Explode the table to create separate rows for each domain
+    tb_exploded = tb.explode("domain")
+
+    # Drop duplicates where the year, model and domain are the same
+    tb_unique = tb_exploded.drop_duplicates(subset=["year", "model", "domain"])
+
+    # Replace domains with fewer than 10 systems with 'Other'
+    domain_counts = tb_unique["domain"].value_counts()
+
+    tb_unique["domain"] = tb_unique["domain"].where(tb_unique["domain"].map(domain_counts) >= 10, "Other")
+    # Get the domains that were reclassified to 'Other'
+    reclassified_domains = domain_counts[domain_counts < 10].index.tolist()
+    domain_counts = tb_unique["domain"].value_counts()
+
+    paths.log.info(
+        f"Domains with fewer than 10 notable systems that were reclassified to 'Other': {', '.join(reclassified_domains)}"
+    )
+    # Convert the column to category type so that the missing values will be considered as 0
+    tb_unique["domain"] = tb_unique["domain"].astype("category")
+
+    # Group by year and domain and count the number of systems (consider all categories, which will assume 0 for missing values)
+    tb_agg = tb_unique.groupby(["year", "domain"], observed=False).size().reset_index(name="yearly_count")
+
+    # Calculate the cumulative count (consider all categories, which will assume 0 for missing values)
+    tb_agg["cumulative_count"] = tb_agg.groupby("domain", observed=False)["yearly_count"].cumsum()
+
+    # Add the origins metadata to the columns
+    for col in ["yearly_count", "cumulative_count"]:
+        tb_agg[col].metadata.origins = origins
+
+    # Set the short_name metadata of the table
+    tb_agg.metadata.short_name = paths.short_name
+    # Set the index to year and domain
+    tb_agg = tb_agg.format(["year", "domain"])
+
+    date_accessed = tb_agg.yearly_count.m.origins[0].date_accessed
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir,
+        tables=[tb_agg],
+        yaml_params={"date_accessed": dt.datetime.strptime(date_accessed, "%Y-%m-%d").strftime("%d %B %Y")},
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
+
+    paths.log.info("epoch_aggregates_domain.end")
diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive.meta.yml
new file mode 100644
index 00000000000..5f95f506c67
--- /dev/null
+++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive.meta.yml
@@ -0,0 +1,91 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  common:
+    processing_level: major
+    presentation:
+      topic_tags:
+        - Artificial Intelligence
+      grapher_config:
+        note: Confirmed large-scale AI models are those where the training compute exceeds 10²³ floating-point operations (FLOP).
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/
+dataset:
+  update_period_days: 31
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/
+tables:
+  epoch_compute_intensive:
+    variables:
+      domain:
+        title: Domain
+        unit: ''
+        short_unit: ''
+        description_short: Refers to the specific area, application, or field in which an AI model is designed to operate.
+        display:
+          zeroDay: '1949-01-01'
+          yearIsDay: true
+
+
+      parameters:
+        title: Number of parameters
+        unit: ''
+        description_short: Total number of learnable variables or weights that the model contains. Parameters are adjusted during the training process to optimize the model's performance.
+        description_key:
+          - Parameters are internal variables that machine learning models adjust during their training process to improve their ability to make accurate predictions. They act as the model's "knobs" that are fine-tuned based on the provided data. In deep learning, a subset of artificial intelligence (AI), parameters primarily consist of the weights assigned to the connections between the small processing units called neurons. Picture a vast network of interconnected neurons where the strength of each connection represents a parameter.
+
+          - The total number of parameters in a model is influenced by various factors. The model's structure and the number of “layers” of neurons play a significant role. Generally, more complex models with additional layers tend to have a higher number of parameters. Special components of specific deep learning architectures can further contribute to the overall parameter count.
+
+          - Understanding the number of parameters in a model is crucial to designing effective models. More parameters can help the model understand complex data patterns, potentially leading to higher accuracy. However, there's a fine balance to strike. If a model has too many parameters, it risks memorizing the specific examples in its training data rather than learning their underlying patterns. Consequently, it may perform poorly when presented with new, unseen data. Achieving the right balance of parameters is a critical consideration in model development.
+
+          - In recent times, the AI community has witnessed the emergence of what are often referred to as "giant models." These models boast an astounding number of parameters, reaching into the billions or even trillions. While these huge models have achieved remarkable performance, they have a significant computational cost. Effectively managing and training such large-scale models has become a prominent and active area of research and discussion within the AI field.
+
+        display:
+          numDecimalPlaces: 0
+          zeroDay: '1949-01-01'
+          yearIsDay: true
+
+      training_dataset_size__datapoints:
+        title: Training dataset size
+        unit: 'datapoints'
+        description_short: The number of examples provided to train an AI model. Typically, more data results in a more comprehensive understanding by the model.
+        description_key:
+          - Training data size refers to the volume of data employed to train an artificial intelligence (AI) model effectively. It's a representation of the number of examples that the model learns from during its training process. It is a fundamental measure of the scope of the data used in the model's learning phase.
+
+          - To grasp the concept of training data size, imagine teaching a friend the art of distinguishing different types of birds. In this analogy, each bird picture presented to your friend corresponds to an individual piece of training data. If you showed them 100 unique bird photos, then the training data size in this scenario would be quantified as 100.
+
+          - Training data size is an essential indicator in AI and machine learning. First and foremost, it directly impacts the depth of learning achieved by the model. The more extensive the dataset, the more profound and comprehensive the model's understanding of the subject matter becomes. Additionally, a large training data size contributes significantly to improved recognition capabilities. By exposing the model to a diverse array of examples, it becomes adept at identifying subtle nuances, much like how it becomes skilled at distinguishing various bird species through exposure to a large variety of bird images.
+
+        display:
+          numDecimalPlaces: 0
+          zeroDay: '1949-01-01'
+          yearIsDay: true
+
+      training_computation_petaflop:
+        title: Training computation (petaFLOP)
+        unit: 'petaFLOP'
+        description_short: Computation is measured in total petaFLOP, which is 10¹⁵ [floating-point operations](#dod:flop) estimated from AI literature, albeit with some uncertainty.
+        description_key:
+          - In the context of artificial intelligence (AI), training computation is predominantly measured using floating-point operations or “FLOP”. One FLOP represents a single arithmetic operation involving floating-point numbers, such as addition, subtraction, multiplication, or division. To adapt to the vast computational demands of AI systems, the measurement unit of petaFLOP is commonly used. One petaFLOP is one quadrillion (10¹⁵) FLOP, underscoring the magnitude of computational operations within AI.
+
+          - Modern AI systems are rooted in machine learning and deep learning techniques. These methodologies are notorious for their computational intensity, involving complex mathematical processes and algorithms. During the training phase, AI models process large volumes of data, while continuously adapting and refining their parameters to optimize performance, rendering the training process computationally intensive.
+
+          - Many factors influence the magnitude of training computation within AI systems. Notably, the size of the dataset employed for training significantly impacts the computational load. Larger datasets necessitate more processing power. The complexity of the model's architecture also plays a pivotal role; more intricate models lead to more computations. Parallel processing, involving the simultaneous use of multiple processors, also has a substantial effect. Beyond these factors, specific design choices and other variables further contribute to the complexity and scale of training computation within AI.
+
+        description_processing: Training computation was converted from its original measurement in FLOP (floating-point operations) to a more manageable unit known as petaFLOP. This conversion is performed by dividing the original training compute value by 1e15, which represents one quadrillion (10^15). Expressing the training computation in petaFLOP provides a more practical representation of the immense computational effort involved in training AI systems, and makes it easier to grasp the scale of the computational resources required, especially when dealing with large datasets and complex architectures.
+        display:
+          numDecimalPlaces: 0
+          zeroDay: '1949-01-01'
+          yearIsDay: true
+        presentation:
+          grapher_config:
+            title: Training computation
+
+      publication_date:
+        title: Publication date
+        unit: ''
+        description_short: The date when the AI model was first published.
+        description_from_producer: The publication, announcement, or release date of the model, in YYYY-MM-DD format. If the year and month are known but the day is unknown, the day is filled in as YYYY-MM-15. If the year is known but the month and day are unknown, the month and day are filled in as YYYY-07-01.
+
+
+
diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive.py b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive.py
new file mode 100644
index 00000000000..b9b431c4ef0
--- /dev/null
+++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive.py
@@ -0,0 +1,60 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+import pandas as pd
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    paths.log.info("epoch_compute_intensive.start")
+
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("epoch_compute_intensive")
+
+    # Read table from meadow dataset.
+    tb = ds_meadow["epoch_compute_intensive"]
+    tb = tb.reset_index()
+
+    #
+    # Process data.
+    #
+    # Convert FLOP to petaFLOP and remove the column with FLOPs (along with training time in hours)
+    tb["training_computation_petaflop"] = tb["training_compute__flop"] / 1e15
+
+    # Convert publication date to datetime objects
+    tb["publication_date"] = pd.to_datetime(tb["publication_date"])
+
+    # Calculate 'days_since_1949'
+    tb["days_since_1949"] = (tb["publication_date"] - pd.to_datetime("1949-01-01")).dt.days.astype("Int64")
+    tb = tb.dropna(subset=["days_since_1949"])
+
+    tb = tb.reset_index(drop=True)
+
+    assert not tb[["model", "days_since_1949"]].isnull().any().any(), "Index columns should not have NaN values"
+
+    # Drop columns that are not needed
+    tb = tb.drop(
+        ["training_compute__flop", "organization", "authors", "country__from_organization"],
+        axis=1,
+    )
+    tb = tb.format(["days_since_1949", "model"])
+
+    # Add metadata to the publication date column
+    tb["publication_date"].metadata.origins = tb["domain"].metadata.origins
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
+
+    paths.log.info("epoch_compute_intensive.end")
diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.countries.json b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.countries.json
new file mode 100644
index 00000000000..ddfda66807a
--- /dev/null
+++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.countries.json
@@ -0,0 +1,18 @@
+{
+  "Canada": "Canada",
+  "China": "China",
+  "Germany": "Germany",
+  "Israel": "Israel",
+  "Singapore": "Singapore",
+  "United Arab Emirates": "United Arab Emirates",
+  "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
+  "United States of America": "United States",
+  "Korea (Republic of)": "South Korea",
+  "Multinational": "Multinational",
+  "Russia": "Russia",
+  "Japan": "Japan",
+  "France": "France",
+  "Finland": "Finland",
+  "Total": "Total",
+  "Hong Kong": "Hong Kong"
+}
\ No newline at end of file
diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.meta.yml
new file mode 100644
index 00000000000..3f97637a89b
--- /dev/null
+++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.meta.yml
@@ -0,0 +1,31 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  desc_update: The 2024 data is incomplete and was last updated {date_accessed}.
+  common:
+    processing_level: major
+    presentation:
+      topic_tags:
+        - Artificial Intelligence
+      grapher_config:
+        note: Confirmed large-scale AI models are those where the training compute exceeds 10²³ floating-point operations (FLOP).
+
+    unit: 'AI systems'
+    short_unit: ''
+    description_short: Refers to the location of the primary organization with which the authors of a large-scale AI system are affiliated. {definitions.desc_update}
+    description_processing: The number of large-scale AI systems by country is determined by tallying the number of machine learning models that are associated with the geographical location of the researchers' affiliated institutions.
+      It's important to note that a single model can have multiple authors, each potentially affiliated with different institutions, thus contributing to the count for multiple countries.
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/
+
+dataset:
+  update_period_days: 31
+  title: Large-scale AI systems by country
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/
+tables:
+  epoch_compute_intensive_countries:
+    variables:
+      yearly_count:
+        title: Annual number of large-scale AI systems by country
+
+      cumulative_count:
+        title: Cumulative number of large-scale AI systems by country
diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.py b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.py
new file mode 100644
index 00000000000..69bc951a631
--- /dev/null
+++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.py
@@ -0,0 +1,67 @@
+"""Generate aggregated table for total yearly and cumulative number of compute intensive AI systems in each country."""
+
+import datetime as dt
+
+import shared as sh
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    paths.log.info("epoch_compute_intensive_countries.start")
+
+    #
+    # Load inputs.
+    #
+    # Load the meadow dataset.
+    ds_meadow = paths.load_dataset("epoch_compute_intensive")
+
+    # Read table from meadow dataset.
+    tb = ds_meadow["epoch_compute_intensive"]
+    tb = tb.reset_index()
+
+    #
+    # Process data.
+    #
+    # Define the columns that are not needed
+    unused_columns = [
+        "domain",
+        "authors",
+        "organization",
+        "parameters",
+        "training_compute__flop",
+        "training_dataset_size__datapoints",
+    ]
+
+    # Aggregate the data by country
+    tb_agg = sh.calculate_aggregates(tb, "country__from_organization", paths.short_name, unused_columns)
+
+    # Rename the 'country__from_organization' column to 'country'
+    tb_agg = tb_agg.rename(columns={"country__from_organization": "country"})
+
+    # Harmonize the country names
+    tb_agg = geo.harmonize_countries(df=tb_agg, countries_file=paths.country_mapping_path)
+
+    # Set the index to year and country
+    tb_agg = tb_agg.format(["year", "country"])
+
+    date_accessed = tb_agg.yearly_count.m.origins[0].date_accessed
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir,
+        tables=[tb_agg],
+        yaml_params={"date_accessed": dt.datetime.strptime(date_accessed, "%Y-%m-%d").strftime("%d %B %Y")},
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
+
+    paths.log.info("epoch_compute_intensive_countries.end")
diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.meta.yml
new file mode 100644
index 00000000000..4d6697e7541
--- /dev/null
+++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.meta.yml
@@ -0,0 +1,48 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  desc_update: The 2024 data is incomplete and was last updated {date_accessed}.
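For context on the `{date_accessed}` placeholder in `desc_update` above: the scripts in this diff pass `yaml_params` to `create_dataset`, which substitutes a human-readable date into the metadata. A standalone sketch of just the date formatting, outside the ETL helpers:

```python
import datetime as dt

date_accessed = "2024-12-05"  # ISO date, as stored in the origin metadata
pretty = dt.datetime.strptime(date_accessed, "%Y-%m-%d").strftime("%d %B %Y")
print(pretty)  # -> 05 December 2024
```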
+  common:
+    processing_level: major
+    presentation:
+      topic_tags:
+        - Artificial Intelligence
+      grapher_config:
+        note: Confirmed large-scale AI models are those where the training compute exceeds 10²³ floating-point operations (FLOP).
+    description_short: Describes the specific area, application, or field in which a large-scale AI model is designed to operate. {definitions.desc_update}
+    description_key:
+      - Game systems are specifically designed for games and excel in understanding and strategizing gameplay. For instance, AlphaGo, developed by DeepMind, defeated the world champion in the game of Go. Such systems use complex algorithms to compete effectively, even against skilled human players.
+
+      - Language systems are tailored to process language, focusing on understanding, translating, and interacting with human languages. Examples include chatbots, machine translation tools like Google Translate, and sentiment analysis algorithms that can detect emotions in text.
+
+      - Multimodal systems are artificial intelligence frameworks that integrate and interpret more than one type of data input, such as text, images, and audio. ChatGPT-4 is an example of a multimodal model, as it has the capability to process and generate responses based on both textual and visual inputs.
+
+      - Vision systems focus on processing visual information, playing a pivotal role in image recognition and related areas. For example, Facebook's photo tagging model uses vision AI to identify faces.
+
+      - Speech systems are dedicated to handling spoken language, serving as the backbone of voice assistants and similar applications. They recognize, interpret, and generate spoken language to interact with users.
+
+      - Biology systems analyze biological data and simulate biological processes, aiding in drug discovery and genetic research.
+
+      - Image generation systems create visual content from text descriptions or other inputs, used in graphic design and content creation.
+
+    description_processing: The count of large-scale AI models per domain is derived by tallying the instances of machine learning models classified under each domain category. It's important to note that a single machine learning model can fall under multiple domains. The classification into domains is determined by the specific area, application, or field that the AI model is primarily designed to operate within.
+    description_from_producer: A foreign key field categorizing the system’s domain of machine learning. This field links to the [ML Domains table](https://airtable.com/appDFXXgaG1xLtXGL/shrhzolGiQCVnwOY5/tbleYEsZORsiYRVTM), and domains are selected from the options in that table.
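The 10²³ FLOP threshold in the `grapher_config` note above is the defining criterion for "large-scale" models here. A hedged sketch of what that selection amounts to (the rows are hypothetical; the column name follows the tables in this diff):

```python
import pandas as pd

LARGE_SCALE_THRESHOLD_FLOP = 1e23  # threshold quoted in the note above

# Hypothetical training-compute estimates in FLOP.
tb = pd.DataFrame({"model": ["X", "Y"], "training_compute__flop": [5e21, 3e24]})

# Keep only confirmed large-scale models.
large_scale = tb[tb["training_compute__flop"] > LARGE_SCALE_THRESHOLD_FLOP]
print(large_scale["model"].tolist())  # -> ['Y']
```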
+    unit: 'AI systems'
+    short_unit: ''
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/
+dataset:
+  update_period_days: 31
+  title: Large-scale AI systems by domain type
+
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/
+tables:
+  epoch_compute_intensive_domain:
+    variables:
+      yearly_count:
+        title: Annual number of large-scale AI models by domain
+
+      cumulative_count:
+        title: Cumulative number of large-scale AI models by domain
diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.py b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.py
new file mode 100644
index 00000000000..e832677a43d
--- /dev/null
+++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.py
@@ -0,0 +1,60 @@
+"""Generate aggregated table for total yearly and cumulative number of compute intensive AI systems for each domain."""
+
+import datetime as dt
+
+import shared as sh
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    paths.log.info("epoch_compute_intensive_domain.start")
+
+    #
+    # Load inputs.
+    #
+    # Load the meadow dataset.
+    ds_meadow = paths.load_dataset("epoch_compute_intensive")
+
+    # Read table from meadow dataset.
+    tb = ds_meadow["epoch_compute_intensive"]
+    tb = tb.reset_index()
+
+    #
+    # Process data.
+    #
+    # Define the columns that are not needed
+    unused_columns = [
+        "authors",
+        "country__from_organization",
+        "organization",
+        "parameters",
+        "training_compute__flop",
+        "training_dataset_size__datapoints",
+    ]
+
+    # Aggregate the data by domain
+    tb_agg = sh.calculate_aggregates(tb, "domain", paths.short_name, unused_columns)
+
+    # Set the index to year and domain
+    tb_agg = tb_agg.format(["year", "domain"])
+
+    date_accessed = tb_agg.yearly_count.m.origins[0].date_accessed
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir,
+        tables=[tb_agg],
+        yaml_params={"date_accessed": dt.datetime.strptime(date_accessed, "%Y-%m-%d").strftime("%d %B %Y")},
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
+
+    paths.log.info("epoch_compute_intensive_domain.end")
diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_regressions.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_regressions.meta.yml
new file mode 100644
index 00000000000..8bffd4fdf09
--- /dev/null
+++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_regressions.meta.yml
@@ -0,0 +1,12 @@
+
+definitions:
+  common:
+    processing_level: major
+    presentation:
+      topic_tags:
+        - Artificial Intelligence
+    description_processing: |-
+      We performed a regression analysis, fitting exponential models to the data for both the pre-deep-learning era (before 2010) and the deep-learning era (after 2010), using the code provided by researchers from Epoch.
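On reading the fits produced by the script below: regressing log₁₀(metric) on fractional year gives log₁₀(y) ≈ a + b·t, so b is the gain in orders of magnitude per year and 10^b is the yearly growth factor (this is what the `x/year` label in the code encodes). A self-contained sketch with synthetic data:

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# Synthetic series that doubles every year: log10(y) grows by log10(2) per year.
years = np.arange(2010, 2020, dtype=float).reshape(-1, 1)
y = 2.0 ** (years.ravel() - 2010)

reg = LinearRegression().fit(years, np.log10(y))
oom_per_year = reg.coef_[0]  # orders of magnitude per year, ~0.301
print(f"{10**oom_per_year:.1f}x/year")  # -> 2.0x/year
```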
+dataset:
+  title: Parameter, Compute and Data Trends in Machine Learning - Regressions
+
diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_regressions.py b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_regressions.py
new file mode 100644
index 00000000000..8968e2c76a4
--- /dev/null
+++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_regressions.py
@@ -0,0 +1,145 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+import numpy as np
+import owid.catalog.processing as pr
+import pandas as pd
+from owid.catalog import Table
+from sklearn.linear_model import LinearRegression
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+# Constants for defining the time periods
+DL_ERA_START = 2010
+START_DATE = 1950
+END_DATE = 2025.2
+
+
+def run(dest_dir: str) -> None:
+    paths.log.info("epoch_regressions.start")
+
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("epoch")
+
+    # Read table from meadow dataset.
+    tb = ds_meadow["epoch"].reset_index()
+
+    # Run regression analysis and concatenate results
+    tb_trend = run_regression(tb)
+    tb = tb.drop("frac_year", axis=1)
+    tb = pr.concat([tb_trend, tb])
+
+    # Format the table
+    tb = tb.format(["days_since_1949", "model"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
+
+    paths.log.info("epoch_regressions.end")
+
+
+def fit_exponential(models, metric):
+    """Fit an exponential model to the given metric data. Code provided by Epoch AI team."""
+    x = models["frac_year"].values.reshape(-1, 1)
+    y = models[metric]
+
+    # Filter out non-positive values
+    positive_mask = y > 0
+    x = x[positive_mask]
+    y = y[positive_mask]
+
+    # Apply log10 transformation
+    y = np.log10(y)
+
+    # Filter out infinite and extremely large values
+    finite_mask = np.isfinite(y) & (y < np.finfo(np.float32).max)
+    x = x[finite_mask]
+    y = y[finite_mask]
+
+    # Fit linear regression model
+    reg = LinearRegression().fit(x, y)
+    return reg.intercept_, reg.coef_[0]
+
+
+def run_regression(tb):
+    """Run regression analysis on the given table and return the updated table."""
+    # Add fractional year for sorting and processing
+    publication_dates = tb["publication_date"]
+    tb.loc[:, "frac_year"] = (
+        publication_dates.dt.year + (publication_dates.dt.month - 1) / 12 + (publication_dates.dt.day - 1) / 365
+    )
+    tb = tb.sort_values(by="frac_year")
+
+    # Define periods dynamically
+    periods = {
+        f"{START_DATE}–{DL_ERA_START}": (tb["frac_year"] < DL_ERA_START),
+        f"{DL_ERA_START}–{int(END_DATE)}": ((tb["frac_year"] >= DL_ERA_START) & (tb["frac_year"] < END_DATE)),
+    }
+    # Define year grids dynamically
+    year_grids = {
+        f"{START_DATE}–{DL_ERA_START}": np.array([START_DATE, DL_ERA_START]),
+        f"{DL_ERA_START}–{int(END_DATE)}": np.array([DL_ERA_START, END_DATE]),
+    }
+
+    metrics = ["training_computation_petaflop", "parameters", "training_dataset_size__datapoints"]
+    new_tables = []
+
+    for metric in metrics:
+        # Filter out models without the metric information
+        tb_metric = tb[pd.notnull(tb[metric])]
+        dfs = []
+
+        for period_name, condition in periods.items():
+            # Subset data for the current period
+            period_data = tb_metric[condition]
+
+            # Fit exponential model
+            fit = fit_exponential(period_data, metric)
+            oom_per_year = fit[1]
+            info = f"{10**oom_per_year:.1f}x/year"
+
+            # Log the results
+            paths.log.info(f"{period_name} ({metric}): {info}")
+
+            # Calculate the regression line for the current period
+            year_grid = year_grids[period_name]
+            line = 10 ** (fit[0] + year_grid * fit[1])
+
+            # Create DataFrame for the current period
+            df = pd.DataFrame(
+                {
+                    "days_since_1949": [
+                        period_data["days_since_1949"].min(),
+                        period_data["days_since_1949"].max(),
+                    ],
+                    f"{metric}": [line[0], line[-1]],
+                    "model": [f"{info} between {period_name}"] * 2,
+                }
+            )
+            dfs.append(df)
+
+        # Combine the DataFrames for all periods for the current metric
+        df_combined = pd.concat(dfs, ignore_index=True)
+        new_tables.append(df_combined)
+
+    # Merge all the new DataFrames
+    tb_new = new_tables[0]
+    for tb_m in new_tables[1:]:
+        tb_new = pd.merge(tb_new, tb_m, on=["model", "days_since_1949"], how="outer")
+
+    # Convert to OWID Table and add metadata
+    tb_new = Table(tb_new, short_name=paths.short_name)
+    for column in tb_new.columns:
+        tb_new[column].metadata.origins = tb["publication_date"].metadata.origins
+
+    return tb_new
diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/shared.py b/etl/steps/data/garden/artificial_intelligence/2024-12-05/shared.py
new file mode 100644
index 00000000000..016e6812e4d
--- /dev/null
+++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/shared.py
@@ -0,0 +1,74 @@
+from typing import List
+
+import owid.catalog.processing as pr
+import pandas as pd
+from owid.catalog import Table
+
+from etl.helpers import PathFinder
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def calculate_aggregates(tb: Table, agg_column: str, short_name: str, unused_columns: List[str]) -> Table:
+    """
+    This function calculates aggregates for a given column in a Table. It is used to calculate the total yearly and cumulative number of notable AI systems for each domain or country.
+
+    Parameters:
+        tb (Table): The input Table.
+        agg_column (str): The column to aggregate on.
+        short_name (str): The short name to set for the table.
+        unused_columns (List[str]): The list of columns to drop from the table.
+
+    Returns:
+        Table: The output Table with calculated aggregates.
+ """ + + # Store the origins metadata for later use + origins = tb[agg_column].metadata.origins + + # Drop the unused columns + tb = tb.drop(unused_columns, axis=1) + + # Convert the 'publication_date' column to datetime format and extract the year + tb["publication_date"] = pd.to_datetime(tb["publication_date"]) + tb["year"] = tb["publication_date"].dt.year + + # Convert the column to category type so that the missing values will be considered as 0 + tb[agg_column] = tb[agg_column].astype("category") + + # Group total yearly counts and calculate cumulative count for total number of systems + tb_total = tb.groupby(["year"]).size().reset_index(name="yearly_count") + total_counts = tb_total.groupby("year")["yearly_count"].sum().reset_index() + total_counts[agg_column] = "Total" + total_counts["cumulative_count"] = total_counts["yearly_count"].cumsum() + + # Split the column to be aggregated by comma (several countries/domains can exist in each cell) + tb[agg_column] = tb[agg_column].str.split(",") + + # Explode the table to create separate rows for each country or domain + tb_exploded = tb.explode(agg_column) + + # Convert the column to category type so that the missing values will be considered as 0 + tb_exploded[agg_column] = tb_exploded[agg_column].astype("category") + + # Drop duplicates where the year, model and country/domain are the same + tb_unique = tb_exploded.drop_duplicates(subset=["year", "model", agg_column]) + + # Group by year and country/domain and count the number of systems (consider all categories which will assume 0 for missing values) + tb_agg = tb_unique.groupby(["year", agg_column], observed=False).size().reset_index(name="yearly_count") + + # Calculate the cumulative count (consider all categories which will assume 0 for missing values) + tb_agg["cumulative_count"] = tb_agg.groupby(agg_column, observed=False)["yearly_count"].cumsum() + + # Combine aggregated data with total counts + tb_agg = pr.concat([tb_agg, total_counts], ignore_index=True) + + # Add the origins metadata to the columns + for col in ["yearly_count", "cumulative_count"]: + tb_agg[col].metadata.origins = origins + + # Set the short_name metadata of the table + tb_agg.metadata.short_name = short_name + + return tb_agg diff --git a/etl/steps/data/garden/bgs/2024-07-09/world_mineral_statistics.py b/etl/steps/data/garden/bgs/2024-07-09/world_mineral_statistics.py index efee436a46b..95a09fe85a7 100644 --- a/etl/steps/data/garden/bgs/2024-07-09/world_mineral_statistics.py +++ b/etl/steps/data/garden/bgs/2024-07-09/world_mineral_statistics.py @@ -1053,7 +1053,7 @@ def aggregate_coal(tb: Table) -> Table: # Visually compare the resulting series with the one from the Statistical Review of World Energy. # from etl.paths import DATA_DIR - # tb_sr = Dataset(DATA_DIR / "garden/energy_institute/2024-06-20/statistical_review_of_world_energy").read_table("statistical_review_of_world_energy") + # tb_sr = Dataset(DATA_DIR / "garden/energy_institute/2024-06-20/statistical_review_of_world_energy").read("statistical_review_of_world_energy") # tb_sr = tb_sr[["country", "year", 'coal_production_mt']].rename(columns={"coal_production_mt": "value"}) # tb_sr["value"] *= 1e6 # compare = pr.concat([tb_sr.assign(**{"source": "EI"}), tb_coal_sum.assign(**{"source": "BGS"})], ignore_index=True) @@ -1132,7 +1132,7 @@ def run(dest_dir: str) -> None: # # Load meadow dataset and read its main table. 
     ds_meadow = paths.load_dataset("world_mineral_statistics")
-    tb = ds_meadow.read_table("world_mineral_statistics")
+    tb = ds_meadow.read("world_mineral_statistics", safe_types=False)
 
     # Load regions dataset.
     ds_regions = paths.load_dataset("regions")
diff --git a/etl/steps/data/garden/biodiversity/2022/living_planet_index.py b/etl/steps/data/garden/biodiversity/2022/living_planet_index.py
index a4968d65180..02d9f00c5ef 100644
--- a/etl/steps/data/garden/biodiversity/2022/living_planet_index.py
+++ b/etl/steps/data/garden/biodiversity/2022/living_planet_index.py
@@ -11,7 +11,7 @@ def run(dest_dir: str) -> None:
     # Load data from snapshot.
     #
     snap = paths.load_snapshot()
-    tb = snap.read().set_index(["country", "year"])
+    tb = snap.read(safe_types=False).set_index(["country", "year"])
 
     #
     # Save outputs.
diff --git a/etl/steps/data/garden/biodiversity/2024-01-25/cherry_blossom.py b/etl/steps/data/garden/biodiversity/2024-01-25/cherry_blossom.py
index e4124fa23c3..67deb75a6e8 100644
--- a/etl/steps/data/garden/biodiversity/2024-01-25/cherry_blossom.py
+++ b/etl/steps/data/garden/biodiversity/2024-01-25/cherry_blossom.py
@@ -12,9 +12,9 @@ def run(dest_dir: str) -> None:
 
     log.info("cherry_blossom.start")
 
-    # read dataset from meadow
+    # Read dataset from meadow.
     ds_meadow = paths.load_dataset("cherry_blossom")
-    tb = ds_meadow["cherry_blossom"].reset_index()
+    tb = ds_meadow.read("cherry_blossom")
 
     # Calculate a 20,40 and 50 year average
     tb = calculate_multiple_year_average(tb)
diff --git a/etl/steps/data/garden/biodiversity/2024-08-12/invasive_species.py b/etl/steps/data/garden/biodiversity/2024-08-12/invasive_species.py
index 06eab3b6f83..6dd29c87cbc 100644
--- a/etl/steps/data/garden/biodiversity/2024-08-12/invasive_species.py
+++ b/etl/steps/data/garden/biodiversity/2024-08-12/invasive_species.py
@@ -16,10 +16,10 @@ def run(dest_dir: str) -> None:
     ds_meadow = paths.load_dataset("invasive_species")
 
     # Read table from meadow dataset.
-    tb_cont = ds_meadow["continental"].reset_index()
+    tb_cont = ds_meadow.read("continental")
     tb_cont = tb_cont.rename(columns={"continent": "country"})
 
-    tb_glob = ds_meadow["global"].reset_index()
+    tb_glob = ds_meadow.read("global")
 
     # Combine the global and continental datasets
     tb = pr.concat([tb_cont, tb_glob])
     # Not clear from the paper what this group includes, and there aren't many of them so I'll drop it for now
diff --git a/etl/steps/data/garden/biodiversity/2024-09-30/living_planet_index_completeness.py b/etl/steps/data/garden/biodiversity/2024-09-30/living_planet_index_completeness.py
index 2c96f2abccc..c4464e8b072 100644
--- a/etl/steps/data/garden/biodiversity/2024-09-30/living_planet_index_completeness.py
+++ b/etl/steps/data/garden/biodiversity/2024-09-30/living_planet_index_completeness.py
@@ -14,7 +14,7 @@ def run(dest_dir: str) -> None:
     snap = paths.load_snapshot("living_planet_index_completeness")
 
     # Read table from meadow dataset.
-    tb = snap.read()
+    tb = snap.read(safe_types=False)
     #
     # Process data.
     #
diff --git a/etl/steps/data/garden/biodiversity/2024-09-30/living_planet_index_share.py b/etl/steps/data/garden/biodiversity/2024-09-30/living_planet_index_share.py
index 4bafd6c6163..efc3e6f5fed 100644
--- a/etl/steps/data/garden/biodiversity/2024-09-30/living_planet_index_share.py
+++ b/etl/steps/data/garden/biodiversity/2024-09-30/living_planet_index_share.py
@@ -14,7 +14,7 @@ def run(dest_dir: str) -> None:
     snap = paths.load_snapshot("living_planet_index_share")
 
     # Read table from meadow dataset.
-    tb = snap.read()
+    tb = snap.read(safe_types=False)
     #
     # Process data.
     #
diff --git a/etl/steps/data/garden/climate/2024-11-18/climate_change_impacts.meta.yml b/etl/steps/data/garden/climate/2024-11-18/climate_change_impacts.meta.yml
new file mode 100644
index 00000000000..2a5bbd540b2
--- /dev/null
+++ b/etl/steps/data/garden/climate/2024-11-18/climate_change_impacts.meta.yml
@@ -0,0 +1,24 @@
+definitions:
+  common:
+    presentation:
+      topic_tags:
+        - Climate Change
+
+dataset:
+  title: Climate Change Impacts
+  update_period_days: 60
+
+tables:
+  climate_change_impacts_annual:
+    title: Climate Change Impacts - Annual
+    variables:
+      arctic_sea_ice_extent_min:
+        title: Minimum Arctic sea ice extent
+      arctic_sea_ice_extent_max:
+        title: Maximum Arctic sea ice extent
+      antarctic_sea_ice_extent_min:
+        title: Minimum Antarctic sea ice extent
+      antarctic_sea_ice_extent_max:
+        title: Maximum Antarctic sea ice extent
+  climate_change_impacts_monthly:
+    title: Climate Change Impacts - Monthly
diff --git a/etl/steps/data/garden/climate/2024-11-18/climate_change_impacts.py b/etl/steps/data/garden/climate/2024-11-18/climate_change_impacts.py
new file mode 100644
index 00000000000..38f00ffd808
--- /dev/null
+++ b/etl/steps/data/garden/climate/2024-11-18/climate_change_impacts.py
@@ -0,0 +1,174 @@
+"""Create a garden dataset with all climate change impacts data."""
+
+from typing import Tuple
+
+from owid.catalog import Table
+from owid.datautils.dataframes import combine_two_overlapping_dataframes
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def prepare_sea_ice_extent(tb_nsidc: Table) -> Tuple[Table, Table]:
+    tb_nsidc = tb_nsidc.copy()
+    # Create a table with the minimum and maximum Arctic sea ice extent.
+    # Assume minimum and maximum occur in September and February every year.
+    tb_nsidc["month"] = tb_nsidc["date"].astype(str).str[5:7]
+    tb_nsidc["year"] = tb_nsidc["date"].astype(str).str[0:4].astype(int)
+    arctic_sea_ice_extent = (
+        tb_nsidc[(tb_nsidc["location"] == "Northern Hemisphere") & (tb_nsidc["month"].isin(["02", "09"]))]
+        .pivot(index=["location", "year"], columns=["month"], values="sea_ice_extent", join_column_levels_with=" ")
+        .rename(columns={"02": "arctic_sea_ice_extent_max", "09": "arctic_sea_ice_extent_min"}, errors="raise")
+    )
+    # Instead of calling the location a generic "Northern Hemisphere", call it "Arctic Ocean".
+    arctic_sea_ice_extent["location"] = "Arctic Ocean"
+
+    # Idem for the Antarctic sea ice extent.
+    # Assume maximum and minimum occur in September and February every year.
+    antarctic_sea_ice_extent = (
+        tb_nsidc[(tb_nsidc["location"] == "Southern Hemisphere") & (tb_nsidc["month"].isin(["02", "09"]))]
+        .pivot(index=["location", "year"], columns=["month"], values="sea_ice_extent", join_column_levels_with=" ")
+        .rename(columns={"02": "antarctic_sea_ice_extent_min", "09": "antarctic_sea_ice_extent_max"}, errors="raise")
+    )
+    # Instead of calling the location a generic "Southern Hemisphere", call it "Antarctica".
+    antarctic_sea_ice_extent["location"] = "Antarctica"
+
+    return arctic_sea_ice_extent, antarctic_sea_ice_extent
+
+
+def prepare_ocean_heat_content(tb_ocean_heat_annual: Table, tb_ocean_heat_annual_epa: Table) -> Table:
+    # Combine NOAA's annual data on ocean heat content (which is more up-to-date) with the analogous EPA's data based on
+    # NOAA (which, for some reason, spans a longer time range for 2000m). Prioritize NOAA's data on common years.
+ tb_ocean_heat_annual = combine_two_overlapping_dataframes( + tb_ocean_heat_annual.rename( + columns={ + "ocean_heat_content_700m": "ocean_heat_content_noaa_700m", + "ocean_heat_content_2000m": "ocean_heat_content_noaa_2000m", + }, + errors="raise", + ), + tb_ocean_heat_annual_epa, + index_columns=["location", "year"], + ) + # Recover the original indicator titles (they are empty because of combining two columns with different titles). + tb_ocean_heat_annual["ocean_heat_content_noaa_700m"].metadata.title = tb_ocean_heat_annual_epa[ + "ocean_heat_content_noaa_700m" + ].metadata.title + tb_ocean_heat_annual["ocean_heat_content_noaa_2000m"].metadata.title = tb_ocean_heat_annual_epa[ + "ocean_heat_content_noaa_2000m" + ].metadata.title + + return tb_ocean_heat_annual + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load GISS dataset of surface temperature analysis, and read its monthly data. + ds_giss = paths.load_dataset("surface_temperature_analysis") + tb_giss = ds_giss["surface_temperature_analysis"].reset_index() + + # Load NSIDC dataset of sea ice index. + ds_nsidc = paths.load_dataset("sea_ice_index") + tb_nsidc = ds_nsidc["sea_ice_index"].reset_index() + + # Load Met Office dataset on sea surface temperature. + ds_met_office = paths.load_dataset("sea_surface_temperature") + tb_met_office = ds_met_office["sea_surface_temperature"].reset_index() + + # Load NOAA/NCEI dataset on ocean heat content. + ds_ocean_heat = paths.load_dataset("ocean_heat_content", namespace="climate") + tb_ocean_heat_monthly = ds_ocean_heat["ocean_heat_content_monthly"].reset_index() + tb_ocean_heat_annual = ds_ocean_heat["ocean_heat_content_annual"].reset_index() + + # Load EPA's compilation of data on ocean heat content. + ds_epa = paths.load_dataset("ocean_heat_content", namespace="epa") + tb_ocean_heat_annual_epa = ds_epa["ocean_heat_content"].reset_index() + + # Load ocean pH data from the School of Ocean and Earth Science and Technology. + ds_ocean_ph = paths.load_dataset("ocean_ph_levels") + tb_ocean_ph = ds_ocean_ph["ocean_ph_levels"].reset_index() + + # Load snow cover extent from Rutgers University Global Snow Lab. + ds_snow = paths.load_dataset("snow_cover_extent") + tb_snow = ds_snow["snow_cover_extent"].reset_index() + + # Load ice sheet mass balance data from EPA. + ds_ice_sheet = paths.load_dataset("ice_sheet_mass_balance") + tb_ice_sheet = ds_ice_sheet["ice_sheet_mass_balance"].reset_index() + + # Load annual data on mass balance of US glaciers from EPA. + ds_us_glaciers = paths.load_dataset("mass_balance_us_glaciers") + tb_us_glaciers = ds_us_glaciers["mass_balance_us_glaciers"].reset_index() + + # Load monthly greenhouse gas concentration data from NOAA/GML. + ds_gml = paths.load_dataset("ghg_concentration") + tb_gml = ds_gml["ghg_concentration"].reset_index() + + # Load long-run yearly greenhouse gas concentration data. + ds_ghg = paths.load_dataset("long_run_ghg_concentration") + tb_ghg = ds_ghg["long_run_ghg_concentration"].reset_index() + + # Load global sea level. + ds_sea_level = paths.load_dataset("global_sea_level") + tb_sea_level = ds_sea_level["global_sea_level"].reset_index() + + # + # Process data. + # + # Prepare sea ice extent data. + arctic_sea_ice_extent, antarctic_sea_ice_extent = prepare_sea_ice_extent(tb_nsidc=tb_nsidc) + + # Prepare ocean heat content data. + tb_ocean_heat_annual = prepare_ocean_heat_content( + tb_ocean_heat_annual=tb_ocean_heat_annual, tb_ocean_heat_annual_epa=tb_ocean_heat_annual_epa + ) + + # Gather monthly data from different tables.
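# NOTE: The loop below repeatedly outer-merges on ["location", "date"], so a table that lacks a
# given (location, date) pair simply contributes nans for its columns. A toy illustration with
# made-up values:
#
#   left = pd.DataFrame({"location": ["World"], "date": ["2000-01-15"], "a": [1.0]})
#   right = pd.DataFrame({"location": ["World"], "date": ["2000-02-15"], "b": [2.0]})
#   left.merge(right, how="outer", on=["location", "date"])
#   # -> two rows; "a" is nan on 2000-02-15 and "b" is nan on 2000-01-15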
+ tb_monthly = tb_giss.astype({"date": str}).copy() + # NOTE: The values in tb_ocean_ph are monthly, but the dates are not consistently on the middle of the month. + # Instead, they are on different days of the month. When merging with other tables, this will create many nans. + # We could fix this by reindexing and interpolating, but the impact on the combined table is minor. + for table in [ + tb_nsidc, + tb_met_office, + tb_ocean_heat_monthly, + tb_ocean_ph, + tb_snow, + tb_ice_sheet, + tb_gml, + tb_sea_level, + ]: + tb_monthly = tb_monthly.merge( + table.astype({"date": str}), + how="outer", + on=["location", "date"], + validate="one_to_one", + short_name="climate_change_impacts_monthly", + ) + + # Gather annual data from different tables. + tb_annual = tb_ocean_heat_annual.copy() + for table in [arctic_sea_ice_extent, antarctic_sea_ice_extent, tb_ghg, tb_us_glaciers.astype({"year": int})]: + tb_annual = tb_annual.merge( + table, + how="outer", + on=["location", "year"], + validate="one_to_one", + short_name="climate_change_impacts_annual", + ) + tb_annual.metadata.short_name = "climate_change_impacts_annual" + + # Set an appropriate index to monthly and annual tables, and sort conveniently. + tb_monthly = tb_monthly.set_index(["location", "date"], verify_integrity=True).sort_index() + tb_annual = tb_annual.set_index(["location", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset with the annual and monthly tables. + ds_garden = create_dataset(dest_dir, tables=[tb_annual, tb_monthly]) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-11-18/ghg_concentration.meta.yml b/etl/steps/data/garden/climate/2024-11-18/ghg_concentration.meta.yml new file mode 100644 index 00000000000..ca5e6073998 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-18/ghg_concentration.meta.yml @@ -0,0 +1,44 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + description_short: Measured in parts per million.
+ +dataset: + update_period_days: 60 + +tables: + ghg_concentration: + title: Monthly greenhouse gas concentration + variables: + co2_concentration: + title: Monthly concentration of atmospheric carbon dioxide + processing_level: minor + unit: parts per million + short_unit: ppm + ch4_concentration: + title: Monthly concentration of atmospheric methane + processing_level: minor + unit: parts per billion + short_unit: ppb + n2o_concentration: + title: Monthly concentration of atmospheric nitrous oxide + processing_level: minor + unit: parts per billion + short_unit: ppb + co2_concentration_yearly_average: + title: Rolling yearly average of the concentration of atmospheric carbon dioxide + processing_level: major + unit: parts per million + short_unit: ppm + ch4_concentration_yearly_average: + title: Rolling yearly average of the concentration of atmospheric methane + processing_level: major + unit: parts per billion + short_unit: ppb + n2o_concentration_yearly_average: + title: Rolling yearly average of the concentration of atmospheric nitrous oxide + processing_level: major + unit: parts per billion + short_unit: ppb diff --git a/etl/steps/data/garden/climate/2024-11-18/ghg_concentration.py b/etl/steps/data/garden/climate/2024-11-18/ghg_concentration.py new file mode 100644 index 00000000000..914ee6e8776 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-18/ghg_concentration.py @@ -0,0 +1,139 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from typing import List + +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Columns to select from the data, and how to rename them. +COLUMNS = { + "year": "year", + "month": "month", + "average": "concentration", + # The following column is loaded only to perform a sanity check. + "decimal": "decimal", +} + + +def add_rolling_average(tb: Table, original_column_names: List[str]) -> Table: + tb_with_average = tb.copy() + + # Create a date range of each month (on the 15th). + # NOTE: The minimum date in the data is "2001-01-15", however, when passing this date to pd.date_range with + # freq="MS", the first point is dismissed because it is not the start of a month. For that reason, we shift the + # first point to be at the beginning of the month. + date_range = pd.date_range( + start=tb_with_average["date"].min() - pd.tseries.offsets.MonthBegin(1), + end=tb_with_average["date"].max(), + freq="MS", + ) + pd.DateOffset(days=14) + + # Get unique locations. + unique_locations = tb_with_average["location"].unique() + + # Set date as index and sort. + tb_with_average = tb_with_average.set_index(["location", "date"]).sort_index() + + # Create a MultiIndex with all possible combinations of date and location. + multi_index = pd.MultiIndex.from_product([unique_locations, date_range], names=["location", "date"]) + + # Reindex using the MultiIndex. + tb_with_average = tb_with_average.reindex(multi_index) + + for original_column_name in original_column_names: + # Create a rolling average with a window of one year, linearly interpolating missing values. + # NOTE: Currently no interpolation is needed, as no data points are missing (and in fact date_range is identical + # to the dates in the data). However, we need to interpolate in case there are missing points. Otherwise all + # points after the missing one will be nan. 
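# NOTE: A quick sketch of why interpolating first matters (values made up): with the pandas
# default min_periods, a single nan makes every rolling window that contains it nan.
#
#   s = pd.Series([1.0, 2.0, None, 4.0, 5.0, 6.0])
#   s.rolling(3).mean()                        # all windows touching the nan are nan
#   s.interpolate("linear").rolling(3).mean()  # gap filled first, so windows stay defined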
+ tb_with_average[f"{original_column_name}_yearly_average"] = ( + tb_with_average[original_column_name].interpolate("linear").rolling(12).mean() + ) + + # Drop empty rows. + tb_with_average = tb_with_average.dropna(subset=original_column_names, how="all").reset_index() + + # Sort conveniently. + tb_with_average = tb_with_average.sort_values(["location", "date"]).reset_index(drop=True) + + for original_column_name in original_column_names: + # Check that the values of the original column have not been altered. + error = f"The values of the original {original_column_name} column have been altered." + assert tb_with_average[original_column_name].astype(float).equals(tb[original_column_name].astype(float)), error + + return tb_with_average + + +def prepare_gas_data(tb: Table) -> Table: + tb = tb.copy() + + # Extract gas name from table's short name. + gas = tb.metadata.short_name.split("_")[0] + + # Columns to select from the data, and how to rename them. + columns = { + "year": "year", + "month": "month", + "average": f"{gas}_concentration", + # The following column is loaded only to perform a sanity check. + "decimal": "decimal", + } + + # Select necessary columns and rename them. + tb = tb[list(columns)].rename(columns=columns, errors="raise") + + # There is a "decimal" column for the year as a decimal number, that only has 12 possible values, corresponding to + # the middle of each month, so we will assume the 15th of each month. + error = "Date format has changed." + assert len(set(tb["decimal"].astype(str).str.split(".").str[1])) == 12, error + assert set(tb["month"]) == set(range(1, 13)), error + tb["date"] = pd.to_datetime(tb[["year", "month"]].assign(day=15)) + + # Remove unnecessary columns. + tb = tb.drop(columns=["year", "month", "decimal"], errors="raise") + + # Add a location column. + tb["location"] = "World" + + # Add a column with a rolling average for each gas. + tb = add_rolling_average(tb=tb, original_column_names=[f"{gas}_concentration"]) + + return tb + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("ghg_concentration") + tb_co2 = ds_meadow["co2_concentration_monthly"].reset_index() + tb_ch4 = ds_meadow["ch4_concentration_monthly"].reset_index() + tb_n2o = ds_meadow["n2o_concentration_monthly"].reset_index() + + # + # Process data. + # + # Prepare data for each gas. + tb_co2 = prepare_gas_data(tb=tb_co2) + tb_ch4 = prepare_gas_data(tb=tb_ch4) + tb_n2o = prepare_gas_data(tb=tb_n2o) + + # Combine data for different gases. + tb = tb_co2.merge(tb_ch4, how="outer", on=["location", "date"]).merge( + tb_n2o, how="outer", on=["location", "date"], short_name=paths.short_name + ) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset. 
+ ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-11-18/long_run_ghg_concentration.meta.yml b/etl/steps/data/garden/climate/2024-11-18/long_run_ghg_concentration.meta.yml new file mode 100644 index 00000000000..b02cba814ea --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-18/long_run_ghg_concentration.meta.yml @@ -0,0 +1,27 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + - CO2 & Greenhouse Gas Emissions + description_processing: |- + - Long-run data from ice core studies has been merged with recent measurements of atmospheric concentration of greenhouse gases. + +dataset: + update_period_days: 0 + +tables: + long_run_ghg_concentration: + variables: + co2_concentration: + title: Long-run CO₂ concentration + unit: parts per million volume + short_unit: ppmv + ch4_concentration: + title: Long-run CH₄ concentration + unit: parts per billion volume + short_unit: ppbv + n2o_concentration: + title: Long-run N₂O concentration + unit: parts per billion volume + short_unit: ppbv diff --git a/etl/steps/data/garden/climate/2024-11-18/long_run_ghg_concentration.py b/etl/steps/data/garden/climate/2024-11-18/long_run_ghg_concentration.py new file mode 100644 index 00000000000..0e07095b425 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-18/long_run_ghg_concentration.py @@ -0,0 +1,84 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from owid.catalog import Table +from owid.datautils.dataframes import combine_two_overlapping_dataframes + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def convert_monthly_to_annual(tb_new: Table) -> Table: + tb_new = tb_new.copy() + + # Create a year column. + tb_new["year"] = tb_new["date"].dt.year + + # Create a table with the number of observations per year. + tb_counts = tb_new.groupby("year", as_index=False).agg( + { + "co2_concentration": "count", + "ch4_concentration": "count", + "n2o_concentration": "count", + } + ) + # Create a table with the average annual values. + tb_new = tb_new.groupby("year", as_index=False).agg( + { + "co2_concentration": "mean", + "ch4_concentration": "mean", + "n2o_concentration": "mean", + } + ) + # Make nan all data points based on less than 12 observations per year. + for gas in ["co2", "ch4", "n2o"]: + tb_new.loc[tb_counts[f"{gas}_concentration"] < 12, f"{gas}_concentration"] = None + + # Drop empty rows. + tb_new = tb_new.dropna( + subset=["co2_concentration", "ch4_concentration", "n2o_concentration"], how="all" + ).reset_index(drop=True) + + return tb_new + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset on long-run GHG concentrations from EPA, and read its main table. + ds_old = paths.load_dataset("ghg_concentration", namespace="epa") + tb_old = ds_old["ghg_concentration"].reset_index() + + # Load garden dataset of up-to-date GHG concentrations, and read its main table. + ds_new = paths.load_dataset("ghg_concentration", namespace="climate") + tb_new = ds_new["ghg_concentration"].reset_index() + + # + # Process data. + # + # Select columns. + tb_new = tb_new[["date", "co2_concentration", "ch4_concentration", "n2o_concentration"]].copy() + + # Calculate average annual values. + tb_new = convert_monthly_to_annual(tb_new=tb_new) + + # Combine old and new data, prioritizing the latter. 
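# NOTE: The annual values combined below come from `convert_monthly_to_annual`, which only trusts
# years with all 12 monthly observations. A compact sketch of that count-then-mask pattern
# (column name from above, values made up):
#
#   counts = tb.groupby("year")["co2_concentration"].count()
#   means = tb.groupby("year")["co2_concentration"].mean()
#   means[counts < 12] = None  # e.g. a year with only 7 monthly readings is discarded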
+ tb = combine_two_overlapping_dataframes(df1=tb_new, df2=tb_old, index_columns=["year"]) + + # Rename table. + tb.metadata.short_name = paths.short_name + + # Add location column. + tb["location"] = "World" + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-11-18/ocean_heat_content.meta.yml b/etl/steps/data/garden/climate/2024-11-18/ocean_heat_content.meta.yml new file mode 100644 index 00000000000..c7f6fb474ea --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-18/ocean_heat_content.meta.yml @@ -0,0 +1,29 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + processing_level: minor + description_short: Measured in 10²² Joules. + unit: 10²² Joules + short_unit: 10²² J + +dataset: + title: Ocean Heat Content + update_period_days: 60 + +tables: + ocean_heat_content_monthly: + title: Ocean Heat Content - Monthly average + variables: + ocean_heat_content_700m: + title: Monthly average ocean heat content for the 0-700 meters layer + ocean_heat_content_2000m: + title: Monthly average ocean heat content for the 0-2000 meters layer + ocean_heat_content_annual: + title: Ocean Heat Content - Annual average + variables: + ocean_heat_content_700m: + title: Annual average ocean heat content for the 0-700 meters layer + ocean_heat_content_2000m: + title: Annual average ocean heat content for the 0-2000 meters layer diff --git a/etl/steps/data/garden/climate/2024-11-18/ocean_heat_content.py b/etl/steps/data/garden/climate/2024-11-18/ocean_heat_content.py new file mode 100644 index 00000000000..dcbafe0d14c --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-18/ocean_heat_content.py @@ -0,0 +1,45 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its tables. + ds_meadow = paths.load_dataset("ocean_heat_content") + tb_monthly = ds_meadow["ocean_heat_content_monthly"].reset_index() + tb_annual = ds_meadow["ocean_heat_content_annual"].reset_index() + + # + # Process data. + # + # Improve the format of the date column in monthly date (assume the middle of the month for each data point). + tb_monthly["date"] = ( + tb_monthly["date"].str.split("-").str[0] + "-" + tb_monthly["date"].str.split("-").str[1].str.zfill(2) + "-15" + ) + + # Replace date column (where all years are given as, e.g. 1955.5, 2000.5) by year column in annual data. + tb_annual["year"] = tb_annual["date"].astype(int) + tb_annual = tb_annual.drop(columns=["date"], errors="raise") + + # Instead of having a column for depth, create columns of heat content for each depth. + tb_monthly["depth"] = tb_monthly["depth"].astype(str) + "m" + tb_monthly = tb_monthly.pivot(index=["location", "date"], columns="depth", join_column_levels_with="_") + tb_annual["depth"] = tb_annual["depth"].astype(str) + "m" + tb_annual = tb_annual.pivot(index=["location", "year"], columns="depth", join_column_levels_with="_") + + # Set an appropriate index to each table and sort conveniently. 
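# NOTE: `verify_integrity=True` below makes `set_index` raise instead of silently keeping
# duplicated (location, date) rows, acting as a cheap sanity check, e.g.:
#
#   df = pd.DataFrame({"location": ["World", "World"], "date": ["2000-01-15", "2000-01-15"], "x": [1, 2]})
#   df.set_index(["location", "date"], verify_integrity=True)  # raises ValueError: duplicate keys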
+ tb_monthly = tb_monthly.set_index(["location", "date"], verify_integrity=True).sort_index() + tb_annual = tb_annual.set_index(["location", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=[tb_annual, tb_monthly], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-11-18/ocean_ph_levels.meta.yml b/etl/steps/data/garden/climate/2024-11-18/ocean_ph_levels.meta.yml new file mode 100644 index 00000000000..d9364bd3280 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-18/ocean_ph_levels.meta.yml @@ -0,0 +1,22 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + unit: pH + short_unit: pH + +dataset: + title: Ocean pH Levels + update_period_days: 60 + +tables: + ocean_ph_levels: + title: Ocean pH levels + variables: + ocean_ph: + title: Monthly measurement of ocean pH levels + processing_level: minor + ocean_ph_yearly_average: + title: Rolling yearly average of ocean pH levels + processing_level: major diff --git a/etl/steps/data/garden/climate/2024-11-18/ocean_ph_levels.py b/etl/steps/data/garden/climate/2024-11-18/ocean_ph_levels.py new file mode 100644 index 00000000000..204ec6bc0c5 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-18/ocean_ph_levels.py @@ -0,0 +1,82 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Columns to select from the data, and how to rename them. +COLUMNS = { + "date": "date", + "phcalc_insitu": "ocean_ph", +} + + +def add_rolling_average(tb: Table) -> Table: + tb_with_average = tb.copy() + + # Set date as index and sort. + tb_with_average = tb_with_average.set_index("date").sort_index() + + # Since values are given at different days of the month, reindex to have a value for each day. + tb_with_average = tb_with_average.reindex( + pd.date_range(start=tb_with_average.index.min(), end=tb_with_average.index.max(), freq="1D") + ) + + # Create a rolling average with a window of one year, linearly interpolating missing values. + tb_with_average["ocean_ph_yearly_average"] = ( + tb_with_average["ocean_ph"].interpolate(method="time").rolling(365).mean() + ) + + # Drop empty rows. + tb_with_average = ( + tb_with_average.dropna(subset=["ocean_ph"]).reset_index().rename(columns={"index": "date"}, errors="raise") + ) + + # Check that the values of the original ocean ph column have not been altered. + error = "The values of the original ocean_ph column have been altered." + assert tb_with_average["ocean_ph"].equals( + tb.dropna(subset=["ocean_ph"]).sort_values("date").reset_index(drop=True)["ocean_ph"] + ), error + + return tb_with_average + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its tables. + ds_meadow = paths.load_dataset("hawaii_ocean_time_series") + tb_meadow = ds_meadow["hawaii_ocean_time_series"].reset_index() + + # + # Process data. + # + # Select and rename columns. + tb = tb_meadow[list(COLUMNS)].rename(columns=COLUMNS, errors="raise") + + # Add location column. + tb["location"] = "Hawaii" + + # Improve format of date column. + tb["date"] = pd.to_datetime(tb["date"], format="%d-%b-%y") + + # Add a column with a rolling average. 
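# NOTE: After the daily reindex in `add_rolling_average` above, the index is evenly spaced, so
# `interpolate(method="time")` coincides with plain linear interpolation, and `rolling(365)`
# approximates a one-year window. A sketch with an assumed daily index:
#
#   idx = pd.date_range("2000-01-01", periods=730, freq="1D")
#   s = pd.Series(range(730), index=idx, dtype="float64")
#   s.interpolate(method="time").rolling(365).mean()  # first 364 values are nan, then 1-year means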
+ tb = add_rolling_average(tb=tb) + + # Set an appropriate index to each table and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # Rename table. + tb.metadata.short_name = paths.short_name + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-11-18/sea_ice_index.meta.yml b/etl/steps/data/garden/climate/2024-11-18/sea_ice_index.meta.yml new file mode 100644 index 00000000000..7facebf9240 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-18/sea_ice_index.meta.yml @@ -0,0 +1,19 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + +dataset: + title: Sea Ice Index + update_period_days: 60 + +tables: + sea_ice_index: + variables: + sea_ice_extent: + title: Sea ice extent + # description_short: TODO + unit: million square kilometers + short_unit: million km² + processing_level: minor diff --git a/etl/steps/data/garden/climate/2024-11-18/sea_ice_index.py b/etl/steps/data/garden/climate/2024-11-18/sea_ice_index.py new file mode 100644 index 00000000000..3f8247e42b5 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-18/sea_ice_index.py @@ -0,0 +1,44 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("sea_ice_index") + + # Read table from meadow dataset. + tb = ds_meadow["sea_ice_index"].reset_index() + + # + # Process data. + # + # Remove column with annual average. + tb = tb.drop(columns=["annual"]) + + # Convert table to long format. + tb = tb.melt(id_vars=["location", "year"], var_name="month", value_name="sea_ice_extent") + + # Create column of date, assuming each measurement is taken mid month. + tb["date"] = pd.to_datetime(tb["year"].astype(str) + tb["month"].str[0:3] + "15", format="%Y%b%d") + + # Drop empty rows and unnecessary columns. + tb = tb.dropna().drop(columns=["year", "month"]) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset with the combined table. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-11-18/sea_surface_temperature.meta.yml b/etl/steps/data/garden/climate/2024-11-18/sea_surface_temperature.meta.yml new file mode 100644 index 00000000000..bf9ee9d13dc --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-18/sea_surface_temperature.meta.yml @@ -0,0 +1,29 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + processing_level: minor + +dataset: + title: Sea surface temperature + update_period_days: 60 + +tables: + sea_surface_temperature: + variables: + sea_temperature_anomaly: + title: "Monthly sea surface temperature anomaly" + description_short: Measured in degrees Celsius. + unit: °C + short_unit: °C + sea_temperature_anomaly_low: + title: "Monthly sea surface temperature anomaly (lower bound)" + description_short: Measured in degrees Celsius. 
+ unit: °C + short_unit: °C + sea_temperature_anomaly_high: + title: "Monthly sea surface temperature anomaly (upper bound)" + description_short: Measured in degrees Celsius. + unit: °C + short_unit: °C diff --git a/etl/steps/data/garden/climate/2024-11-18/sea_surface_temperature.py b/etl/steps/data/garden/climate/2024-11-18/sea_surface_temperature.py new file mode 100644 index 00000000000..2c2fb56098e --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-18/sea_surface_temperature.py @@ -0,0 +1,48 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +# Columns to select from data, and how to rename them. +COLUMNS = { + "year": "year", + "month": "month", + "location": "location", + "anomaly": "sea_temperature_anomaly", + "lower_bound_95pct_bias_uncertainty_range": "sea_temperature_anomaly_low", + "upper_bound_95pct_bias_uncertainty_range": "sea_temperature_anomaly_high", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("sea_surface_temperature") + tb = ds_meadow["sea_surface_temperature"].reset_index() + + # + # Process data. + # + # Select and rename columns. + tb = tb[list(COLUMNS)].rename(columns=COLUMNS, errors="raise") + + # Create a date column (assume the middle of the month for each monthly data point). + tb["date"] = tb["year"].astype(str) + "-" + tb["month"].astype(str).str.zfill(2) + "-15" + + # Remove unnecessary columns. + tb = tb.drop(columns=["year", "month"], errors="raise") + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset with the combined table. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-11-18/snow_cover_extent.meta.yml b/etl/steps/data/garden/climate/2024-11-18/snow_cover_extent.meta.yml new file mode 100644 index 00000000000..698ad73c63f --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-18/snow_cover_extent.meta.yml @@ -0,0 +1,23 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + unit: "square kilometers" + short_unit: "km²" + description_short: Measured in square kilometers. + +dataset: + title: Snow Cover Extent + update_period_days: 60 + +tables: + snow_cover_extent: + title: Snow Cover Extent + variables: + snow_cover_extent: + title: Monthly measurement of the area covered by snow + processing_level: minor + snow_cover_extent_yearly_average: + title: Rolling yearly average of the area covered by snow + processing_level: major diff --git a/etl/steps/data/garden/climate/2024-11-18/snow_cover_extent.py b/etl/steps/data/garden/climate/2024-11-18/snow_cover_extent.py new file mode 100644 index 00000000000..618e62cce08 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-18/snow_cover_extent.py @@ -0,0 +1,93 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Columns to select from the data, and how to rename them. 
+COLUMNS = { + "date": "date", + "phcalc_insitu": "ocean_ph", +} + + +def add_rolling_average(tb: Table, original_column_name: str) -> Table: + tb_with_average = tb.copy() + + # Create a date range. + date_range = pd.date_range(start=tb_with_average["date"].min(), end=tb_with_average["date"].max(), freq="1D") + + # Get unique locations. + unique_locations = tb_with_average["location"].unique() + + # Set date as index and sort. + tb_with_average = tb_with_average.set_index(["location", "date"]).sort_index() + + # Create a MultiIndex with all possible combinations of date and location. + multi_index = pd.MultiIndex.from_product([unique_locations, date_range], names=["location", "date"]) + + # Reindex using the MultiIndex. + tb_with_average = tb_with_average.reindex(multi_index) + + # Create a rolling average with a window of one year, linearly interpolating missing values. + tb_with_average[f"{original_column_name}_yearly_average"] = ( + tb_with_average[original_column_name].interpolate(method="linear").rolling(365).mean() + ) + + # Drop empty rows. + tb_with_average = tb_with_average.dropna(subset=[original_column_name]).reset_index() + + # Remove rolling average for the first year, given that it is based on incomplete data. + tb_with_average.loc[ + tb_with_average["date"] < tb_with_average["date"].min() + pd.Timedelta(days=365), + f"{original_column_name}_yearly_average", + ] = None + + # Sort conveniently. + tb_with_average = tb_with_average.sort_values(["location", "date"]).reset_index(drop=True) + + # Check that the values of the original column have not been altered. + error = f"The values of the original {original_column_name} column have been altered." + assert tb_with_average[original_column_name].astype(int).equals(tb[original_column_name].astype(int)), error + + return tb_with_average + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its tables. + ds_meadow = paths.load_dataset("snow_cover_extent") + tb = ds_meadow["snow_cover_extent"].reset_index() + + # + # Process data. + # + # Create a date column. + # NOTE: Assign the middle of the month. + tb["date"] = pd.to_datetime(tb[["year", "month"]].assign(day=15)) + tb = tb.drop(columns=["year", "month"], errors="raise") + + # Data starts in 1966, but, as mentioned on their website + # https://climate.rutgers.edu/snowcover/table_area.php?ui_set=1&ui_sort=0 + # there is missing data between 1968 and 1971. + # So, for simplicity, select data from 1972 onwards, where data is complete. + tb = tb[tb["date"] >= "1972-01-01"].reset_index(drop=True) + + # Add a column with a rolling average. + tb = add_rolling_average(tb=tb, original_column_name="snow_cover_extent") + + # Set an appropriate index to each table and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset. 
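# NOTE: On the first-year masking in `add_rolling_average` above: with `rolling(365)` the default
# min_periods equals the window size, so the first 364 daily positions are nan anyway; the
# explicit date-based mask documents the intent and guards against future changes, roughly:
#
#   first_complete = tb["date"].min() + pd.Timedelta(days=365)
#   tb.loc[tb["date"] < first_complete, "snow_cover_extent_yearly_average"] = None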
+ ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-11-18/surface_temperature_analysis.meta.yml b/etl/steps/data/garden/climate/2024-11-18/surface_temperature_analysis.meta.yml new file mode 100644 index 00000000000..eda07f5ae5a --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-18/surface_temperature_analysis.meta.yml @@ -0,0 +1,20 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + +dataset: + title: GISS surface temperature analysis + update_period_days: 60 + +tables: + surface_temperature_analysis: + variables: + temperature_anomaly: + title: "Global warming: monthly temperature anomaly" + description_short: |- + Combined land-surface air and sea-surface water temperature anomaly, given as the deviation from the 1951-1980 mean, in degrees Celsius. + unit: °C + short_unit: °C + processing_level: minor diff --git a/etl/steps/data/garden/climate/2024-11-18/surface_temperature_analysis.py b/etl/steps/data/garden/climate/2024-11-18/surface_temperature_analysis.py new file mode 100644 index 00000000000..43d328abbde --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-18/surface_temperature_analysis.py @@ -0,0 +1,56 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import owid.catalog.processing as pr +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset (its tables are read one by one in the loop below). + ds_meadow = paths.load_dataset("surface_temperature_analysis") + + # + # Process data. + # + # Initialize dictionary to store processed tables. + tables = {} + for table_name in ds_meadow.table_names: + # Read table. + tb = ds_meadow[table_name].reset_index() + # Get location from table name. + location = table_name.split("surface_temperature_analysis_")[-1].replace("_", " ").title() + # Add column for location. + tb["location"] = location + # Convert table to long format. + tb = tb.melt(id_vars=["year", "location"], var_name="month", value_name="temperature_anomaly") + # Create column of date, assuming each measurement is taken mid month. + tb["date"] = pd.to_datetime(tb["year"].astype(str) + tb["month"] + "15", format="%Y%b%d") + # Copy metadata from another existing column. + tb["date"] = tb["date"].copy_metadata(tb["location"]) + # Select necessary columns. + tb = tb[["location", "date", "temperature_anomaly"]] + # Remove rows with missing values. + tb = tb.dropna(subset=["temperature_anomaly"]).reset_index(drop=True) + # Update table. + tables[location] = tb + + # Concatenate all tables. + tb = pr.concat(list(tables.values()), ignore_index=True, short_name=paths.short_name) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset with the combined table.
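# NOTE: `pr.concat` (owid.catalog.processing) mirrors `pd.concat` while propagating table and
# variable metadata, and `short_name=paths.short_name` names the combined table after this step.
# A plain-pandas equivalent, which would lose the metadata, is roughly:
#
#   tb = pd.concat(list(tables.values()), ignore_index=True)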
+ ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-11-19/total_precipitation.countries.json b/etl/steps/data/garden/climate/2024-11-19/total_precipitation.countries.json new file mode 100644 index 00000000000..9553cbf9871 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-19/total_precipitation.countries.json @@ -0,0 +1,197 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas, The": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia, The": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran, Islamic Republic of": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Korea, Republic of": "South Korea", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Kyrgyz Republic": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Mali": "Mali", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Moldova": "Moldova", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": 
"Pakistan", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Slovak Republic": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Vietnam": "Vietnam", + "West Bank and Gaza": "Palestine", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "eSwatini": "Eswatini", + "American Samoa (US)": "American Samoa", + "Anguilla (UK)": "Anguilla", + "Cayman Islands (UK)": "Cayman Islands", + "Congo, Democratic Republic of": "Democratic Republic of Congo", + "Congo, Rep. of": "Congo", + "Cook Islands (NZ)": "Cook Islands", + "Egypt, Arab Republic of": "Egypt", + "Falkland Islands (UK)/Islas Malvinas": "Falkland Islands", + "Faroe Islands (Den.)": "Faroe Islands", + "French Polynesia (Fr.)": "French Polynesia", + "Greenland (Den.)": "Greenland", + "Heard Island and McDonald Islands (Aus.)": "Heard Island and McDonald Islands", + "Hong Kong (SAR, China)": "Hong Kong", + "Isle of Man (UK)": "Isle of Man", + "Korea, Democratic People's Republic of": "North Korea", + "New Caledonia (Fr.)": "New Caledonia", + "Puerto Rico (US)": "Puerto Rico", + "Saint Helena, Ascension and Tristan da Cunha (UK)": "Saint Helena, Ascension and Tristan da Cunha", + "South Georgia and South Sandwich Islands (UK)": "South Georgia and the South Sandwich Islands", + "S\u00e3o Tom\u00e9 and Pr\u00edncipe": "Sao Tome and Principe", + "U.S. 
Virgin Islands (US)": "United States Virgin Islands", + "Venezuela, Republica Bolivariana de": "Venezuela", + "Yemen, Republic of": "Yemen", + "World": "World" +} \ No newline at end of file diff --git a/etl/steps/data/garden/climate/2024-11-19/total_precipitation.excluded_countries.json b/etl/steps/data/garden/climate/2024-11-19/total_precipitation.excluded_countries.json new file mode 100644 index 00000000000..04d505bdaa1 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-19/total_precipitation.excluded_countries.json @@ -0,0 +1,3 @@ +[ +"French Southern and Antarctic Lands (Fr.)" +] diff --git a/etl/steps/data/garden/climate/2024-11-19/total_precipitation.meta.yml b/etl/steps/data/garden/climate/2024-11-19/total_precipitation.meta.yml new file mode 100644 index 00000000000..e8e72ae1f13 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-19/total_precipitation.meta.yml @@ -0,0 +1,44 @@ + +definitions: + common: + unit: millimeters + short_unit: mm + presentation: + topic_tags: + - Climate Change + display: + numDecimalPlaces: 0 + description_from_producer: + This parameter is the accumulated liquid and frozen water, comprising rain and snow, that falls to the Earth's surface. It is the sum of large-scale precipitation and convective precipitation. Large-scale precipitation is generated by the cloud scheme in the ECMWF Integrated Forecasting System (IFS). The cloud scheme represents the formation and dissipation of clouds and large-scale precipitation due to changes in atmospheric quantities (such as pressure, temperature and moisture) predicted directly by the IFS at spatial scales of the grid box or larger. Convective precipitation is generated by the convection scheme in the IFS, which represents convection at spatial scales smaller than the grid box. This parameter does not include fog, dew or the precipitation that evaporates in the atmosphere before it lands at the surface of the Earth. This parameter is accumulated over a particular time period which depends on the data extracted. For the monthly averaged reanalysis and the monthly averaged ensemble members, the accumulation period is 1 day. For the monthly averaged reanalysis by hour of day, the accumulation period is 1 hour and for the monthly averaged ensemble members by hour of day, the accumulation period is 3 hours. The units of this parameter are depth in metres of water equivalent. It is the depth the water would have if it were spread evenly over the grid box. Care should be taken when comparing model parameters with observations, because observations are often local to a particular point in space and time, rather than representing averages over a model grid box. + processing_level: major + common_processing: |- + - Initially, the dataset is provided with specific coordinates in terms of longitude and latitude. To tailor this data to each country, we use geographical boundaries as defined by the World Bank. The method involves trimming the precipitation dataset to match the exact geographical shape of each country. To correct for potential distortions caused by projecting the Earth's curved surface onto a flat map, we apply a latitude-based weighting. This step is essential for maintaining accuracy, particularly in high-latitude regions where distortion is more pronounced. The result of this process is a latitude-weighted average precipitation for each nation. 
+ - It’s important to note, however, that due to the resolution constraints of the Copernicus dataset, this methodology might not be as effective for countries with very small landmasses. In such cases, the process may not yield reliable data. + - The derived precipitation for each country is calculated based on administrative borders, encompassing all land surface types within these areas. As a result, precipitation over oceans and seas is not included in these averages, keeping the data focused on terrestrial environments. + - Global precipitation averages and anomalies, however, are calculated over both land and ocean surfaces. + precipitation_anomaly: |- + - The precipitation anomaly is calculated by comparing the average surface precipitation of a specific time period (e.g., a particular year or month) to the mean surface precipitation of the same period from 1991 to 2020. + - When calculating anomalies for each country, the average surface precipitation of a given year or month is compared to the 1991-2020 mean precipitation for that specific country. + - The reason for using the 1991-2020 period as the reference mean is that it is the standard reference period used by our data source, the Copernicus Climate Change Service. This period is also adopted by the UK Met Office. This approach ensures consistency in identifying climate variations over time. + +dataset: + title: Monthly precipitation by country + update_period_days: 180 +tables: + total_precipitation: + variables: + total_precipitation: + title: Monthly precipitation + description_short: Total monthly precipitation—rain and snow—calculated as the sum of daily averages, reported as the depth of water falling to Earth's surface, excluding fog and dew. + description_processing: |- + {definitions.common_processing} + {definitions.precipitation_anomaly} + + + precipitation_anomaly: + title: Monthly precipitation anomaly + description_short: The difference in a specific month's total precipitation—rain and snow—from the 1991–2020 average, measured in millimeters, excluding fog and dew. + description_processing: |- + {definitions.common_processing} + {definitions.precipitation_anomaly} + diff --git a/etl/steps/data/garden/climate/2024-11-19/total_precipitation.py b/etl/steps/data/garden/climate/2024-11-19/total_precipitation.py new file mode 100644 index 00000000000..e1d69ed4c7e --- /dev/null +++ b/etl/steps/data/garden/climate/2024-11-19/total_precipitation.py @@ -0,0 +1,60 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import calendar + +import owid.catalog.processing as pr + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("total_precipitation") + tb = ds_meadow["total_precipitation"].reset_index() + + # + # Process data. + # + # Harmonize country names. 
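# NOTE: `geo.harmonize_countries` below maps the producer's country names to OWID's canonical
# names using the .countries.json mapping shown above (e.g. "Russian Federation" -> "Russia",
# "West Bank and Gaza" -> "Palestine") and drops entries listed in .excluded_countries.json.
# Conceptually, the renaming step amounts to:
#
#   mapping = {"Russian Federation": "Russia", "West Bank and Gaza": "Palestine"}  # from the JSON
#   tb["country"] = tb["country"].map(mapping).fillna(tb["country"])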
+ tb = geo.harmonize_countries( + df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path + ) + # Extract year and month as integers. + tb["year"] = tb["time"].astype(str).str[0:4].astype(int) + tb["month"] = tb["time"].astype(str).str[5:7].astype(int) + + # Get the number of days in the given month and year. + tb["days_in_month"] = tb.apply(lambda row: calendar.monthrange(row["year"], row["month"])[1], axis=1) + tb["days_in_month"] = tb["days_in_month"].copy_metadata(tb["total_precipitation"]) + + # Convert the daily averages into monthly totals using the number of days in each month, as per https://confluence.ecmwf.int/pages/viewpage.action?pageId=197702790. The data is in meters, so also convert to millimeters. + tb["total_precipitation"] = tb["total_precipitation"] * 1000 * tb["days_in_month"] + + # Use the 1991-2020 baseline period adopted by the Copernicus Climate Change Service https://climate.copernicus.eu/surface-air-temperature-january-2024 + tb_baseline = tb[(tb["year"].astype(int) > 1990) & (tb["year"].astype(int) < 2021)] + tb_baseline = tb_baseline.groupby(["country", "month"], as_index=False)["total_precipitation"].mean() + tb_baseline = tb_baseline.rename(columns={"total_precipitation": "mean_total_precipitation"}) + + # Merge the baseline means (column 'mean_total_precipitation') into the main table. + tb = pr.merge(tb, tb_baseline, on=["country", "month"]) + + # Calculate the anomalies (below and above the mean). + tb["precipitation_anomaly"] = tb["total_precipitation"] - tb["mean_total_precipitation"] + + tb = tb.drop(columns=["month", "year", "mean_total_precipitation"]) + tb = tb.format(["country", "time"]) + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/latest/weekly_wildfires.meta.yml b/etl/steps/data/garden/climate/latest/weekly_wildfires.meta.yml index 11904670706..7aea860863b 100644 --- a/etl/steps/data/garden/climate/latest/weekly_wildfires.meta.yml +++ b/etl/steps/data/garden/climate/latest/weekly_wildfires.meta.yml @@ -102,27 +102,3 @@ tables: description_key: - CO₂ emissions contribute to the greenhouse effect, influencing global warming and climate change. - area_ha_per_wildfire: - title: Area burnt per wildfire - unit: hectares - short_unit: ha - description_short: The average area burnt per [wildfire](#dod:wildfires), in hectares. {definitions.desc_update} - description_processing: The area burnt per wildfire is calculated by dividing the area burnt by wildfires by the number of fires. - description_key: *desc_wildfires - - - co2_ha_per_area: - title: Carbon dioxide emissions per hectare burnt - unit: tonnes - short_unit: t - description_short: Carbon dioxide emissions per hectare burnt by [wildfires](#dod:wildfires), in tonnes. {definitions.desc_update} - description_processing: The carbon dioxide emissions per hectare is calculated by dividing the carbon dioxide emissions by the area burnt by wildfires. - description_key: *desc_wildfires - - pm2_5_ha_per_area: - title: PM2.5 emissions per hectare burnt - unit: tonnes - short_unit: t - description_short: PM2.5 emissions per hectare burnt by [wildfires](#dod:wildfires), in tonnes. {definitions.desc_update} - description_processing: The PM2.5 emissions per hectare is calculated by dividing the PM2.5 emissions by the area burnt by wildfires.
- description_key: *desc_wildfires diff --git a/etl/steps/data/garden/climate/latest/weekly_wildfires.py b/etl/steps/data/garden/climate/latest/weekly_wildfires.py index dd6be9460e6..cb0f80f5c68 100644 --- a/etl/steps/data/garden/climate/latest/weekly_wildfires.py +++ b/etl/steps/data/garden/climate/latest/weekly_wildfires.py @@ -1,7 +1,6 @@ """Load a meadow dataset and create a garden dataset.""" -import numpy as np import owid.catalog.processing as pr import pandas as pd @@ -87,15 +86,6 @@ def run(dest_dir: str) -> None: tb["share_area_ha"] = (tb["area_ha"] / tb["total_area_ha"]) * 100 tb["share_area_ha_cumulative"] = (tb["area_ha_cumulative"] / tb["total_area_ha"]) * 100 - # Area per wildfire - tb["area_ha_per_wildfire"] = tb["area_ha"] / tb["events"] - tb["co2_ha_per_area"] = tb["CO2"] / tb["area_ha"] - tb["pm2_5_ha_per_area"] = tb["PM2.5"] / tb["area_ha"] - - tb[["co2_ha_per_area", "pm2_5_ha_per_area"]] = tb[["co2_ha_per_area", "pm2_5_ha_per_area"]].replace( - [float("inf"), -float("inf")], np.nan - ) - tb = tb.drop(columns=["total_area_ha"]) tb = tb.set_index(["country", "date"], verify_integrity=True) diff --git a/etl/steps/data/garden/climate_watch/2024-11-21/emissions_by_sector.countries.json b/etl/steps/data/garden/climate_watch/2024-11-21/emissions_by_sector.countries.json new file mode 100644 index 00000000000..55a1a323f57 --- /dev/null +++ b/etl/steps/data/garden/climate_watch/2024-11-21/emissions_by_sector.countries.json @@ -0,0 +1,197 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "European Union (27)": "European Union (27)", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": 
"Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Laos": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Macedonia": "North Macedonia", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia": "Micronesia (country)", + "Moldova": "Moldova", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "North Korea": "North Korea", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Republic of Congo": "Congo", + "Romania": "Romania", + "Russia": "Russia", + "Rwanda": "Rwanda", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Korea": "South Korea", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syria": "Syria", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Türkiye": "Turkey", + "Turkmenistan": "Turkmenistan", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United States": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela": "Venezuela", + "Vietnam": "Vietnam", + "World": "World", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe" +} diff --git a/etl/steps/data/garden/climate_watch/2024-11-21/emissions_by_sector.meta.yml b/etl/steps/data/garden/climate_watch/2024-11-21/emissions_by_sector.meta.yml new file mode 100644 index 00000000000..ad883625fe3 --- /dev/null +++ b/etl/steps/data/garden/climate_watch/2024-11-21/emissions_by_sector.meta.yml @@ -0,0 +1,525 @@ 
+definitions: + common: + # Short description for all indicators except per capita indicators (which are measured in tonnes per person), and + # indicators on CO2 emissions (for which we do not need to mention CO2 equivalents). + description_short: Emissions are measured in million tonnes of [carbon dioxide-equivalents](#dod:carbondioxideequivalents). + # Units of all indicators except per capita indicators. + unit: million tonnes + short_unit: Mt + presentation: + topic_tags: + - CO2 & Greenhouse Gas Emissions + processing_level: major + # Short description for per capita indicators, except on CO2 emissions. + description_short_per_capita: &description-short-per-capita Emissions are measured in tonnes of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) per person. + # Short description for indicators of CO2 emissions, except per capita indicators. + description_short_co2: &description-short-co2 Emissions are measured in million tonnes. + # Short description for per capita indicators of CO2 emissions. + description_short_co2_per_capita: &description-short-co2-per-capita Emissions are measured in tonnes per person. + # Unit and short unit for per capita indicators. + unit_per_capita: &unit-per-capita tonnes per person + short_unit_per_capita: &short-unit-per-capita t/person + # For convenience, in charts, show a simplified unit. + display_per_capita: &display-per-capita + unit: tonnes + shortUnit: t + # To prevent the previous common definitions from affecting population, explicitly define population's metadata. + metadata_population: &metadata-population + title: Population + description_short: Population by country and year. + unit: people + short_unit: "" + +dataset: + title: Greenhouse gas emissions by sector + update_period_days: 365 + +tables: + greenhouse_gas_emissions_by_sector: + variables: + agriculture_ghg_emissions: + title: Greenhouse gas emissions from agriculture + agriculture_ghg_emissions_per_capita: + title: Per capita greenhouse gas emissions from agriculture + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + aviation_and_shipping_ghg_emissions: + title: Greenhouse gas emissions from bunker fuels + aviation_and_shipping_ghg_emissions_per_capita: + title: Per capita greenhouse gas emissions from bunker fuels + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + buildings_ghg_emissions: + title: Greenhouse gas emissions from buildings + buildings_ghg_emissions_per_capita: + title: Per capita greenhouse gas emissions from buildings + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + electricity_and_heat_ghg_emissions: + title: Greenhouse gas emissions from electricity and heat + electricity_and_heat_ghg_emissions_per_capita: + title: Per capita greenhouse gas emissions from electricity and heat + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + energy_ghg_emissions: + title: Greenhouse gas emissions from energy + energy_ghg_emissions_per_capita: + title: Per capita greenhouse gas emissions from energy + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + fugitive_ghg_emissions: + title: Fugitive emissions of greenhouse
gases from energy production + fugitive_ghg_emissions_per_capita: + title: Per capita fugitive emissions of greenhouse gases from energy production + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + industry_ghg_emissions: + title: Greenhouse gas emissions from industry + industry_ghg_emissions_per_capita: + title: Per capita greenhouse gas emissions from industry + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + land_use_change_and_forestry_ghg_emissions: + title: Greenhouse gas emissions from land use change and forestry + land_use_change_and_forestry_ghg_emissions_per_capita: + title: Per capita greenhouse gas emissions from land use change and forestry + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + manufacturing_and_construction_ghg_emissions: + title: Greenhouse gas emissions from manufacturing and construction + manufacturing_and_construction_ghg_emissions_per_capita: + title: Per capita greenhouse gas emissions from manufacturing and construction + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + other_fuel_combustion_ghg_emissions: + title: Greenhouse gas emissions from other fuel combustion + other_fuel_combustion_ghg_emissions_per_capita: + title: Per capita greenhouse gas emissions from other fuel combustion + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + total_ghg_emissions_excluding_lucf: + title: Total greenhouse gas emissions excluding land-use change and forestry + total_ghg_emissions_excluding_lucf_per_capita: + title: Total greenhouse gas emissions per capita excluding land-use change and forestry + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + total_ghg_emissions_including_lucf: + title: Total greenhouse gas emissions including land-use change and forestry + total_ghg_emissions_including_lucf_per_capita: + title: Total greenhouse gas emissions per capita including land-use change and forestry + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + transport_ghg_emissions: + title: Greenhouse gas emissions from transport + transport_ghg_emissions_per_capita: + title: Per capita greenhouse gas emissions from transport + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + waste_ghg_emissions: + title: Greenhouse gas emissions from waste + waste_ghg_emissions_per_capita: + title: Per capita greenhouse gas emissions from waste + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + population: *metadata-population + carbon_dioxide_emissions_by_sector: + variables: + aviation_and_shipping_co2_emissions: + title: Carbon dioxide emissions from bunker fuels + description_short: *description-short-co2 + aviation_and_shipping_co2_emissions_per_capita: + title: Per capita carbon dioxide emissions from bunker fuels + description_short: 
*description-short-co2-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + buildings_co2_emissions: + title: Carbon dioxide emissions from buildings + description_short: *description-short-co2 + buildings_co2_emissions_per_capita: + title: Per capita carbon dioxide emissions from buildings + description_short: *description-short-co2-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + electricity_and_heat_co2_emissions: + title: Carbon dioxide emissions from electricity and heat + description_short: *description-short-co2 + electricity_and_heat_co2_emissions_per_capita: + title: Per capita carbon dioxide emissions from electricity and heat + description_short: *description-short-co2-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + energy_co2_emissions: + title: Carbon dioxide emissions from energy + description_short: *description-short-co2 + energy_co2_emissions_per_capita: + title: Per capita carbon dioxide emissions from energy + description_short: *description-short-co2-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + fugitive_co2_emissions: + title: Fugitive emissions of carbon dioxide from energy production + description_short: *description-short-co2 + fugitive_co2_emissions_per_capita: + title: Per capita fugitive emissions of carbon dioxide from energy production + description_short: *description-short-co2-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + industry_co2_emissions: + title: Carbon dioxide emissions from industry + description_short: *description-short-co2 + industry_co2_emissions_per_capita: + title: Per capita carbon dioxide emissions from industry + description_short: *description-short-co2-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + land_use_change_and_forestry_co2_emissions: + title: Carbon dioxide emissions from land use change and forestry + description_short: *description-short-co2 + land_use_change_and_forestry_co2_emissions_per_capita: + title: Per capita carbon dioxide emissions from land use change and forestry + description_short: *description-short-co2-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + manufacturing_and_construction_co2_emissions: + title: Carbon dioxide emissions from manufacturing and construction + description_short: *description-short-co2 + manufacturing_and_construction_co2_emissions_per_capita: + title: Per capita carbon dioxide emissions from manufacturing and construction + description_short: *description-short-co2-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + other_fuel_combustion_co2_emissions: + title: Carbon dioxide emissions from other fuel combustion + description_short: *description-short-co2 + other_fuel_combustion_co2_emissions_per_capita: + title: Per capita carbon dioxide emissions from other fuel combustion + description_short: *description-short-co2-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + total_co2_emissions_excluding_lucf: + title: Total carbon dioxide emissions excluding land-use change and forestry + description_short: *description-short-co2 + total_co2_emissions_excluding_lucf_per_capita: + title: Total 
carbon dioxide emissions per capita excluding land-use change and forestry + description_short: *description-short-co2-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + total_co2_emissions_including_lucf: + title: Total carbon dioxide emissions including land-use change and forestry + description_short: *description-short-co2 + total_co2_emissions_including_lucf_per_capita: + title: Total carbon dioxide emissions per capita including land-use change and forestry + description_short: *description-short-co2-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + transport_co2_emissions: + title: Carbon dioxide emissions from transport + description_short: *description-short-co2 + transport_co2_emissions_per_capita: + title: Per capita carbon dioxide emissions from transport + description_short: *description-short-co2-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + population: *metadata-population + methane_emissions_by_sector: + variables: + agriculture_ch4_emissions: + title: Methane emissions from agriculture + agriculture_ch4_emissions_per_capita: + title: Per capita methane emissions from agriculture + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + aviation_and_shipping_ch4_emissions: + title: Methane emissions from bunker fuels + aviation_and_shipping_ch4_emissions_per_capita: + title: Per capita methane emissions from bunker fuels + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + buildings_ch4_emissions: + title: Methane emissions from buildings + buildings_ch4_emissions_per_capita: + title: Per capita methane emissions from buildings + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + electricity_and_heat_ch4_emissions: + title: Methane emissions from electricity and heat + electricity_and_heat_ch4_emissions_per_capita: + title: Per capita methane emissions from electricity and heat + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + energy_ch4_emissions: + title: Methane emissions from energy + energy_ch4_emissions_per_capita: + title: Per capita methane emissions from energy + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + fugitive_ch4_emissions: + title: Fugitive emissions of methane from energy production + fugitive_ch4_emissions_per_capita: + title: Per capita fugitive emissions of methane from energy production + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + industry_ch4_emissions: + title: Methane emissions from industry + industry_ch4_emissions_per_capita: + title: Per capita methane emissions from industry + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + land_use_change_and_forestry_ch4_emissions: + title: Methane emissions from land use change and forestry + land_use_change_and_forestry_ch4_emissions_per_capita: + title: Per capita methane emissions from 
land use change and forestry + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + manufacturing_and_construction_ch4_emissions: + title: Methane emissions from manufacturing and construction + manufacturing_and_construction_ch4_emissions_per_capita: + title: Per capita methane emissions from manufacturing and construction + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + other_fuel_combustion_ch4_emissions: + title: Methane emissions from other fuel combustion + other_fuel_combustion_ch4_emissions_per_capita: + title: Per capita methane emissions from other fuel combustion + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + total_ch4_emissions_excluding_lucf: + title: Total methane emissions excluding land-use change and forestry + total_ch4_emissions_excluding_lucf_per_capita: + title: Total methane emissions per capita excluding land-use change and forestry + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + total_ch4_emissions_including_lucf: + title: Total methane emissions including land-use change and forestry + total_ch4_emissions_including_lucf_per_capita: + title: Total methane emissions per capita including land-use change and forestry + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + transport_ch4_emissions: + title: Methane emissions from transport + transport_ch4_emissions_per_capita: + title: Per capita methane emissions from transport + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + waste_ch4_emissions: + title: Methane emissions from waste + waste_ch4_emissions_per_capita: + title: Per capita methane emissions from waste + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + population: *metadata-population + nitrous_oxide_emissions_by_sector: + variables: + agriculture_n2o_emissions: + title: Nitrous oxide emissions from agriculture + agriculture_n2o_emissions_per_capita: + title: Per capita nitrous oxide emissions from agriculture + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: + unit: tonnes + shortUnit: t + name: Agriculture + presentation: + title_public: Per capita nitrous oxide emissions from agriculture + aviation_and_shipping_n2o_emissions: + title: Nitrous oxide emissions from bunker fuels + aviation_and_shipping_n2o_emissions_per_capita: + title: Per capita nitrous oxide emissions from bunker fuels + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + buildings_n2o_emissions: + title: Nitrous oxide emissions from buildings + buildings_n2o_emissions_per_capita: + title: Per capita nitrous oxide emissions from buildings + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + electricity_and_heat_n2o_emissions: + title: Nitrous oxide emissions from electricity and 
heat + electricity_and_heat_n2o_emissions_per_capita: + title: Per capita nitrous oxide emissions from electricity and heat + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + energy_n2o_emissions: + title: Nitrous oxide emissions from energy + energy_n2o_emissions_per_capita: + title: Per capita nitrous oxide emissions from energy + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + fugitive_n2o_emissions: + title: Fugitive emissions of nitrous oxide from energy production + fugitive_n2o_emissions_per_capita: + title: Per capita fugitive emissions of nitrous oxide from energy production + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + industry_n2o_emissions: + title: Nitrous oxide emissions from industry + industry_n2o_emissions_per_capita: + title: Per capita nitrous oxide emissions from industry + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + land_use_change_and_forestry_n2o_emissions: + title: Nitrous oxide emissions from land use change and forestry + land_use_change_and_forestry_n2o_emissions_per_capita: + title: Per capita nitrous oxide emissions from land use change and forestry + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + manufacturing_and_construction_n2o_emissions: + title: Nitrous oxide emissions from manufacturing and construction + manufacturing_and_construction_n2o_emissions_per_capita: + title: Per capita nitrous oxide emissions from manufacturing and construction + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + other_fuel_combustion_n2o_emissions: + title: Nitrous oxide emissions from other fuel combustion + other_fuel_combustion_n2o_emissions_per_capita: + title: Per capita nitrous oxide emissions from other fuel combustion + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + total_n2o_emissions_excluding_lucf: + title: Total nitrous oxide emissions excluding land-use change and forestry + total_n2o_emissions_excluding_lucf_per_capita: + title: Total nitrous oxide emissions per capita excluding land-use change and forestry + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + total_n2o_emissions_including_lucf: + title: Total nitrous oxide emissions including land-use change and forestry + total_n2o_emissions_including_lucf_per_capita: + title: Total nitrous oxide emissions per capita including land-use change and forestry + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + transport_n2o_emissions: + title: Nitrous oxide emissions from transport + transport_n2o_emissions_per_capita: + title: Per capita nitrous oxide emissions from transport + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + waste_n2o_emissions: + title: Nitrous oxide 
emissions from waste + waste_n2o_emissions_per_capita: + title: Per capita nitrous oxide emissions from waste + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + population: *metadata-population + fluorinated_gas_emissions_by_sector: + variables: + industry_fgas_emissions: + title: Fluorinated gas emissions from industry + industry_fgas_emissions_per_capita: + title: Per capita fluorinated gas emissions from industry + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + total_fgas_emissions_excluding_lucf: + title: Total fluorinated gas emissions excluding land-use change and forestry + total_fgas_emissions_excluding_lucf_per_capita: + title: Total fluorinated gas emissions per capita excluding land-use change and forestry + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + total_fgas_emissions_including_lucf: + title: Total fluorinated gas emissions including land-use change and forestry + total_fgas_emissions_including_lucf_per_capita: + title: Total fluorinated gas emissions per capita including land-use change and forestry + description_short: *description-short-per-capita + unit: *unit-per-capita + short_unit: *short-unit-per-capita + display: *display-per-capita + population: *metadata-population diff --git a/etl/steps/data/garden/climate_watch/2024-11-21/emissions_by_sector.py b/etl/steps/data/garden/climate_watch/2024-11-21/emissions_by_sector.py new file mode 100644 index 00000000000..62b1a0c8ac0 --- /dev/null +++ b/etl/steps/data/garden/climate_watch/2024-11-21/emissions_by_sector.py @@ -0,0 +1,199 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from owid.catalog import Dataset, Table, utils +from owid.datautils.dataframes import map_series + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# All sectors expected in the data, and how to rename them. +SECTORS = { + "Agriculture": "Agriculture emissions", + "Building": "Buildings emissions", + "Bunker Fuels": "Aviation and shipping emissions", + "Electricity/Heat": "Electricity and heat emissions", + "Energy": "Energy emissions", + "Fugitive Emissions": "Fugitive emissions", + "Industrial Processes": "Industry emissions", + "Land-Use Change and Forestry": "Land-use change and forestry emissions", + "Manufacturing/Construction": "Manufacturing and construction emissions", + "Other Fuel Combustion": "Other fuel combustion emissions", + "Total excluding LUCF": "Total emissions excluding LUCF", + "Total including LUCF": "Total emissions including LUCF", + "Transportation": "Transport emissions", + "Waste": "Waste emissions", +} + +# Suffix to add to the name of per capita variables. +PER_CAPITA_SUFFIX = "_per_capita" + +# Mapping of gas name (as given in Climate Watch data) to the name of the corresponding output table. +TABLE_NAMES = { + "All GHG": "Greenhouse gas emissions by sector", + "CH4": "Methane emissions by sector", + "CO2": "Carbon dioxide emissions by sector", + "F-Gas": "Fluorinated gas emissions by sector", + "N2O": "Nitrous oxide emissions by sector", +} + +# Aggregate regions to add, following OWID definitions. +REGIONS = { + # Continents. 
+ "Africa": {}, + "Asia": {}, + "Europe": {}, + # The EU27 is already included in the original data, and after inspection the data coincides with our aggregate. + # So we simply keep the original data for EU27 given in the data. + "North America": {}, + "Oceania": {}, + "South America": {}, + # Income groups. + "Low-income countries": {}, + "Upper-middle-income countries": {}, + "Lower-middle-income countries": {}, + "High-income countries": {}, +} + +# Convert million tonnes to tonnes. +MT_TO_T = 1e6 + + +def create_table_for_gas( + tb: Table, gas: str, ds_regions: Dataset, ds_population: Dataset, ds_income_groups: Dataset +) -> Table: + """Extract data for a particular gas and create a table with variables' metadata. + + Parameters + ---------- + tb : Table + gas : str + Name of gas to consider (as called in "gas" column of the original data). + ds_regions : Dataset + Regions dataset. + ds_population : Dataset + Population dataset. + ds_income_groups : Dataset + Income groups dataset. + + Returns + ------- + table_gas : Table + Table with data for considered gas, and metadata for each variable. + + """ + # Select data for current gas. + tb_gas = tb[tb["gas"] == gas].drop(columns="gas").reset_index(drop=True) + + # Pivot table to have a column for each sector. + tb_gas = tb_gas.pivot(index=["country", "year"], columns="sector", values="value", join_column_levels_with="_") + + # Create region aggregates for all columns (with a simple sum) except for the column of efficiency factors. + aggregations = { + column: "sum" for column in tb_gas.columns if column not in ["country", "year", "efficiency_factor"] + } + tb_gas = geo.add_regions_to_table( + tb=tb_gas, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + aggregations=aggregations, + min_num_values_per_year=1, + ) + + # Add population to data. + tb_gas = geo.add_population_to_table(tb=tb_gas, ds_population=ds_population) + + # List columns with emissions data. + emissions_columns = [column for column in tb_gas.columns if column not in ["country", "year", "population"]] + + # Add per capita variables. + for variable in emissions_columns: + tb_gas[variable + PER_CAPITA_SUFFIX] = MT_TO_T * tb_gas[variable] / tb_gas["population"] + + # Remove rows and columns that only have nans. + tb_gas = tb_gas.dropna(how="all", axis=1) + tb_gas = tb_gas.dropna(subset=emissions_columns, how="all").reset_index(drop=True) + + # Adapt table title and short name. + tb_gas.metadata.title = TABLE_NAMES[gas] + tb_gas.metadata.short_name = utils.underscore(TABLE_NAMES[gas]) + + # Adapt column names. + tb_gas = tb_gas.rename( + columns={ + column: utils.underscore(column) + .replace("emissions", f"{utils.underscore(gas)}_emissions") + .replace("all_ghg", "ghg") + .replace("f_gas", "fgas") + for column in tb_gas.columns + if column not in ["country", "year"] + } + ) + + # Improve table format. + tb_gas = tb_gas.format(sort_columns=True) + + return tb_gas + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("emissions_by_sector") + tb = ds_meadow.read("emissions_by_sector") + + # Load regions dataset. + ds_regions = paths.load_dataset("regions") + + # Load population dataset. + ds_population = paths.load_dataset("population") + + # Load income groups dataset. + ds_income_groups = paths.load_dataset("income_groups") + + # + # Process data. + # + # Select only one data source (Climate Watch). 
+ tb = tb[tb["data_source"] == "Climate Watch"].reset_index(drop=True) + + # Check that there is only one unit in dataset. + assert set(tb["unit"]) == {"MtCO₂e"}, "Unknown units in dataset" + + # Remove unnecessary columns. + tb = tb.drop(columns=["unit", "id", "data_source", "iso_code3"], errors="raise") + + # Rename sectors. + tb["sector"] = map_series( + series=tb["sector"], + mapping=SECTORS, + warn_on_missing_mappings=True, + warn_on_unused_mappings=True, + ) + + # Harmonize country names. + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + warn_on_missing_countries=True, + warn_on_unused_countries=True, + ) + + # Create one table for each gas, and one for all gases combined. + tables = [ + create_table_for_gas( + tb=tb, gas=gas, ds_regions=ds_regions, ds_population=ds_population, ds_income_groups=ds_income_groups + ) + for gas in tb["gas"].unique() + ] + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=tables, check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/covid/2024-11-05/github_stats.meta.yml b/etl/steps/data/garden/covid/2024-11-05/github_stats.meta.yml new file mode 100644 index 00000000000..0a435d3975b --- /dev/null +++ b/etl/steps/data/garden/covid/2024-11-05/github_stats.meta.yml @@ -0,0 +1,105 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + others: + interval: + <% if interval == "cumulative" %> + over time + <% elif interval == "weekly" %> + in the last week + <% elif interval == "4-weekly" %> + in the last four weeks + <% elif interval == "7-day rolling sum" %> + in the last 7 days + <%- endif -%> + common: + presentation: + topic_tags: + - COVID-19 + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + contributions: + variables: + new_issue: + title: "Number of created issues (<>)" + unit: issue + description_short: "The number of new issues created in the GitHub repository {definitions.others.interval}." + new_pr: + title: "Number of created pull requests (<>)" + unit: pull request + description_short: "The number of new pull requests created in the GitHub repository {definitions.others.interval}." + new_issue_or_pr: + title: "Number of created issues or pull requests (<>)" + unit: issue or pull request + description_short: "The number of new issues or pull requests created in the GitHub repository {definitions.others.interval}." + new_comment_issue_or_pr: + title: "Number of comments on issues or pull requests (<>)" + unit: comments + description_short: "The number of new comments on issues or pull requests in the GitHub repository {definitions.others.interval}." + new_contributions: + title: "Number of contributions (<>)" + unit: contributions + description_short: "The number of new contributions in the GitHub repository {definitions.others.interval}. A contribution can be a new issue, a new pull request, or a comment on an issue or pull request." + + user_contributions: + variables: + number_distinct_users_create_issue: + title: Number of users who created an issue (<>) + unit: users + short_unit: users + description_short: |- + Number of new users that contributed by creating at least one issue in the GitHub repository {definitions.others.interval}. 
+ + number_distinct_users_create_pr: + title: Number of users who created a pull request (<< interval >>) + unit: users + short_unit: users + description_short: Number of new users that contributed by creating at least one pull request in the GitHub repository {definitions.others.interval}. + + number_distinct_users_create_any: + title: Number of users who created an issue or pull request (<< interval >>) + unit: users + short_unit: users + description_short: Number of new users that contributed by creating at least one issue or one pull request in the GitHub repository {definitions.others.interval}. + + number_distinct_users_comment_issue: + title: Number of users who commented on an issue or pull request (<< interval >>) + unit: users + short_unit: users + description_short: Number of new users that contributed by commenting on an existing issue or pull request in the GitHub repository {definitions.others.interval}. + + number_distinct_users_comment_pr: + title: Number of users who added a code-review comment on a pull request (<< interval >>) + unit: users + short_unit: users + description_short: Number of new users that contributed by commenting on a code snippet of an existing pull request in the GitHub repository {definitions.others.interval}. + + number_distinct_users_comment_any: + title: Number of users who commented on an issue or pull request in any form (<< interval >>) + unit: users + short_unit: users + description_short: Number of new users that contributed by commenting on an existing issue or pull request (in any form) in the GitHub repository {definitions.others.interval}. + + number_distinct_users_issue: + title: Number of users who participated in an issue (<< interval >>) + unit: users + short_unit: users + description_short: Number of new users that contributed either by creating or commenting on at least one issue in the GitHub repository {definitions.others.interval}. + + number_distinct_users_pr: + title: Number of users who participated in a pull request (<< interval >>) + unit: users + short_unit: users + description_short: Number of new users that contributed either by creating or commenting on at least one pull request in the GitHub repository {definitions.others.interval}. + + number_distinct_users_any: + title: Number of users who participated in an issue or pull request (<< interval >>) + unit: users + short_unit: users + description_short: Number of new users that contributed either by creating or commenting on at least one issue or one pull request in the GitHub repository {definitions.others.interval}.
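An aside on the templating in the metadata file above: the << interval >> placeholders and <% if %> blocks look like Jinja with custom delimiters. The exact environment configuration is not shown in this diff, so the following is a sketch under that assumption of how one of these titles would render for each flavour of the interval dimension:

from jinja2 import Environment

# Assumed delimiter setup matching the "<< >>" and "<% %>" markers in the YAML above.
env = Environment(
    block_start_string="<%",
    block_end_string="%>",
    variable_start_string="<<",
    variable_end_string=">>",
)

title = env.from_string("Number of created issues (<< interval >>)")
for interval in ["cumulative", "weekly", "4-weekly", "7-day rolling sum"]:
    print(title.render(interval=interval))
# Number of created issues (cumulative)
# Number of created issues (weekly)
# ...

The {definitions.others.interval} references in the descriptions are a separate, ETL-specific substitution mechanism and are left untouched here.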
diff --git a/etl/steps/data/garden/covid/2024-11-05/github_stats.py b/etl/steps/data/garden/covid/2024-11-05/github_stats.py new file mode 100644 index 00000000000..1e23e6ceecc --- /dev/null +++ b/etl/steps/data/garden/covid/2024-11-05/github_stats.py @@ -0,0 +1,260 @@ +"""Load a meadow dataset and create a garden dataset. + + +Given: raw tables with data on issues, pull requests and commits. + +Goal: + (higher) User involvement: + + - A) Number of users submitting an issue or PR + - B) Number of users submitting a commit + - C = A or B + - Contributions by country (e.g. number of users, number of user-comments, etc.) + + (lower) User involvement: + - A) Number of users submitting an issue or PR, or commenting on one + - B) Number of users submitting a commit + - C = A or B + - Contributions by country (e.g. number of users, number of user-comments, etc.) + +Clarifications: + + - github_stats_issues: list of all issues, including PRs. + - github_stats_pr: list of PRs (redundant with `issues`). + - github_stats_issues_comments: list of comments on issues. + - github_stats_pr_comments: list of comments on PRs. These are not regular comments, but comments on code (e.g. review comments). + +""" + +import owid.catalog.processing as pr +import pandas as pd + +from etl.data_helpers.misc import expand_time_column +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Other config +COLNAME_BASE = "number_distinct_users" + + +def run(dest_dir: str) -> None: + # + # Load inputs and pre-process data. + # + # 1/ Load meadow dataset. + ds_meadow = paths.load_dataset("github_stats") + + # Load the issues table (it already includes PRs, so the separate PR table is redundant). + tb_issues = ds_meadow.read("github_stats_issues") + # tb_pr = ds_meadow.read("github_stats_pr") + tb_issues = make_table_issues(tb_issues) + + # Get list of all comments (including issue/pr description) + tb_comments = ds_meadow.read("github_stats_issues_comments") + tb_comments_pr = ds_meadow.read("github_stats_pr_comments") + tb_comments = make_table_comments(tb_comments, tb_comments_pr) + + # Get the list of all users + tb_users = ds_meadow.read("github_stats_issues_users") + tb_users_pr = ds_meadow.read("github_stats_pr_users") + tb_users = pr.concat([tb_users, tb_users_pr], ignore_index=True) + tb_users = tb_users.drop_duplicates(subset=["user_id"]) + assert tb_users.user_login.notna().all(), "Some missing usernames!" + + # # Commits + # tb_commits = ds_meadow.read("github_stats_commits") + + # + # Process data. + # + # 1/ TABLE: user contributions + ## Get table with the number of new users contributing to the repository + tb_distinct_users = make_table_user_counts(tb_issues, tb_comments, tb_users) + ## Add flavours of counts (cumulative, weekly, 7-day rolling sum, etc.) + tb_distinct_users = get_intervals(tb_distinct_users) + + # 2/ TABLE: issues or PR created, and comments + ## Issue or PR + tb_issues_count = tb_issues.copy() + tb_issues_count["new_pr"] = tb_issues_count["is_pr"].astype(int) + tb_issues_count["new_issue"] = (~tb_issues_count["is_pr"]).astype(int) + tb_issues_count["new_issue_or_pr"] = 1 + tb_issues_count["new_issue_or_pr"] = tb_issues_count["new_issue_or_pr"].copy_metadata(tb_issues_count["new_issue"]) + tb_issues_count = tb_issues_count.groupby("date", as_index=False)[["new_issue", "new_pr", "new_issue_or_pr"]].sum() + ## Comments + tb_comments_count = tb_comments.copy() + tb_comments_count["new_comment_issue_or_pr"] = 1 + tb_comments_count["new_comment_issue_or_pr"] = tb_comments_count["new_comment_issue_or_pr"].copy_metadata( + tb_comments_count["issue_id"] + ) + tb_comments_count = tb_comments_count.groupby("date", as_index=False)["new_comment_issue_or_pr"].sum() + ## Combine + tb_counts = tb_issues_count.merge(tb_comments_count, on="date", how="outer") + tb_counts = expand_time_column(tb_counts, time_col="date", fillna_method="zero") + tb_counts["new_contributions"] = tb_counts["new_issue_or_pr"] + tb_counts["new_comment_issue_or_pr"] + tb_counts = tb_counts.format("date") + ## Intervals + tb_counts = get_intervals(tb_counts) + + # 3/ Format + tb_distinct_users = tb_distinct_users.format(["date", "interval"], short_name="user_contributions").astype(int) + tb_counts = tb_counts.format(["date", "interval"], short_name="contributions").astype(int) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset.
+ tables = [ + tb_distinct_users, + tb_counts, + ] + + ds_garden = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def make_table_issues(tb_issues): + assert tb_issues.author_login.notna().all(), "Some missing usernames!" + ## Dtypes + tb_issues = tb_issues.astype( + { + "author_name": "string", + "author_login": "string", + "date_created": "datetime64[ns]", + "is_pr": "bool", + } + ) + ## Add date + tb_issues["date_created"] = pd.to_datetime(tb_issues["date_created"]) + tb_issues["date"] = pd.to_datetime(tb_issues["date_created"].dt.date) + + ## Sort + tb_issues = tb_issues.sort_values("date_created") + + # Columns + tb_issues = tb_issues[ + [ + "issue_id", + "author_name", + "author_login", + "date_created", + "date", + "is_pr", + ] + ] + return tb_issues + + +def make_table_comments(tb_issues, tb_pr): + tb_pr["is_pr"] = True + tb = pr.concat([tb_issues, tb_pr], ignore_index=True) + tb["is_pr"] = tb["is_pr"].fillna(False) + + assert tb["comment_id"].value_counts().max() == 1, "Repeated comments!" + assert tb.user_id.notna().all(), "Some missing usernames!" + + tb = tb.astype( + { + "date_created": "datetime64[ns]", + "date_updated": "datetime64[ns]", + "is_pr": "bool", + } + ) + tb["date"] = pd.to_datetime(tb["date_created"].dt.date) + + # Sort rows and columns + tb = tb.sort_values(["issue_id", "date"])[ + [ + "comment_id", + "issue_id", + "date", + "date_created", + "date_updated", + "user_id", + "is_pr", + ] + ] + return tb + + +def get_number_distinct_users(tb, col_pr_flag, colname_user, colname_output, col_date: str = "date"): + def _get_counts(tb, colname_output): + # Drop duplicate users + tb = tb.drop_duplicates(subset=[colname_user], keep="first") + + # Get unique number for a given date + tb = tb.groupby(col_date, as_index=False)[colname_user].nunique() + + # Rename to the output column name + tb = tb.rename(columns={colname_user: colname_output}) + + return tb + + tb_pr = _get_counts(tb[tb[col_pr_flag]], f"{colname_output}_pr") + tb_issue = _get_counts(tb[~tb[col_pr_flag]], f"{colname_output}_issue") + tb_any = _get_counts(tb, f"{colname_output}_any") + + tb = pr.multi_merge([tb_pr, tb_issue, tb_any], on=col_date, how="outer").fillna(0) + + # Fill NaNs and set dtypes + columns = [col for col in tb.columns if col != col_date] + tb[columns] = tb[columns].fillna(0).astype("Int64") + + return tb + + +def combine_user_contribution(tb_create, tb_comment, tb_any): + tb = pr.multi_merge([tb_create, tb_comment, tb_any], on="date", how="outer") + tb = expand_time_column(df=tb, time_col="date", fillna_method="zero") + tb = tb.format("date") + return tb + + +def get_intervals(tb): + ## 1/ Cumulative + tb_cum = tb.cumsum().reset_index() + tb_cum["interval"] = "cumulative" + + ## 2/ Weekly + tb_week = tb.resample("W").sum().reset_index() + tb_week["interval"] = "weekly" + + ## 3/ 4-week + tb_4week = tb.resample("4W").sum().reset_index() + tb_4week["interval"] = "4-weekly" + + ## 4/ 7-day rolling + tb_rolling = tb.rolling(window=7, min_periods=0).sum().reset_index() + tb_rolling["interval"] = "7-day rolling sum" + + ## 5/ Combine + tb = pr.concat([tb_cum, tb_rolling, tb_week, tb_4week]) + + return tb + + +def make_table_user_counts(tb_issues, tb_comments, tb_users): + # 2.1/ Number of distinct users submitting an issue or PR over time + tb_distinct_users_create = get_number_distinct_users(tb_issues, "is_pr", "author_login", f"{COLNAME_BASE}_create") + + # 2.2/
Number of distinct users commenting in an issue or PR thread + tb_distinct_users_comments = get_number_distinct_users(tb_comments, "is_pr", "user_id", f"{COLNAME_BASE}_comment") + + # 2.3/ Any + tb_issues_b = tb_issues.merge( + tb_users[["user_login", "user_id"]], left_on="author_login", right_on="user_login", how="left" + ) + cols = ["date", "user_id", "issue_id", "is_pr"] + tb_any = pr.concat([tb_issues_b.loc[:, cols], tb_comments.loc[:, cols]]) + tb_distinct_users_any = get_number_distinct_users(tb_any, "is_pr", "user_id", f"{COLNAME_BASE}") + + # 3/ Combine + tb = combine_user_contribution(tb_distinct_users_create, tb_distinct_users_comments, tb_distinct_users_any) + + return tb
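To make the four get_intervals flavours defined above concrete, here is a minimal sketch with plain pandas; the table and column names are invented, and the real helper operates on the date-indexed tables produced by .format():

import pandas as pd

# One issue per day for two weeks, mimicking a table indexed by a daily date index.
idx = pd.date_range("2020-03-01", periods=14, freq="D", name="date")
demo = pd.DataFrame({"new_issue": 1}, index=idx)

cumulative = demo.cumsum().reset_index().assign(interval="cumulative")  # running totals
weekly = demo.resample("W").sum().reset_index().assign(interval="weekly")  # calendar-week sums
four_weekly = demo.resample("4W").sum().reset_index().assign(interval="4-weekly")
rolling = demo.rolling(window=7, min_periods=0).sum().reset_index().assign(interval="7-day rolling sum")

combined = pd.concat([cumulative, weekly, four_weekly, rolling])
print(combined.groupby("interval").tail(1))  # final value of each flavour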
diff --git a/etl/steps/data/garden/covid/latest/cases_deaths.py b/etl/steps/data/garden/covid/latest/cases_deaths.py index c1d9ed7bace..d5b2e81897b 100644 --- a/etl/steps/data/garden/covid/latest/cases_deaths.py +++ b/etl/steps/data/garden/covid/latest/cases_deaths.py @@ -39,7 +39,7 @@ def run(dest_dir: str) -> None: ds_population = paths.load_dataset("population") # Read table from meadow dataset. - tb = ds_meadow.read_table("cases_deaths") + tb = ds_meadow.read("cases_deaths") # # Process data. @@ -175,15 +175,15 @@ def discard_rows(tb: Table): print("Discarding rows…") # For all rows where new_cases or new_deaths is negative, we keep the cumulative value but set # the daily change to NA. This also sets the 7-day rolling average to NA for the next 7 days. - tb.loc[tb["new_cases"] < 0, "new_cases"] = np.nan - tb.loc[tb["new_deaths"] < 0, "new_deaths"] = np.nan + tb.loc[tb["new_cases"] < 0, "new_cases"] = pd.NA + tb.loc[tb["new_deaths"] < 0, "new_deaths"] = pd.NA # Custom data corrections for ldc in LARGE_DATA_CORRECTIONS: - tb.loc[(tb["country"] == ldc[0]) & (tb["date"].astype(str) == ldc[1]), f"new_{ldc[2]}"] = np.nan + tb.loc[(tb["country"] == ldc[0]) & (tb["date"].astype(str) == ldc[1]), f"new_{ldc[2]}"] = pd.NA for ldc in LARGE_DATA_CORRECTIONS_SINCE: - tb.loc[(tb["country"] == ldc[0]) & (tb["date"].astype(str) >= ldc[1]), f"new_{ldc[2]}"] = np.nan + tb.loc[(tb["country"] == ldc[0]) & (tb["date"].astype(str) >= ldc[1]), f"new_{ldc[2]}"] = pd.NA # Sort (legacy) tb = tb.sort_values(["country", "date"]) @@ -216,8 +216,8 @@ def add_period_aggregates(tb: Table, prefix: str, periods: int): ) # Set NaNs where the original data was NaN - tb.loc[tb["new_cases"].isnull(), cases_colname] = np.nan - tb.loc[tb["new_deaths"].isnull(), deaths_colname] = np.nan + tb.loc[tb["new_cases"].isnull(), cases_colname] = pd.NA + tb.loc[tb["new_deaths"].isnull(), deaths_colname] = pd.NA return tb @@ -247,7 +247,7 @@ def add_doubling_days(tb: Table) -> Table: for col, spec in DOUBLING_DAYS_SPEC.items(): value_col = spec["value_col"] periods = spec["periods"] - tb.loc[tb[value_col] == 0, value_col] = np.nan + tb.loc[tb[value_col] == 0, value_col] = pd.NA tb[col] = ( tb.groupby("country", as_index=False)[value_col] .pct_change(periods=periods, fill_method=None) @@ -348,7 +348,7 @@ def _apply_row_cfr_100(row): tb.loc[ (tb["cfr_short_term"] < 0) | (tb["cfr_short_term"] > 10) | (tb["date"].astype(str) < "2020-09-01"), "cfr_short_term", - ] = np.nan + ] = pd.NA # Replace inf cols = [ diff --git a/etl/steps/data/garden/covid/latest/countries_reporting.meta.yml b/etl/steps/data/garden/covid/latest/countries_reporting.meta.yml new file mode 100644 index 00000000000..3ab26130309 --- /dev/null +++ b/etl/steps/data/garden/covid/latest/countries_reporting.meta.yml @@ -0,0 +1,79 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + others: + processing_1: &processing_1 |- + Dates were obtained based on each country's time series in our current COVID-19 dataset. Therefore, it may not completely reflect the history of the reporting since data points may have been removed by providers from previous versions. + processing_2: &processing_2 |- + Dates were obtained based on the GitHub history of our COVID-19 dataset. We have been reporting vaccination data since 2020 for each country, and storing individual country files at https://github.com/owid/covid-19-data/tree/master/public/data/vaccinations/country_data. + description_key_2: &description_key_2 + - A country may have started reporting via ad-hoc reports, and not necessarily with standardized data files. + - Starting to report is not necessarily a measure of the quality of the data or of the reporting process. + common: + presentation: + topic_tags: + - COVID-19 + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 0 + title: COVID-19, Countries reporting data + +tables: + country_flags: + variables: + reporting_latest: + title: Country has data on COVID-19 << type >> (OLD) + description_short: |- + Whether a country has at least one data point on COVID-19 << type >> by a given date. This does not imply that the country was reporting by then. + description_processing: *processing_1 + unit: "" + country_counts: + variables: + num_countries_reporting_latest: + title: Number of countries with data on COVID-19 << type >> (OLD) + description_short: |- + Number of countries that had at least one data point on COVID-19 << type >> by a given date. This does not imply that the country was reporting by then. + description_processing: *processing_1 + unit: "countries" + share_countries_reporting_latest: + title: Share of countries with data on COVID-19 << type >> (OLD) + description_short: |- + Share of countries that had at least one data point on COVID-19 << type >> by a given date. This does not imply that the country was reporting by then. + description_processing: *processing_1 + unit: "%" + + country_flags_historical: + variables: + reporting: + title: Countries reporting data on COVID-19 vaccinations + description_short: |- + Whether a country had started reporting data on COVID-19 << type >>. + description_processing: *processing_2 + description_key: *description_key_2 + unit: "" + country_counts_historical: + variables: + num_countries_reporting: + title: Number of countries reporting data on COVID-19 << type >> + description_short: |- + Number of countries that had reported data on COVID-19 << type >> at least once by a given date. + description_processing: *processing_2 + description_key: *description_key_2 + unit: "countries" + share_countries_reporting: + title: Share of countries reporting data on COVID-19 << type >> + description_short: |- + Share of countries that had reported data on COVID-19 << type >> at least once by a given date. + description_processing: *processing_2 + description_key: *description_key_2 + unit: "%" + country_reporting_delay: + variables: + num_days_delay_in_reporting: + title: Number of days needed to first report data on COVID-19 << type | default('') >> since the first vaccine was administered + description_short: |- + Number of days needed to first report data on COVID-19 << type | default('') >> since the first vaccine was administered. Some countries may have started vaccinating before they reported it, or may have started reporting it before they started vaccinating.
+ description_processing: *processing_2 + description_key: *description_key_2 + unit: "days" diff --git a/etl/steps/data/garden/covid/latest/countries_reporting.py b/etl/steps/data/garden/covid/latest/countries_reporting.py new file mode 100644 index 00000000000..3742ff433a6 --- /dev/null +++ b/etl/steps/data/garden/covid/latest/countries_reporting.py @@ -0,0 +1,129 @@ +"""Load a meadow dataset and create a garden dataset.""" + + +from owid.catalog import Table + +from etl.data_helpers.misc import expand_time_column +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow datasets. + ds_meadow = paths.load_dataset("countries_reporting") + ds_vax = paths.load_dataset("vaccinations_global") + + # Read table from meadow dataset. + tb = ds_meadow.read("vaccinations", safe_types=False) + tb_latest = ds_vax.read("vaccinations_global", safe_types=False) + + # 1/ LATEST DATA + ## 1.1/ Process main table + ## Drop NaNs and zeroes + tb_latest = tb_latest.dropna(subset="total_vaccinations") + tb_latest = tb_latest.loc[tb_latest["total_vaccinations"] != 0] + ## Keep first entry + tb_latest = tb_latest.sort_values("date").drop_duplicates(subset="country", keep="first") + + # Get table with flags whenever a country reports + tb_latest = make_table_with_country_flags(tb_latest) + + ## 1.2/ New table: counts + tb_counts = make_table_counts(tb_latest) + + ## 1.3/ Renamings + tb_latest = tb_latest.rename( + columns={ + "reporting": "reporting_latest", + } + ) + tb_counts = tb_counts.rename( + columns={ + "num_countries_reporting": "num_countries_reporting_latest", + "share_countries_reporting": "share_countries_reporting_latest", + } + ) + # 2/ GIT HISTORY + ## 2.0/ Only keep countries (as per the UN list) and avoid double-counting + COUNTRIES_SKIP = [ + "England", + "Scotland", + "Wales", + "Northern Ireland", + ] + tb = tb.loc[~tb["country"].isin(COUNTRIES_SKIP)] + + ## 2.1/ Process main table + tb_history = make_table_with_country_flags(tb) + ## 2.2/ New table: counts + tb_hist_counts = make_table_counts(tb_history) + ## 2.3/ Auxiliary table: reporting delay + tb["num_days_delay_in_reporting"] = (tb["date_first_reported"] - tb["date_first_value"]).dt.days + tb["num_days_delay_in_reporting"] = tb["num_days_delay_in_reporting"].copy_metadata(tb["date_first_reported"]) + tb["year"] = 2023 + tb = tb[["country", "year", "num_days_delay_in_reporting"]] + + # Format + tables = [ + tb_latest.format(["country", "date", "type"], short_name="country_flags"), + tb_counts.format(["country", "date", "type"], short_name="country_counts"), + tb_history.format(["country", "date", "type"], short_name="country_flags_historical"), + tb_hist_counts.format(["country", "date", "type"], short_name="country_counts_historical"), + tb.format(["country", "year"], short_name="country_reporting_delay"), + ] + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=tables, + check_variables_metadata=True, + ) + + # Save changes in the new garden dataset.
+ ds_garden.save() + + +def make_table_with_country_flags(tb: Table): + ## Add reporting column + tb["reporting"] = 1 + ## Copy metadata + tb["reporting"] = tb["reporting"].copy_metadata(tb["date"]) + ## Keep relevant columns + tb = tb.loc[:, ["country", "date", "reporting"]] + ## Extend to the full date range, forward-filling the reporting flag + tb = expand_time_column(tb, time_col="date", dimension_col="country", method="full_range", fillna_method=["ffill"]) + ## Add data type + tb["type"] = "vaccinations" + + return tb + + +def make_table_counts(tb: Table, col_name: str = "reporting"): + ## Count number of countries reporting + tb_counts = tb.groupby("date", as_index=False)[col_name].sum() + ## Rename columns + tb_counts = tb_counts.rename( + columns={ + col_name: "num_countries_reporting", + } + ) + ## Estimate ratio + tb_counts["share_countries_reporting"] = ( + tb_counts["num_countries_reporting"] / tb_counts["num_countries_reporting"].max() + ) + ## Add world + tb_counts["country"] = "World" + ## Add data type + tb_counts["type"] = "vaccinations" + ## Sort columns + tb_counts = tb_counts[["country", "date", "type", "num_countries_reporting", "share_countries_reporting"]] + + return tb_counts
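The expand_time_column helper used in both files above is internal to this repository and its implementation is not shown in this diff. Judging from the call sites (method="full_range", fillna_method=["ffill"]), it densifies each country's sparse reporting dates onto the full date range and forward-fills the flag. A rough pandas approximation of that behaviour, with invented data, to make the flag semantics concrete:

import pandas as pd

tb = pd.DataFrame(
    {
        "country": ["A", "A", "B"],
        "date": pd.to_datetime(["2020-12-20", "2020-12-22", "2020-12-21"]),
        "reporting": [1, 1, 1],
    }
)

# Reindex every country onto the full observed date range, then forward-fill,
# so a country stays flagged as reporting after its first data point.
full_range = pd.date_range(tb["date"].min(), tb["date"].max(), freq="D", name="date")
dense = (
    tb.set_index("date")
    .groupby("country")["reporting"]
    .apply(lambda s: s.reindex(full_range).ffill())
    .reset_index()
)
print(dense)  # country B is only flagged from 2020-12-21 onwards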
""" # Load population table - population = ds_population.read_table("population") + population = ds_population.read("population") # Filter only years of interest population = population[(population["year"] >= year_min) & (population["year"] <= year_max)] # Create date column @@ -179,7 +179,7 @@ def make_monotonic(tb: Table, max_removed_rows=10) -> Table: tb = tb.sort_values("date") metrics = ("total_vaccinations", "people_vaccinated", "people_fully_vaccinated") - tb[list(metrics)] = tb[list(metrics)].astype(float) + tb[list(metrics)] = tb[list(metrics)].astype("float64[pyarrow]") for metric in metrics: while not tb[metric].ffill().fillna(0).is_monotonic_increasing: diff = tb[metric].ffill().shift(-1) - tb[metric].ffill() diff --git a/etl/steps/data/garden/covid/latest/vaccinations_age.py b/etl/steps/data/garden/covid/latest/vaccinations_age.py index b0dbd79bd55..f9681ca375f 100644 --- a/etl/steps/data/garden/covid/latest/vaccinations_age.py +++ b/etl/steps/data/garden/covid/latest/vaccinations_age.py @@ -72,7 +72,7 @@ def fill_time_gaps(tb: Table) -> Table: Often, values for certain countries are missing. This can lead to very large under-estimates regional values. To mitigate this, we combine zero-filling with interpolation and other techniques. """ - tb = expand_time_column(tb, ["country", "vaccine"], "date", "full_range") + tb = expand_time_column(tb, dimension_col=["country", "vaccine"], time_col="date", method="full_range") # cumulative metrics: Interpolate, forward filling (for latest) + zero-filling (for remaining NaNs, likely at start) cols_ffill = [ "total_vaccinations", diff --git a/etl/steps/data/garden/covid/latest/vaccinations_global.py b/etl/steps/data/garden/covid/latest/vaccinations_global.py index 7df7ad7e5ec..205efa23403 100644 --- a/etl/steps/data/garden/covid/latest/vaccinations_global.py +++ b/etl/steps/data/garden/covid/latest/vaccinations_global.py @@ -212,7 +212,7 @@ def _prepare_table_for_aggregates(tb: Table) -> Table: Often, values for certain countries are missing. This can lead to very large under-estimates regional values. To mitigate this, we combine zero-filling with interpolation and other techniques. """ - tb_agg = expand_time_column(tb, "country", "date", method="full_range") + tb_agg = expand_time_column(tb, dimension_col="country", time_col="date", method="full_range") cols_index = ["country", "date"] # cumulative metrics: Interpolate, forward filling (for latest) + zero-filling (for remaining NaNs, likely at start) cols_ffill = [ diff --git a/etl/steps/data/garden/covid/latest/vaccinations_manufacturer.py b/etl/steps/data/garden/covid/latest/vaccinations_manufacturer.py index 1c23de05351..97293fc24f8 100644 --- a/etl/steps/data/garden/covid/latest/vaccinations_manufacturer.py +++ b/etl/steps/data/garden/covid/latest/vaccinations_manufacturer.py @@ -86,7 +86,7 @@ def fill_time_gaps(tb: Table) -> Table: Often, values for certain countries are missing. This can lead to very large under-estimates regional values. To mitigate this, we combine zero-filling with interpolation and other techniques. 
""" - tb = expand_time_column(tb, ["country", "vaccine"], "date", "full_range") + tb = expand_time_column(tb, dimension_col=["country", "vaccine"], time_col="date", method="full_range") # cumulative metrics: Interpolate, forward filling (for latest) + zero-filling (for remaining NaNs, likely at start) cols_ffill = [ "total_vaccinations", diff --git a/etl/steps/data/garden/covid/latest/vaccinations_us.meta.yml b/etl/steps/data/garden/covid/latest/vaccinations_us.meta.yml index 94d5622f023..f987a11471a 100644 --- a/etl/steps/data/garden/covid/latest/vaccinations_us.meta.yml +++ b/etl/steps/data/garden/covid/latest/vaccinations_us.meta.yml @@ -1,29 +1,21 @@ # NOTE: To learn more about the fields, hover over their names. definitions: - display_zero_day: &display_zero_day - zeroDay: 2020-01-01 - yearIsDay: true common: - display: - numDecimalPlaces: 0 - <<: *display_zero_day + numDecimalPlaces: 0 presentation: topic_tags: - COVID-19 - # Learn more about the available fields: # http://docs.owid.io/projects/etl/architecture/metadata/reference/ dataset: update_period_days: 365 - title: COVID-19, Vaccinations (United States) tables: vaccinations_us: variables: - total_vaccinations: title: Total vaccinations unit: doses @@ -31,85 +23,77 @@ tables: description_processing: All doses, including boosters, are counted individually. total_vaccinations_per_hundred: title: Total vaccinations (per 100 people) - unit: 'doses per 100 people' + unit: "doses per 100 people" description_short: Cumularive number of COVID-19 vaccination doses administered, per 100 people. description_processing: All doses, including boosters, are counted individually. display: numDecimalPlaces: 2 - <<: *display_zero_day total_distributed: title: Total doses distributed - unit: 'doses' + unit: "doses" description_short: Cumulative counts of COVID-19 vaccine doses reported to Operation Warp Speed as delivered. distributed_per_hundred: title: Total doses distributed (per 100 people) - unit: 'doses per 100 people' + unit: "doses per 100 people" description_short: Cumulative counts of COVID-19 vaccine doses reported to Operation Warp Speed as delivered, per 100 people. display: numDecimalPlaces: 2 - <<: *display_zero_day people_vaccinated: title: People vaccinated description_short: Total number of people who received at least one vaccine dose. - unit: 'people' + unit: "people" people_vaccinated_per_hundred: title: People vaccinated (per 100 people) description_short: Share of people who received at least one vaccine dose. - unit: '%' - short_unit: '%' + unit: "%" + short_unit: "%" display: numDecimalPlaces: 2 - <<: *display_zero_day people_fully_vaccinated: title: People fully vaccinated description_short: Total number of people who received all doses prescribed by the vaccination protocol. - unit: 'people' + unit: "people" people_fully_vaccinated_per_hundred: title: People fully vaccinated (per 100 people) description_short: Share of people who received all doses prescribed by the vaccination protocol. 
- unit: '%' - short_unit: '%' + unit: "%" + short_unit: "%" display: numDecimalPlaces: 2 - <<: *display_zero_day total_boosters: title: Total booster doses administered - unit: 'doses' + unit: "doses" total_boosters_per_hundred: title: Total booster doses administered (per 100 people) - unit: 'doses per 100 people' + unit: "doses per 100 people" display: numDecimalPlaces: 3 - <<: *display_zero_day daily_vaccinations: title: Daily doses administered (7-day rolling average) description_short: All doses, including boosters, are counted individually. 7-day rolling average. - unit: 'doses' + unit: "doses" display: numDecimalPlaces: 2 - <<: *display_zero_day daily_vaccinations_raw: title: Daily doses administered description_short: All doses, including boosters, are counted individually. - unit: 'doses' + unit: "doses" daily_vaccinations_per_million: title: Daily doses administered (per million people) description_short: All doses, including boosters, are counted individually, per million people - unit: 'doses per million people' + unit: "doses per million people" display: numDecimalPlaces: 2 - <<: *display_zero_day share_doses_used: title: Share of doses used description_short: Share of distributed vaccination doses that have been administered/used in the population. Distributed figures represent those reported to Operation Warp Speed as delivered. - unit: '%' - short_unit: '%' + unit: "%" + short_unit: "%" display: numDecimalPlaces: 1 - <<: *display_zero_day diff --git a/etl/steps/data/garden/covid/latest/vaccinations_us.py b/etl/steps/data/garden/covid/latest/vaccinations_us.py index 40e36fca047..2cc8684f23c 100644 --- a/etl/steps/data/garden/covid/latest/vaccinations_us.py +++ b/etl/steps/data/garden/covid/latest/vaccinations_us.py @@ -18,7 +18,7 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset("vaccinations_us") # Read table from meadow dataset. - tb = ds_meadow["vaccinations_us"].reset_index() + tb = ds_meadow.read("vaccinations_us") # # Process data. diff --git a/etl/steps/data/garden/covid/latest/yougov.meta.yml b/etl/steps/data/garden/covid/latest/yougov.meta.yml index d4ac7be9e20..03e09284fd1 100644 --- a/etl/steps/data/garden/covid/latest/yougov.meta.yml +++ b/etl/steps/data/garden/covid/latest/yougov.meta.yml @@ -17,111 +17,65 @@ definitions: part1: |- Have you had the first or second doses of a Coronavirus (COVID-19) vaccine? 
question_mapper: |- - <%- if (question == 'activities_improve_health') -%> - {definitions.questions_templates.standard.part1} 'I feel it is important to carry out activities which will improve my health' - <%- elif (question == 'avoided_crowded_areas') -%> - {definitions.questions_templates.i12.part1} 'Avoided crowded areas' {definitions.questions_templates.i12.part2} - <%- elif (question == 'avoided_going_out') -%> - {definitions.questions_templates.i12.part1} 'Avoided going out in general' {definitions.questions_templates.i12.part2} - <%- elif (question == 'avoided_guests_at_home') -%> - {definitions.questions_templates.i12.part1} 'Avoided having guests to your home' {definitions.questions_templates.i12.part2} - <%- elif (question == 'avoided_healthcare_settings') -%> - {definitions.questions_templates.i12.part1} 'Avoided going to hospital or other healthcare settings' {definitions.questions_templates.i12.part2} - <%- elif (question == 'avoided_large_gatherings') -%> - {definitions.questions_templates.i12.part1} 'Avoided large-sized social gatherings (more than 10 people)' {definitions.questions_templates.i12.part2} - <%- elif (question == 'avoided_medium_gatherings') -%> - {definitions.questions_templates.i12.part1} 'Avoided medium-sized social gatherings (between 3 and 10 people)' {definitions.questions_templates.i12.part2} - <%- elif (question == 'avoided_objects_public') -%> - {definitions.questions_templates.i12.part1} 'Avoided touching objects in public (e.g. elevator buttons or doors)' {definitions.questions_templates.i12.part2} - <%- elif (question == 'avoided_people_with_symptoms') -%> - {definitions.questions_templates.i12.part1} 'Avoided contact with people who have symptoms or you think may have been exposed to the coronavirus' {definitions.questions_templates.i12.part2} - <%- elif (question == 'avoided_public_events') -%> - {definitions.questions_templates.i12.part1} 'Avoided attending public events, such as sports matches, festivals, theatres, clubs, or going to religious services' {definitions.questions_templates.i12.part2} - <%- elif (question == 'avoided_public_transport') -%> - {definitions.questions_templates.i12.part1} 'Avoided taking public transport' {definitions.questions_templates.i12.part2} - <%- elif (question == 'avoided_shops') -%> - {definitions.questions_templates.i12.part1} 'Avoided going to shops' {definitions.questions_templates.i12.part2} - <%- elif (question == 'avoided_small_gatherings') -%> - {definitions.questions_templates.i12.part1} 'Avoided small social gatherings (not more than 2 people)' {definitions.questions_templates.i12.part2} - <%- elif (question == 'avoided_working_outside_home') -%> - {definitions.questions_templates.i12.part1} 'Avoided working outside your home' {definitions.questions_templates.i12.part2} - <%- elif (question == 'children_avoided_school') -%> - {definitions.questions_templates.i12.part1} 'Avoided letting your children go to school/university' {definitions.questions_templates.i12.part2} - <%- elif (question == 'cleaned_surfaces_home') -%> - {definitions.questions_templates.i12.part1} 'Cleaned frequently touched surfaces in the home (e.g. 
doorknobs, toilets, taps)' {definitions.questions_templates.i12.part2} - <%- elif (question == 'covered_mouth_sneeze') -%> - {definitions.questions_templates.i12.part1} 'Covered your nose and mouth when sneezing or coughing' {definitions.questions_templates.i12.part2} + <% set standard = "{definitions.questions_templates.standard.part1}" %> + <% set i12_part1 = "{definitions.questions_templates.i12.part1}" %> + <% set i12_part2 = "{definitions.questions_templates.i12.part2}" %> - <%- elif (question == 'covid_dangerous_to_me') -%> - {definitions.questions_templates.standard.part1} 'Coronavirus (COVID-19) is very dangerous for me' - <%- elif (question == 'covid_vaccine_important_health') -%> - How important do you think getting a COVID-19 vaccine is for your health? - <%- elif (question == 'covid_vaccine_received_one_or_two_doses') -%> - Have you had the first or second doses of a Coronavirus (COVID-19) vaccine? + <% set question_mapper = dict( + activities_improve_health=standard ~ " 'I feel it is important to carry out activities which will improve my health'", + avoided_crowded_areas=i12_part1 ~ " 'Avoided crowded areas' " ~ i12_part2, + avoided_going_out=i12_part1 ~ " 'Avoided going out in general' " ~ i12_part2, + avoided_guests_at_home=i12_part1 ~ " 'Avoided having guests to your home' " ~ i12_part2, + avoided_healthcare_settings=i12_part1 ~ " 'Avoided going to hospital or other healthcare settings' " ~ i12_part2, + avoided_large_gatherings=i12_part1 ~ " 'Avoided large-sized social gatherings (more than 10 people)' " ~ i12_part2, + avoided_medium_gatherings=i12_part1 ~ " 'Avoided medium-sized social gatherings (between 3 and 10 people)' " ~ i12_part2, + avoided_objects_public=i12_part1 ~ " 'Avoided touching objects in public (e.g. elevator buttons or doors)' " ~ i12_part2, + avoided_people_with_symptoms=i12_part1 ~ " 'Avoided contact with people who have symptoms or you think may have been exposed to the coronavirus' " ~ i12_part2, + avoided_public_events=i12_part1 ~ " 'Avoided attending public events, such as sports matches, festivals, theatres, clubs, or going to religious services' " ~ i12_part2, + avoided_public_transport=i12_part1 ~ " 'Avoided taking public transport' " ~ i12_part2, + avoided_shops=i12_part1 ~ " 'Avoided going to shops' " ~ i12_part2, + avoided_small_gatherings=i12_part1 ~ " 'Avoided small social gatherings (not more than 2 people)' " ~ i12_part2, + avoided_working_outside_home=i12_part1 ~ " 'Avoided working outside your home' " ~ i12_part2, + children_avoided_school=i12_part1 ~ " 'Avoided letting your children go to school/university' " ~ i12_part2, + cleaned_surfaces_home=i12_part1 ~ " 'Cleaned frequently touched surfaces in the home (e.g. 
doorknobs, toilets, taps)' " ~ i12_part2, + covered_mouth_sneeze=i12_part1 ~ " 'Covered your nose and mouth when sneezing or coughing' " ~ i12_part2, + covid_dangerous_to_me=standard ~ " 'Coronavirus (COVID-19) is very dangerous for me'", + covid_vaccine_important_health="How important do you think getting a COVID-19 vaccine is for your health?", + covid_vaccine_received_one_or_two_doses="Have you had the first or second doses of a Coronavirus (COVID-19) vaccine?", + covid_vaccine_will_prevent_transmission=standard ~ " 'A vaccine will completely prevent those who receive it from transmitting COVID19 to others'", + covid_vaccine_will_protect_health=standard ~ " 'A vaccine will completely protect those who receive it from possible health effects of COVID19'", + difficult_to_isolate="If you were advised to do so by a healthcare professional or public health authority, how easy or difficult would it be for you to self-isolate for 7 days?", + eaten_separately=i12_part1 ~ " 'Eaten separately at home, when normally you would eat a meal with others' " ~ i12_part2, + govt_will_provide_effective_covid_vaccine=standard ~ " 'I believe government health authorities in my country will provide me with an effective COVID19 vaccine'", + hand_sanitiser=i12_part1 ~ " 'Used hand sanitiser' " ~ i12_part2, + handwashing_yesterday="Thinking about yesterday… about how many times, would you say you washed your hands with soap or used hand sanitiser?", + household_members_contact="About how many people from your household have you come into physical contact with (within 2 meters / 6 feet)?", + life_greatly_affected=standard ~ " 'My life has been greatly affected by coronavirus (COVID-19)'", + likely_get_covid_future=standard ~ " 'It is likely that I will get coronavirus (COVID-19) in the future'", + mask_at_home=i12_part1 ~ " 'Worn a face mask inside your home' " ~ i12_part2, + mask_at_work=i12_part1 ~ " 'Worn a face mask at your place of work' " ~ i12_part2, + mask_clothing_store=i12_part1 ~ " 'Worn a face mask inside a clothing / footwear shop' " ~ i12_part2, + mask_grocery_store=i12_part1 ~ " 'Worn a face mask inside a grocery store / supermarket' " ~ i12_part2, + mask_not_possible=standard ~ " 'Wearing a mask to protect me against coronavirus (COVID-19) is not possible for me'", + mask_outside_home=i12_part1 ~ " 'Worn a face mask outside your home (e.g. when on public transport, going to a supermarket, going to a main road)' " ~ i12_part2, + mask_protect_me=standard ~ " 'Wearing a mask will protect me against coronavirus (COVID-19)'", + mask_protect_others=standard ~ " 'Wearing a mask will protect others against coronavirus (COVID-19)'", + mask_public_transport=i12_part1 ~ " 'Worn a face mask on public transportation' " ~ i12_part2, + people_contact_outside_household="Not including those people in your household, about how many people have you come into physical contact with (within 2 meters / 6 feet)?", + slept_separate_bedrooms=i12_part1 ~ " 'Slept in separate bedrooms at home, when normally you would share a bedroom' " ~ i12_part2, + times_left_home_yesterday="How many times did you leave your home yesterday? 
If you are not staying at home, how many times did you leave where you are staying yesterday?", + trust_covid_vaccines="How much do you trust COVID-19 vaccines?", + uncertain_covid_vaccinate_this_week=standard ~ " 'If a Covid-19 vaccine were made available to me this week, I would definitely get it' (neutral)", + unwillingness_covid_vaccinate_this_week=standard ~ " 'If a Covid-19 vaccine were made available to me this week, I would definitely get it' (disagreement)", + washed_hands=i12_part1 ~ " 'Washed hands with soap and water' " ~ i12_part2, + willingness_covid_vaccinate_this_week=standard ~ " 'If a Covid-19 vaccine were made available to me this week, I would definitely get it' (agreement)", + willingness_isolate_if_advised=i12_part1 ~ " 'Washed hands with soap and water' " ~ i12_part2, + willingness_isolate_if_symptoms="Thinking about the next 7 days… would you isolate yourself after feeling unwell or having any of the following new symptoms: a dry cough, fever, loss of sense of smell, loss of sense of taste, shortness of breath or difficulty breathing?", + worried_covid_vaccine_side_effects=standard ~ " 'I am worried about potential side effects of a COVID19 vaccine'" + ) %> - <%- elif (question == 'covid_vaccine_will_prevent_transmission') -%> - {definitions.questions_templates.standard.part1} 'A vaccine will completely prevent those who receive it from transmitting COVID19 to others' - <%- elif (question == 'covid_vaccine_will_protect_health') -%> - {definitions.questions_templates.standard.part1} 'A vaccine will completely protect those who receive it from possible health effects of COVID19' - <%- elif (question == 'difficult_to_isolate') -%> - If you were advised to do so by a healthcare professional or public health authority, how easy or difficult would it be for you be to self-isolate for 7 days? + << question_mapper[question] >> - <%- elif (question == 'eaten_separately') -%> - {definitions.questions_templates.i12.part1} 'Eaten separately at home, when normally you would eat a meal with others' {definitions.questions_templates.i12.part2} - - <%- elif (question == 'govt_will_provide_effective_covid_vaccine') -%> - {definitions.questions_templates.standard.part1} 'I believe government health authorities in my country will provide me with an effective COVID19 vaccine' - <%- elif (question == 'hand_sanitiser') -%> - {definitions.questions_templates.i12.part1} 'Used hand sanitiser' {definitions.questions_templates.i12.part2} - <%- elif (question == 'handwashing_yesterday') -%> - Thinking about yesterday… about how many times, would you say you washed your hands with soap or used hand sanitiser? - <%- elif (question == 'household_members_contact') -%> - About how many people from your household have you come into physical contact with (within 2 meters / 6 feet)? 
- <%- elif (question == 'life_greatly_affected') -%> - {definitions.questions_templates.standard.part1} 'My life has been greatly affected by coronavirus (COVID-19)' - <%- elif (question == 'likely_get_covid_future') -%> - {definitions.questions_templates.standard.part1} 'It is likely that I will get coronavirus (COVID-19) in the future' - <%- elif (question == 'mask_at_home') -%> - {definitions.questions_templates.i12.part1} 'Worn a face mask inside your home' {definitions.questions_templates.i12.part2} - <%- elif (question == 'mask_at_work') -%> - {definitions.questions_templates.i12.part1} 'Worn a face mask at your place of work' {definitions.questions_templates.i12.part2} - <%- elif (question == 'mask_clothing_store') -%> - {definitions.questions_templates.i12.part1} 'Worn a face mask inside a clothing / footwear shop' {definitions.questions_templates.i12.part2} - <%- elif (question == 'mask_grocery_store') -%> - {definitions.questions_templates.i12.part1} 'Worn a face mask inside a grocery store / supermarket' {definitions.questions_templates.i12.part2} - <%- elif (question == 'mask_not_possible') -%> - {definitions.questions_templates.standard.part1} 'Wearing a mask to protect me against coronavirus (COVID-19) is not possible for me' - <%- elif (question == 'mask_outside_home') -%> - {definitions.questions_templates.i12.part1} 'Worn a face mask outside your home (e.g. when on public transport, going to a supermarket, going to a main road)' {definitions.questions_templates.i12.part2} - <%- elif (question == 'mask_protect_me') -%> - {definitions.questions_templates.standard.part1} 'Wearing a mask will protect me against coronavirus (COVID-19)' - <%- elif (question == 'mask_protect_others') -%> - {definitions.questions_templates.standard.part1} 'Wearing a mask will protect others against coronavirus (COVID-19)' - <%- elif (question == 'mask_public_transport') -%> - {definitions.questions_templates.i12.part1} 'Worn a face mask on public transportation' {definitions.questions_templates.i12.part2} - <%- elif (question == 'people_contact_outside_household') -%> - Not including those people in your household, about how many people have you come into physical contact with (within 2 meters / 6 feet)? - <%- elif (question == 'slept_separate_bedrooms') -%> - {definitions.questions_templates.i12.part1} 'Slept in separate bedrooms at home, when normally you would share a bedroom' {definitions.questions_templates.i12.part2} - <%- elif (question == 'times_left_home_yesterday') -%> - How many times did you leave your home yesterday? If you are not staying at home, how many times did you leave where you are staying yesterday? - <%- elif (question == 'trust_covid_vaccines') -%> - How much do you trust COVID-19 vaccines? 
- <%- elif (question == 'uncertain_covid_vaccinate_this_week') -%> - {definitions.questions_templates.standard.part1} 'If a Covid-19 vaccine were made available to me this week, I would definitely get it' (neutral) - <%- elif (question == 'unwillingness_covid_vaccinate_this_week') -%> - {definitions.questions_templates.standard.part1} 'If a Covid-19 vaccine were made available to me this week, I would definitely get it' (disagreement) - <%- elif (question == 'washed_hands') -%> - <%- elif (question == 'willingness_covid_vaccinate_this_week') -%> - {definitions.questions_templates.standard.part1} 'If a Covid-19 vaccine were made available to me this week, I would definitely get it' (agreement) - - <%- elif (question == 'willingness_isolate_if_advised') -%> - {definitions.questions_templates.i12.part1} 'Washed hands with soap and water' {definitions.questions_templates.i12.part2} - <%- elif (question == 'willingness_isolate_if_symptoms') -%> - Thinking about the next 7 days… would you isolate yourself after feeling unwell or having any of the following new symptoms: a dry cough, fever, loss of sense of smell, loss of sense of taste, shortness of breath or difficulty breathing? - <%- elif (question == 'worried_covid_vaccine_side_effects') -%> - {definitions.questions_templates.standard.part1} 'I am worried about potential side effects of a COVID19 vaccine' - <%- endif -%> # Learn more about the available fields: # http://docs.owid.io/projects/etl/architecture/metadata/reference/ diff --git a/etl/steps/data/garden/democracy/2024-03-07/bmr.py b/etl/steps/data/garden/democracy/2024-03-07/bmr.py index 2ada080e38c..8c398fd3b55 100644 --- a/etl/steps/data/garden/democracy/2024-03-07/bmr.py +++ b/etl/steps/data/garden/democracy/2024-03-07/bmr.py @@ -310,15 +310,15 @@ def add_years_in_democracy(tb: Table) -> Table: - num_years_in_democracy_ws: Total number of years in democracy with women's suffrage. """ ### Count the number of years since the country first became a democracy. Transition NaN -> 1 is considered as 0 -> 1. - tb["num_years_in_democracy_consecutive"] = tb.groupby(["country", tb["regime"].fillna(0).eq(0).cumsum()])[ - "regime" - ].cumsum() + tb["num_years_in_democracy_consecutive"] = tb.groupby( + ["country", tb["regime"].fillna(0).eq(0).astype(int).cumsum()] + )["regime"].cumsum() tb["num_years_in_democracy_consecutive"] = tb["num_years_in_democracy_consecutive"].astype(float) tb["num_years_in_democracy"] = tb.groupby("country")["regime"].cumsum() ## Add democracy age (including women's suffrage) / experience ### Count the number of years since the country first became a democracy. Transition NaN -> 1 is considered as 0 -> 1. 
tb["num_years_in_democracy_ws_consecutive"] = tb.groupby( - ["country", tb["regime_womsuffr"].fillna(0).eq(0).cumsum()] + ["country", tb["regime_womsuffr"].fillna(0).eq(0).astype(int).cumsum()] )["regime_womsuffr"].cumsum() tb["num_years_in_democracy_ws_consecutive"] = tb["num_years_in_democracy_ws_consecutive"].astype(float) tb["num_years_in_democracy_ws"] = tb.groupby("country")["regime_womsuffr"].cumsum() diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py index fddf58c2378..2d2bed25da4 100644 --- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py +++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py @@ -250,7 +250,8 @@ def add_age_and_experience(tb: Table) -> Table: # Replace category numbers with labels (age in *) mapping = {num: label for num, label in REGIME_LABELS.items() if num <= col[2]} mask = (tb[col_age] == 0) | (tb[col_age].isna()) - tb.loc[mask, col_age] = tb.loc[mask, col[0]].replace(mapping) + tb[col_age] = tb[col_age].astype(object) + tb.loc[mask, col_age] = tb.loc[mask, col[0]].astype(object).replace(mapping) return tb diff --git a/etl/steps/data/garden/demography/2022-12-08/population/__init__.py b/etl/steps/data/garden/demography/2022-12-08/population/__init__.py index 0ec4489cb3a..abb23658664 100644 --- a/etl/steps/data/garden/demography/2022-12-08/population/__init__.py +++ b/etl/steps/data/garden/demography/2022-12-08/population/__init__.py @@ -13,6 +13,7 @@ Provides data on former countries, and complements other sources with data on missing years for some countries. More on this dataset please refer to module gapminder_sg. """ + import os from copy import deepcopy from typing import List, cast diff --git a/etl/steps/data/garden/demography/2024-07-18/population_doubling_times.py b/etl/steps/data/garden/demography/2024-07-18/population_doubling_times.py index d6b724730ee..2a3abdd8b76 100644 --- a/etl/steps/data/garden/demography/2024-07-18/population_doubling_times.py +++ b/etl/steps/data/garden/demography/2024-07-18/population_doubling_times.py @@ -8,6 +8,7 @@ NOTE 2: In the future, we might want to have other countries and regions in this dataset. In that scenario, please review all the code below and Grapher's. """ + from typing import cast import numpy as np @@ -130,7 +131,7 @@ def get_target_years(tb: Table) -> Table: ## 2. 
Check if the sign of the population error changes (from negative to positive) ## Keep start and end year of target-crossing ## Tag target-crossing with a number (so that we know that the start- and end-years belong to the same target-crossing) - tb["target_crossing"] = np.sign(tb["population_error"]).diff() > 0 + tb["target_crossing"] = np.sign(tb["population_error"]).diff().fillna(0) > 0 tb["target_crossing"] = np.where(tb["target_crossing"], tb["target_crossing"].cumsum(), 0) tb["target_crossing"] = tb["target_crossing"] + tb["target_crossing"].shift(-1).fillna(0) diff --git a/etl/steps/data/garden/demography/2024-11-26/multiple_births.countries.json b/etl/steps/data/garden/demography/2024-11-26/multiple_births.countries.json new file mode 100644 index 00000000000..98cf9570896 --- /dev/null +++ b/etl/steps/data/garden/demography/2024-11-26/multiple_births.countries.json @@ -0,0 +1,27 @@ +{ + "Australia": "Australia", + "Austria": "Austria", + "Canada": "Canada", + "Chile": "Chile", + "Czech Republic": "Czechia", + "Denmark": "Denmark", + "Finland": "Finland", + "France": "France", + "Germany": "Germany", + "Greece": "Greece", + "Iceland": "Iceland", + "Italy": "Italy", + "Japan": "Japan", + "Lithuania": "Lithuania", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Norway": "Norway", + "Republic of Korea": "South Korea", + "Spain": "Spain", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "United States": "United States", + "Uruguay": "Uruguay", + "UK-England and Wales": "England and Wales", + "UK-Scotland": "Scotland" +} diff --git a/etl/steps/archive/garden/ihme_gbd/2023-03-29/gbd_drug_disorders.excluded_countries.json b/etl/steps/data/garden/demography/2024-11-26/multiple_births.excluded_countries.json similarity index 100% rename from etl/steps/archive/garden/ihme_gbd/2023-03-29/gbd_drug_disorders.excluded_countries.json rename to etl/steps/data/garden/demography/2024-11-26/multiple_births.excluded_countries.json diff --git a/etl/steps/data/garden/demography/2024-11-26/multiple_births.meta.yml b/etl/steps/data/garden/demography/2024-11-26/multiple_births.meta.yml new file mode 100644 index 00000000000..207df2e06ee --- /dev/null +++ b/etl/steps/data/garden/demography/2024-11-26/multiple_births.meta.yml @@ -0,0 +1,110 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Fertility Rate + + display: + entityAnnotationsMap: |- + Chile: Stillbirths excluded + South Korea: Stillbirths excluded + Czechia: Stillbirths included + Denmark: Stillbirths included + England and Wales: Stillbirths included + France: Stillbirths included + Greece: Stillbirths included + Italy: Stillbirths included + Lithuania: Stillbirths included + Netherlands: Stillbirths included + Norway: Stillbirths included + Spain: Stillbirths included + Sweden: Stillbirths included + Switzerland: Stillbirths included + Australia: Stillbirths included in some years + New Zealand: Stillbirths included in some years + Scotland: Stillbirths included in some years + United States: Stillbirths included in some years + Uruguay: Stillbirths included in some years + Austria: Stillbirths typically included + Canada: Stillbirths typically included + Finland: Stillbirths typically included + Germany: Stillbirths typically included + Japan: Stillbirths typically included + + description_processing: |- + Data sometimes includes stillbirths; therefore, comparisons across countries should be made with care.
+ + **Countries including stillbirths**: Czechia, Denmark, England and Wales, France, Greece, Italy, Lithuania, Netherlands, Norway, Spain, Sweden, Switzerland + + **Countries mostly including stillbirths**: Austria (unknown for 1920, 1921, 1928, 1929, 1931, 1934), Canada (unknown for 1921-1925, 1927-1990), Finland (unknown for 1906-1936, 1941-1999), Germany (unknown for 1906-1936), Japan (unknown for 1923-1936) + + **Countries excluding stillbirths**: Chile, South Korea + + **Countries with mixed practices**: Australia, New Zealand (excluded for 1856-1915), United States, Scotland (excluded for 1856-1938), Uruguay + + For more details about the data for a specific country, please refer to the original source. + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + multiple_births: + variables: + singletons: + title: Singleton deliveries + unit: "deliveries" + description_short: |- + Number of single deliveries. + twin_deliveries: + title: Twin deliveries + unit: "deliveries" + description_short: |- + Number of twin deliveries. + multiple_deliveries: + title: Multiple deliveries + unit: "deliveries" + description_short: |- + Total number of multiple deliveries (i.e. the sum of twin, triplet, and quadruplet+ deliveries). + total_deliveries: + title: Total deliveries + unit: "deliveries" + description_short: |- + Total number of deliveries (i.e., single and multiple deliveries combined). + singleton_rate: + title: Singleton rate + unit: "singleton deliveries per 1,000 deliveries" + description_short: |- + The rate of deliveries that are single deliveries, per 1,000 deliveries. + twinning_rate: + title: Twinning delivery rate + unit: "twin deliveries per 1,000 deliveries" + description_short: |- + The rate of twin deliveries, per 1,000 deliveries. + multiple_rate: + title: Multiple delivery rate + unit: "multiple deliveries per 1,000 deliveries" + description_short: |- + The rate of deliveries that are multiple deliveries, per 1,000 deliveries. + children_delivery_ratio: + title: "Children per delivery" + unit: "children per 1,000 deliveries" + description_short: |- + The average number of children born per delivery. This is estimated by dividing the number of children born by the total number of deliveries, per 1,000 deliveries. + children_multiple_delivery_ratio: + title: "Children per multiple delivery" + unit: "children per 1,000 multiple deliveries" + description_short: |- + Number of children born in multiple deliveries divided by the total number of multiple deliveries, per 1,000 multiple deliveries. Gives an indication of the average number of children born in multiple deliveries. + multiple_to_singleton_ratio: + title: "Multiple children to singleton ratio" + unit: "multiple deliveries per 1,000 singleton deliveries" + description_short: |- + Number of multiple deliveries per 1,000 singleton deliveries. + multiple_children: + title: "Multiple children" + unit: "children" + description_short: |- + Number of children born from multiple deliveries (twin babies, triplet babies, etc.)
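To illustrate how the rate and ratio indicators defined above are computed from the raw delivery counts, here is a minimal pandas sketch. The numbers are made up; the column names follow the table schema above and the formulas mirror the step code in multiple_births.py below.

import pandas as pd

# Toy counts for a single country-year.
tb = pd.DataFrame(
    {
        "singletons": [97_000],
        "twin_deliveries": [2_800],
        "multiple_deliveries": [3_000],  # twins + triplets + quadruplets+
        "total_deliveries": [100_000],
        "multiple_children": [6_100],  # children born in multiple deliveries
    }
)

# Rates per 1,000 deliveries, as in the metadata definitions above.
tb["singleton_rate"] = 1_000 * tb["singletons"] / tb["total_deliveries"]
tb["twinning_rate"] = 1_000 * tb["twin_deliveries"] / tb["total_deliveries"]
tb["multiple_rate"] = 1_000 * tb["multiple_deliveries"] / tb["total_deliveries"]

# Ratios: children per 1,000 deliveries, and multiple-to-singleton ratio.
tb["children_delivery_ratio"] = 1_000 * (tb["multiple_children"] + tb["singletons"]) / tb["total_deliveries"]
tb["multiple_to_singleton_ratio"] = 1_000 * tb["multiple_deliveries"] / tb["singletons"]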
diff --git a/etl/steps/data/garden/demography/2024-11-26/multiple_births.py b/etl/steps/data/garden/demography/2024-11-26/multiple_births.py new file mode 100644 index 00000000000..eafd3129e47 --- /dev/null +++ b/etl/steps/data/garden/demography/2024-11-26/multiple_births.py @@ -0,0 +1,225 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +FLAGS_EXPECTED = [ + { + "countries": [ + "Chile", + "South Korea", + ], + "flags": { + 0, + }, + }, + { + "countries": [ + "Czechia", + "Denmark", + "France", + "Greece", + "Italy", + "Lithuania", + "Netherlands", + "Norway", + "Spain", + "Switzerland", + "England and Wales", + ], + "flags": { + 1, + }, + }, + { + "countries": [ + "Australia", + "United States", + "Uruguay", + ], + "flags": { + 2, + }, + }, + { + "countries": [ + "Austria", + "Canada", + "Finland", + "Germany", + "Japan", + ], + "flags": { + 1, + 99, + }, + }, + { + "countries": { + "Iceland", + "New Zealand", + "Sweden", + "Scotland", + }, + "flags": { + 0, + 1, + }, + }, +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("multiple_births") + + # Read table from meadow dataset. + tb = ds_meadow.read("multiple_births") + + # Harmonize country names + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + excluded_countries_file=paths.excluded_countries_path, + ) + + # Sanity check + check_stillbirths(tb) + + # Adapt flags + tb = adapt_stillbirths_flags(tb) + + # Estimate singleton_rate + tb["singleton_rate"] = (1_000 * tb["singletons"] / tb["total_deliveries"]).round(2) + + # Estimate ratios + tb["children_delivery_ratio"] = ( + 1_000 * (tb["multiple_children"] + tb["singletons"]) / tb["total_deliveries"] + ).round(3) + tb["children_multiple_delivery_ratio"] = (1_000 * tb["multiple_children"] / tb["multiple_deliveries"]).round(3) + tb["multiple_to_singleton_ratio"] = (1_000 * tb["multiple_deliveries"] / tb["singletons"]).round(3) + + # Remove outliers + flag = (tb["country"] == "England and Wales") & (tb["year"] == 1938) + assert ( + tb.loc[flag, "children_multiple_delivery_ratio"] >= 4000 + ).all(), "Unexpected outlier for England and Wales in 1938" + tb.loc[flag, ["multiple_children", "children_multiple_delivery_ratio", "children_delivery_ratio"]] = pd.NA + flag = (tb["country"] == "England and Wales") & (tb["year"] == 1939) + assert ( + tb.loc[flag, "children_multiple_delivery_ratio"] <= 1500 + ).all(), "Unexpected outlier for England and Wales in 1939" + tb.loc[flag, ["multiple_children", "children_multiple_delivery_ratio", "children_delivery_ratio"]] = pd.NA + + # Keep relevant columns + tb = tb[ + [ + # Index + "country", + "year", + # Absolute numbers + "singletons", + "twin_deliveries", + "multiple_deliveries", + "total_deliveries", + # Relative numbers + "singleton_rate", + "twinning_rate", + "multiple_rate", + # Ratios + "children_delivery_ratio", + "children_multiple_delivery_ratio", + "multiple_to_singleton_ratio", + # Births + "multiple_children", + ] + ] + + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset.
+ ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def check_stillbirths(tb): + """Datapoints (country-year) are given using different methodologies. + + This is communicated in the 'stillbirths' column, which can vary from country to country (and year to year): + + 0: Stillbirths not included + 1: Stillbirths included + 2: Mixed (stillbirths included in some cases only) + 99: Unsure + + Reference: https://www.twinbirths.org/en/data-metadata/, Table 1 + """ + # Check that the stillbirths flags are as expected. + for expected in FLAGS_EXPECTED: + countries_expected = expected["countries"] + flags_expected = expected["flags"] + + flags_actual = set(tb.loc[tb["country"].isin(countries_expected), "stillbirths"].unique()) + + assert ( + flags_actual == flags_expected + ), f"Flags for countries {countries_expected} are not as expected! Expected: {flags_expected}, found: {flags_actual}" + + # Check Overlaps + ## There are overlaps in New Zealand and Sweden + x = tb.groupby(["country", "year"], as_index=False).stillbirths.nunique() + countries_overlap_expected = {"New Zealand", "Sweden"} + countries_overlap_actual = set(x.loc[x["stillbirths"] != 1, "country"].unique()) + assert ( + countries_overlap_actual == countries_overlap_expected + ), f"Countries with overlapping flags are not as expected! Expected: {countries_overlap_expected}, found: {countries_overlap_actual}" + + +def adapt_stillbirths_flags(tb): + # Iceland: Remove, even though there is no replacement. Keep only 1. + country = "Iceland" + flag = (tb["country"] == country) & (tb["stillbirths"] == 0) + assert len(tb.loc[flag]) == 5, f"Unexpected number of values for {country}" + tb = tb.loc[~flag] + + # If there is 1 and 0, keep 1. + flag = tb.sort_values("stillbirths").duplicated(subset=["country", "year"], keep="last") + assert set(tb.loc[flag, "stillbirths"].unique()) == { + 0 + }, "Rows removed because of duplicate country-year values should only have stillbirths=0!" + tb = tb.loc[~flag] + + # Sweden: after the deduplication above, only stillbirths=1 should remain. + assert set(tb.loc[tb["country"] == "Sweden", "stillbirths"].unique()) == {1}, "Unexpected stillbirths=0 for Sweden!" + + return tb + + +def get_summary_methodology_sb(tb): + tbx = tb.groupby("country", as_index=False)["stillbirths"].agg(["nunique", "unique"]) + + # Only one method + tbx1 = tbx.loc[tbx["nunique"] == 1] + tbx1["unique"] = tbx1["unique"].apply(lambda x: x[0]) + tbx1 = tbx1[["country", "unique"]].sort_values("unique") + + # Multiple methods + tbx2 = tbx.loc[tbx["nunique"] > 1] + countries_mult = set(tbx2["country"].unique()) + tb[tb["country"].isin(countries_mult)].groupby(["country", "stillbirths"]).agg({"year": ("min", "max")}) diff --git a/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml new file mode 100644 index 00000000000..07e19bde3f9 --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml @@ -0,0 +1,44 @@ +# NOTE: To learn more about the fields, hover over their names.
+definitions: + common: + presentation: + topic_tags: + - Life Expectancy + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/en/latest/architecture/metadata/reference/dataset/ +dataset: + title: Survivorship percentiles (HMD; Alvarez and Vaupel 2023) + update_period_days: 365 + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/en/latest/architecture/metadata/reference/tables/ +tables: + survivor_percentiles: + variables: + age: + title: Survivorship age + unit: years + processing_level: major + description_short: |- + <%- if percentile == 1 -%> + The age that the 1st percentile of the population would survive to (i.e., the age reached by 99% of the population), if they experienced the same age-specific death rates throughout their whole lives as the age-specific death rates seen in that particular year. + <%- else -%> + The age that the << percentile>>th percentile of the population would survive to (i.e., the age reached by << 100 - percentile|int>>% of the population), if they experienced the same age-specific death rates throughout their whole lives as the age-specific death rates seen in that particular year. + <%- endif -%> + + description_processing: |- + This was calculated with the method published in Alvarez and Vaupel (2023), with code provided by the authors: + + Jesús-Adrián Alvarez, James W. Vaupel; Mortality as a Function of Survival. Demography 1 February 2023; 60 (1): 327–342. doi: https://doi.org/10.1215/00703370-10429097 + + These estimates were regenerated for data from more recent years in the Human Mortality Database. + + Original R code from: https://github.com/jssalvrz/s-ages + description_key: + - This is calculated with the period life tables indicators. + display: + numDecimalPlaces: 1 + presentation: + attribution: |- + Alvarez & Vaupel (2023); Human Mortality Database (2024) diff --git a/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py new file mode 100644 index 00000000000..1f2b1ef59cc --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py @@ -0,0 +1,137 @@ +"""Load a meadow dataset and create a garden dataset. + +Methods used here are taken from https://github.com/jssalvrz/s-ages. Citation: Alvarez, J.-A., & Vaupel, J. W. (2023). Mortality as a Function of Survival. Demography, 60(1), 327–342. https://doi.org/10.1215/00703370-10429097 + + +Dr. Saloni Dattani translated the R scripts into Python: + - Original: https://github.com/jssalvrz/s-ages + - Translated: https://github.com/saloni-nd/misc/tree/main/survivorship-ages + +Lucas Rodes-Guirao adapted the Python code for ETL. +""" + +import numpy as np +import pandas as pd +from owid.catalog import Table +from scipy.integrate import cumulative_trapezoid as cumtrapz +from scipy.interpolate import InterpolatedUnivariateSpline + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + paths.log.info("load data.") + # Load meadow dataset. + ds_meadow = paths.load_dataset("hmd") + + # Read table from meadow dataset. + tb_deaths = ds_meadow.read("deaths") + tb_exposure = ds_meadow.read("exposures") + + # + # Process data.
+ # + # Combine tables, drop NaNs + tb = tb_deaths.merge(tb_exposure, on=["country", "year", "sex", "age"], how="outer") + tb = tb.dropna(subset=["deaths", "exposure"], how="any") + + # Keep format="1x1" and type="period" + paths.log.info("keep period & 1-year data.") + tb = tb.loc[tb["age"].str.match(r"^(\d{1,3}|\d{3}\+)$") & (tb["type"] == "period")] + + # Drop unused columns + tb = tb.drop(columns=["type"]) + + # 110+ -> 110 + paths.log.info("replace 110+ -> 110, set dtypes.") + tb["age"] = tb["age"].replace({"110+": "110"}).astype(int) + + # Sort + tb = tb.sort_values(["year", "age"]) + + # Actual calculation + paths.log.info("calculate survivorship ages (can take some minutes)...") + columns_grouping = ["country", "sex", "year"] + tb = tb.groupby(columns_grouping).apply(lambda group: obtain_survivorship_ages(group)).reset_index() # type: ignore + + # Unpivot + paths.log.info("reshape table") + tb = tb.melt( + id_vars=["country", "sex", "year"], + value_vars=["s1", "s10", "s20", "s30", "s40", "s50", "s60", "s70", "s80", "s90", "s99"], + var_name="percentile", + value_name="age", + ) + tb = tb.dropna(subset=["percentile"]) + tb["percentile"] = tb["percentile"].str.replace("s", "").astype(int) + tb["percentile"] = 100 - tb["percentile"] + + # Propagate metadata + tb["age"].metadata.origins = tb_exposure["exposure"].m.origins.copy() + + # Set index + paths.log.info("format") + tb = tb.format(["country", "year", "sex", "percentile"], short_name="survivor_percentiles") + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def obtain_survivorship_ages(tb_group: Table, start_age: int = 0, end_age: int = 110) -> pd.DataFrame: + """Get survivorship ages given a life and deaths table. + + Output dataframe has a column for each percentile of survivorship age. + + tb_group is expected to be a subset of the complete table. It should only concern a particular (country, year, sex) triple. + """ + # Step 1: Apply splines, get Mx for each (country, year, sex, age) + ## Define splines + ### We could use CubicSpline (k=3 order), but it provides slightly different results; hence, as a precaution, we stuck with InterpolatedUnivariateSpline.
+ ### This is equivalent to R function interpSpline + spline_deaths = InterpolatedUnivariateSpline(tb_group["age"], tb_group["deaths"], k=3) + spline_exposures = InterpolatedUnivariateSpline(tb_group["age"], tb_group["exposure"], k=3) + + ## Define age range (with step 0.01) + age_range = np.arange(start_age, end_age, 0.01) + + # Run splines over age range + deaths_spline = np.abs(spline_deaths(age_range)) + exposure_spline = np.abs(spline_exposures(age_range)) + exposure_spline[exposure_spline == 0] = np.nan + survival_age_spline = np.abs(deaths_spline / exposure_spline) + + # Step 2: Calculate survival, density, hazard, and cumulative hazards + ## Estimate parameters + Hx = cumtrapz(y=survival_age_spline, x=age_range, initial=0) # Hazard CDF + Sx = np.exp(-Hx) # Survivor function + + # Step 3: Calculate survivorship ages from parameters + out = {} + out["s0"] = max(age_range) + ## I'm using a for loop to simplify the logic here + for i in range(1, 101): + try: + sx_rounded = np.ceil((100 * Sx).round(3)) + value = age_range[sx_rounded == i][0] + out[f"s{i}"] = value + except IndexError: + out[f"s{i}"] = np.nan + + # Create output dataframe + df = pd.DataFrame(out, index=[0]) + + return df diff --git a/etl/steps/data/garden/demography/2024-12-03/birth_rate.meta.yml b/etl/steps/data/garden/demography/2024-12-03/birth_rate.meta.yml new file mode 100644 index 00000000000..0ee61f6edb7 --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-03/birth_rate.meta.yml @@ -0,0 +1,45 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + title_public: Birth rate + topic_tags: + - Fertility Rate + display: + name: |- + Birth rate + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + title: Birth Rate (HMD; UN WPP) + update_period_days: 365 + +tables: + birth_rate: + variables: + birth_rate: + title: Birth rate + unit: births per 1,000 people + description_short: |- + The total number of births per 1,000 people in a given year. + description_processing: |- + The birth data is constructed by combining data from multiple sources: + + - Before 1949: Historical estimates by Human Mortality Database (2024). + + - 1950-2023: Population records by the UN World Population Prospects (2024 revision). + + - 2024-2100: Projections based on Medium variant by the UN World Population Prospects (2024 revision). + + birth_rate_hist: + title: Birth rate (historical) + unit: births per 1,000 people + description_short: |- + The total number of births per 1,000 people in a given year. + description_processing: |- + The birth data is constructed by combining data from multiple sources: + + - Before 1949: Historical estimates by Human Mortality Database (2024). + + - 1950-2023: Population records by the UN World Population Prospects (2024 revision). diff --git a/etl/steps/data/garden/demography/2024-12-03/birth_rate.py b/etl/steps/data/garden/demography/2024-12-03/birth_rate.py new file mode 100644 index 00000000000..02508c2497b --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-03/birth_rate.py @@ -0,0 +1,62 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import owid.catalog.processing as pr +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +YEAR_WPP_PROJ_START = 2024 +YEAR_WPP_START = 1950 + + +def run(dest_dir: str) -> None: + # + # Load inputs. 
+ # + # Load meadow dataset. + ds_hmd = paths.load_dataset("hmd") + ds_un = paths.load_dataset("un_wpp") + + # Read table from meadow dataset. + tb_hmd = ds_hmd.read("births") + tb_un = ds_un.read("births") + + # + # Process data. + # + # UN + tb_un = tb_un.loc[ + (tb_un["age"] == "all") & (tb_un["variant"].isin(["medium", "estimates"])), + ["country", "year", "birth_rate"], + ] + # HMD + tb_hmd = tb_hmd.loc[ + (tb_hmd["year"] < YEAR_WPP_START) & (tb_hmd["sex"] == "total"), ["country", "year", "birth_rate"] + ] + + # Combine + tb = pr.concat([tb_hmd, tb_un], ignore_index=True, short_name="birth_rate") + tb = tb.dropna(subset=["birth_rate"]) + + # Add historical variant + tb["birth_rate_hist"] = tb["birth_rate"].copy() + tb.loc[tb["year"] >= YEAR_WPP_PROJ_START, "birth_rate_hist"] = pd.NA + + # Format + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=[tb], + check_variables_metadata=True, + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/garden/demography/2024-12-03/broken_limits_le.meta.yml b/etl/steps/data/garden/demography/2024-12-03/broken_limits_le.meta.yml new file mode 100644 index 00000000000..b8511924d1d --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-03/broken_limits_le.meta.yml @@ -0,0 +1,51 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Life Expectancy + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + title: "Life Expectancy: Broken limits" + update_period_days: 365 + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + broken_limits_le: + variables: + life_expectancy: + title: &le_name Maximum life expectancy + unit: years + description_short: |- + <%- if (sex == 'female') -%> + Maximum life expectancy recorded in a given year (among females). + <%- elif (sex == 'male') -%> + Maximum life expectancy recorded in a given year (among males). + <%- elif (sex == 'all') -%> + Maximum life expectancy recorded in a given year. + <%- endif -%> + description_key: + - Period life expectancy is a metric that summarizes death rates across all age groups in one particular year. For a given year, it represents the average lifespan for a hypothetical group of people, if they experienced the same age-specific death rates throughout their lives as the age-specific death rates seen in that particular year. + - Records are only shown for countries in the Human Mortality Database. Prior to 1950, we use HMD (2023) data. From 1950 onwards, we use UN WPP (2022) data. + display: + name: *le_name + presentation: + title_public: *le_name + title_variant: "" + attribution_short: HMD; UN WPP + topic_tags: + - Life Expectancy + grapher_config: + hasMapTab: true + + country_with_max_le: + title: Country with yearly maximum life expectancy + unit: "" + description_short: |- + Name of the country with the yearly maximum life expectancy registered<%- if (sex == 'female') %> among females<% elif (sex == 'male') %> among males<% endif -%>. + description_processing: This indicator is meant to be used as an auxiliary indicator.
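The core of the broken-limits step below is a record-selection pass: for each (year, sex) group, keep the row with the maximum life expectancy, then label the entity with the record-holding country and year. Here is a minimal sketch of that logic with made-up toy data (the actual step operates on the HMD/UN WPP life tables and applies extra filters):

import pandas as pd

# Toy life-expectancy table: one row per (country, year, sex).
tb = pd.DataFrame(
    {
        "country": ["Norway", "Sweden", "Norway", "Sweden"],
        "year": [1900, 1900, 1901, 1901],
        "sex": ["all"] * 4,
        "life_expectancy": [54.1, 53.2, 53.9, 54.5],
    }
)

# For each (year, sex), keep the row with the maximum life expectancy,
# mirroring the groupby/idxmax selection in the step code below.
tb_max = tb.loc[tb.groupby(["year", "sex"])["life_expectancy"].idxmax()]

# Record which country held the record, and label entities as "<country> <year>".
tb_max["country_with_max_le"] = tb_max["country"]
tb_max["country"] = tb_max["country"] + " " + tb_max["year"].astype("string")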
diff --git a/etl/steps/data/garden/demography/2024-12-03/broken_limits_le.py b/etl/steps/data/garden/demography/2024-12-03/broken_limits_le.py new file mode 100644 index 00000000000..2e64cba9275 --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-03/broken_limits_le.py @@ -0,0 +1,76 @@ +"""Load a meadow dataset and create a garden dataset. + +We only consider data from countries that are present in HMD. Additionally, we only consider entries for these countries from the year they first appear in the HMD dataset (even if, for that period, we use UN WPP data, i.e. post-1950). +""" + +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) +# Year to start tracking. Note that in the first years, few countries have data. Hence, we start in a later year, where more countries have data. +YEAR_FIRST = 1840 + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("life_tables") + ds_hmd = paths.load_dataset("hmd") + + # Read table from meadow dataset. + tb = ds_meadow.read("life_tables", reset_index=False) + tb_hmd = ds_hmd.read("life_tables") + + # + # Process data. + # + # Filter relevant dimensions + tb = tb.loc[(slice(None), slice(None), slice(None), "0", "period"), ["life_expectancy"]].reset_index() + + # Keep relevant columns and rows + tb = tb.drop(columns=["type", "age"]).dropna() + + # Rename column + tb = tb.rename(columns={"location": "country"}) + + # Get country-sex and first year of LE reported in HMD + tb_hmd = get_first_year_of_country_in_hmd(tb_hmd) + + # Only preserve countries coming from HMD + tb = tb.merge(tb_hmd, on=["country", "sex"], suffixes=("", "_min")) + tb = tb[tb["year"] >= tb["year_min"]].drop(columns=["year_min"]) + + # Get max for each year + tb = tb.loc[tb.groupby(["year", "sex"], observed=True)["life_expectancy"].idxmax()] + + # Organise columns + tb["country_with_max_le"] = tb["country"] + tb["country"] = tb["country"] + " " + tb["year"].astype("string") + + # Keep only years from YEAR_FIRST onwards + tb = tb[tb["year"] >= YEAR_FIRST] + + # Set index + tb = tb.format(["country", "year", "sex"], short_name="broken_limits_le") + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def get_first_year_of_country_in_hmd(tb_hmd: Table) -> Table: + tb_hmd = tb_hmd.loc[(tb_hmd["type"] == "period") & (tb_hmd["age"] == "0")] + tb_hmd = tb_hmd.loc[:, ["country", "year", "sex", "life_expectancy"]].dropna() + tb_hmd = tb_hmd.groupby(["country", "sex"], observed=True, as_index=False)["year"].min() + return tb_hmd diff --git a/etl/steps/data/garden/demography/2024-12-03/fertility_rate.meta.yml b/etl/steps/data/garden/demography/2024-12-03/fertility_rate.meta.yml new file mode 100644 index 00000000000..3390773b651 --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-03/fertility_rate.meta.yml @@ -0,0 +1,55 @@ +# NOTE: To learn more about the fields, hover over their names.
+definitions: + common: + presentation: + topic_tags: + - Fertility Rate + attribution: UN WPP (2024); HFD (2024) + processing_level: major + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + title: Fertility Rate (UN WPP; HFD) + update_period_days: 365 + +tables: + fertility_rate: + variables: + fertility_rate: + title: Fertility rate (period) + description_short: |- + The average number of live births a hypothetical cohort of women would have at the end of their reproductive period if they were subject during their whole lives to the fertility rates of a given period and if they were not subject to mortality. + description_key: + - Assumes current age-specific fertility rates remain constant throughout a woman's lifetime. + - Does not account for potential changes in social, economic, or health conditions that could affect fertility rates. + unit: live births per woman + description_processing: |- + The fertility data is constructed by combining data from multiple sources: + + - Before 1949: Historical estimates by Human Fertility Database (2024). + + - 1950-2023: Population records by the UN World Population Prospects (2024 revision). + + - 2024-2100: Projections based on Medium variant by the UN World Population Prospects (2024 revision). + presentation: + title_public: Fertility rate + title_variant: period tables + + fertility_rate_hist: + title: Fertility rate (period), historical + description_short: |- + The average number of live births a hypothetical cohort of women would have at the end of their reproductive period if they were subject during their whole lives to the fertility rates of a given period and if they were not subject to mortality. + description_key: + - Assumes current age-specific fertility rates remain constant throughout a woman's lifetime. + - Does not account for potential changes in social, economic, or health conditions that could affect fertility rates. + unit: live births per woman + description_processing: |- + The fertility data is constructed by combining data from multiple sources: + + - Before 1949: Historical estimates by Human Fertility Database (2024). + + - 1950-2023: Population records by the UN World Population Prospects (2024 revision). + presentation: + title_public: Fertility rate + title_variant: period tables diff --git a/etl/steps/data/garden/demography/2024-12-03/fertility_rate.py b/etl/steps/data/garden/demography/2024-12-03/fertility_rate.py new file mode 100644 index 00000000000..d2f339089c8 --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-03/fertility_rate.py @@ -0,0 +1,61 @@ +"""Load a meadow dataset and create a garden dataset.""" +import pandas as pd +from owid.catalog import processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Year constants +YEAR_WPP_START = 1950 +YEAR_WPP_PROJ_START = 2023 + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_hfd = paths.load_dataset("hfd") + ds_un = paths.load_dataset("un_wpp") + + # Read table from meadow dataset. 
+ tb_hfd = ds_hfd.read("period") + tb_un = ds_un.read("fertility_rate") + + # UN: estimates + medium + tb_un = tb_un.loc[ + (tb_un["sex"] == "all") & (tb_un["variant"].isin(["medium", "estimates"]) & (tb_un["age"] == "all")), + ["country", "year", "fertility_rate"], + ] + + # HFD: tfr, birth_order=total + tb_hfd = tb_hfd.loc[ + ((tb_hfd["birth_order"] == "total") & (tb_hfd["year"] < YEAR_WPP_START)), ["country", "year", "tfr"] + ].rename(columns={"tfr": "fertility_rate"}) + + # + # Process data. + # + tb = pr.concat([tb_hfd, tb_un], ignore_index=True, short_name="fertility_rate") + + # Add historical variant + tb["fertility_rate_hist"] = tb["fertility_rate"].copy() + tb.loc[tb["year"] > YEAR_WPP_PROJ_START, "fertility_rate_hist"] = pd.NA + + # Format + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset( + dest_dir, + tables=[tb], + check_variables_metadata=True, + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/garden/demography/2024-12-03/gini_le.meta.yml b/etl/steps/data/garden/demography/2024-12-03/gini_le.meta.yml new file mode 100644 index 00000000000..d0b96883e79 --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-03/gini_le.meta.yml @@ -0,0 +1,35 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Life Expectancy + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + title: Gini coefficient of lifespan inequality (HMD, UN WPP, Aburto et al.; 2023) + update_period_days: 365 + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + gini_le: + variables: + life_expectancy_gini: + title: Gini coefficient of lifespan inequality + unit: "" + processing_level: major + description_short: |- + The level of inequality in lifespans, measured between 0 and 1. + description_key: + - |- + {tables.gini_le.variables.life_expectancy_gini.description_short} + - A higher coefficient indicates greater inequality in ages of death, while a lower coefficient indicates more uniform ages of death. + description_processing: |- + This was calculated using the algorithm and scripts from Aburto et al. (2020). We regenerated the Gini coefficient, rather than the inverse-log Gini coefficient. + + Citation: Aburto, J. M., Villavicencio, F., Basellini, U., Kjærgaard, S., & Vaupel, J. W. (2020). Dynamics of life expectancy and life span equality. Proceedings of the National Academy of Sciences, 117(10), 5250–5259. https://doi.org/10.1073/pnas.1915884117 Code available on Zenodo: https://zenodo.org/record/3571095 + presentation: + attribution: |- + Human Mortality Database (2024); Aburto et al. (2023) diff --git a/etl/steps/data/garden/demography/2024-12-03/gini_le.py b/etl/steps/data/garden/demography/2024-12-03/gini_le.py new file mode 100644 index 00000000000..4035c4718b1 --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-03/gini_le.py @@ -0,0 +1,148 @@ +"""Estimate the Gini index of life expectancy.""" + +from typing import Any, cast + +import numpy as np +from numpy.typing import NDArray +from owid.catalog import Table, Variable + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step.
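Before the full implementation of gini_le.py below, a toy illustration of the quantity it computes, per the Aburto et al. (2020) description above: the Gini coefficient of lifespan inequality is the mean absolute difference between ages at death, normalized by twice life expectancy at birth. The numbers here are invented; the real step derives the death distribution from central death rates:

import numpy as np

ages = np.array([0.5, 1.5, 2.5, 3.5])  # average age at death within each interval
dx = np.array([0.1, 0.2, 0.3, 0.4])    # hypothetical share of deaths per interval (sums to 1)

e0 = np.sum(ages * dx)                     # life expectancy at birth
X = np.abs(np.subtract.outer(ages, ages))  # age gap for every pair of deaths
D = np.outer(dx, dx)                       # joint weight of every pair of deaths
gini = np.sum(D * X) / (2 * e0)
print(round(gini, 3))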
+paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + paths.log.info("gini_le: load data") + # Load meadow dataset. + ds_meadow = paths.load_dataset("life_tables") + + # Read table from meadow dataset. + tb = ds_meadow.read("life_tables") + + # + # Process data. + # + # Keep relevant dimensions + paths.log.info("gini_le: keep relevant dimensions (type='period', sex in ['male', 'female'])") + tb = tb[(tb["type"] == "period") & (tb["sex"].isin(["male", "female"]))] + + # Get origins + origins = tb["life_expectancy"].m.origins + + # Get rate for central_death_rate, as it is given per 1,000 people. + paths.log.info("gini_le: get rate for central_death_rate, as it is given per 1,000 people.") + tb["central_death_rate"] = tb["central_death_rate"] / 1000 + + # 110+ -> 110, 100+ -> 100 + paths.log.info("gini_le: replace 100+ -> 100, 110+ -> 110, set dtypes.") + tb["age"] = ( + tb["age"] + .replace( + { + "110+": "110", + "100+": "100", + } + ) + .astype("Int64") + ) + + # Sort rows + paths.log.info("gini_le: sort rows (needed for correct estimation)") + tb = tb.sort_values(["country", "year", "sex", "age"]) + + # Estimates (this can take half a minute or so, depending on computation power) + tb = tb.groupby(["country", "year", "sex"], as_index=False, observed=False).apply(gini_from_mx) + tb.life_expectancy_gini.m.origins = origins + + # Rename columns + paths.log.info("gini_le: rename columns") + tb = tb.rename(columns={"central_death_rate": "life_expectancy_gini"}) + + # Set index + paths.log.info("gini_le: set index") + tb = tb.format(["country", "year", "sex"], short_name=paths.short_name) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def AKm02a0(m0: float, is_male: bool = True) -> NDArray[Any]: + """Estimate ax for age 0. + + Calculates the average number of years lived in the first year of life (ax for age 0), based on the mortality rate during the first year of life (m0). + + The procedure differs slightly for males and females. + + More details: https://www.rdocumentation.org/packages/MortHump/versions/0.2/topics/AKm02a0 + """ + if is_male: + return np.where(m0 < 0.0230, 0.14929 - 1.99545 * m0, np.where(m0 < 0.08307, 0.02832 + 3.26201 * m0, 0.29915)) + else: + return np.where(m0 < 0.01724, 0.14903 - 2.05527 * m0, np.where(m0 < 0.06891, 0.04667 + 3.88089 * m0, 0.31411)) + + +def gini_from_mx(tb_group: Table) -> Variable: + """Get Gini coefficient from central death rate.
+ + This code is adapted from the original R code: https://github.com/jmaburto/Dynamics_Code/tree/V1.0/R%20code + """ + # Get values from input + mx = tb_group["central_death_rate"].values + is_male = tb_group.name[2] == "male" + + # Estimate i_openage, ax + i_openage = len(mx) + m0 = cast(float, mx[0]) + ax = np.full_like(mx, 0.5) + ax[0] = AKm02a0(m0=m0, is_male=is_male) + ax[i_openage - 1] = 1 / mx[i_openage - 1] # type: ignore + + # Estimate X_ + age = np.arange(i_openage) + ax + e = np.ones_like(age) + X_ = np.abs(np.outer(e, age) - np.outer(age, e)) + + # Estimate D + OPENAGE = i_openage - 1 + ## Probability of dying in each age interval + qx = mx / (1 + (1 - ax) * mx) # type: ignore + qx[i_openage - 1] = 1 if not np.isnan(qx[i_openage - 1]) else np.nan + ## Probability of surviving in each age interval + px = 1 - qx + px[np.isnan(px)] = 0 + ## number of survivors at the start of each interval + RADIX = 1 # starting value + lx = np.concatenate(([RADIX], RADIX * np.cumprod(px[:OPENAGE]))) + ## number of people who die in each interval + dx = lx * qx + ## number of person years lived in each interval + ## [number of survivors at the start of the interval] - (1 - [average fraction of the interval lived by those who die in it]) * [number who die in the interval] + Lx = lx - (1 - ax) * dx + Lx[i_openage - 1] = lx[i_openage - 1] * ax[i_openage - 1] + ## total number of life years from a given age to the end of the cohort + Tx = np.concatenate((np.cumsum(Lx[:OPENAGE][::-1])[::-1], [0])) + Lx[i_openage - 1] + ## life expectancy + ex = Tx / lx + ## matrix with the number of deaths for each age-pair combination + D = np.outer(dx, dx) + + # Estimate Gini + ## total inequality in lifespans: sum of the product of the matrix D by the age difference, np.sum(D * X_) + ## divided by the life expectancy at birth x2 (helps to normalise it to a number between 0 and 1) + G = np.sum(D * X_) / (2 * ex[0]) + + var = Variable({"life_expectancy_gini": G}) + return var diff --git a/etl/steps/data/garden/demography/2024-12-03/life_expectancy.meta.yml b/etl/steps/data/garden/demography/2024-12-03/life_expectancy.meta.yml new file mode 100644 index 00000000000..c22b9206970 --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-03/life_expectancy.meta.yml @@ -0,0 +1,166 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + global: + selected_entities: + with_americas: &sel_entities_americas + - Africa + - Americas + - Asia + - Europe + - Oceania + - World + standard: &sel_entities + - Africa + - Northern America + - Latin America and the Caribbean + - Asia + - Europe + - Oceania + - World + title_base: Period life expectancy + title_public: Life expectancy + data_used: |- + <%- if (age == 0) and (sex == 'total') -%> + Prior to 1950, we use HMD (2024) data combined with Zijdeman et al. (2015). From 1950 onwards, we use UN WPP (2024) data. For pre-1950 regional data, we use Riley (2005) estimates. + <%- else -%> + Prior to 1950, we use HMD (2024) data. From 1950 onwards, we use UN WPP (2024) data. + <%- endif -%> + point_1: |- + Period life expectancy is a metric that summarizes death rates across all age groups in one particular year. + point_2: |- + <%- if age == '0' -%> + For a given year, it represents the average lifespan for a hypothetical group of people, if they experienced the same age-specific death rates throughout their whole lives as the age-specific death rates seen in that particular year.
+ <%- else -%> + For a given year, it represents the remaining average lifespan for a hypothetical group of people, if they experienced the same age-specific death rates throughout the rest of their lives as the age-specific death rates seen in that particular year. + <%- endif -%> + + common: + description_short: |- + <%- if age == 0 -%> + <%- if sex == 'total' -%> + The period life expectancy at birth, in a given year. + <%- else -%> + The period life expectancy at birth among << sex + 's' >>, in a given year. + <%- endif -%> + <%- else -%> + <%- if sex == 'total' -%> + The total period life expectancy at age << age >>, in a given year. + <%- else -%> + The total period life expectancy at age << age >> among << sex + 's' >>, in a given year. + <%- endif -%> + <%- endif -%> + description_key: + - |- + {definitions.global.point_1} + - |- + {definitions.global.point_2} + - |- + {definitions.global.data_used} + presentation: + title_public: |- + {definitions.global.title_public} at << age if age != 0 else 'birth'>> + attribution_short: HMD, UN WPP + topic_tags: + - Life Expectancy + grapher_config: + hasMapTab: true + selectedEntityNames: *sel_entities + display: + numDecimalPlaces: 1 + unit: years + short_unit: years + processing_level: minor + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + title: Life Expectancy (period) + update_period_days: 365 + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + # HISTORICAL VALUES (until today) + life_expectancy: + variables: + # {definitions.global.title_base} + life_expectancy: + title: |- + {definitions.global.title_base} + display: &display_hist + numDecimalPlaces: 1 + name: |- + {definitions.global.title_public} at << 'birth' if (age == 0) else age >><< ', ' + sex + 's' if (sex != 'total') >> + presentation: + title_variant: &title_variant_hist << sex + 's, ' if sex != 'total' >>period tables + + life_expectancy_0: + title: |- + {definitions.global.title_base} at birth + display: *display_hist + presentation: + attribution_short: Various sources + title_variant: *title_variant_hist + grapher_config: + selectedEntityNames: *sel_entities_americas + attribution: |- + UN WPP (2024); HMD (2024); Zijdeman et al. (2015); Riley (2005) + + # WITH PROJECTIONS + life_expectancy_with_proj: + variables: + life_expectancy_with_proj: + title: |- + {definitions.global.title_base} (with projections) + display: &display_w_proj + numDecimalPlaces: 1 + name: |- + {definitions.global.title_public} at << 'birth' if (age == 0) else age >><< ', ' + sex + 's' if (sex != 'total') >>, with UN medium projections + presentation: + title_variant: &title_variant_w_proj << sex + 's, ' if sex != 'total' >>period tables, with UN medium projections + + life_expectancy_0_with_proj: + title: |- + {definitions.global.title_base} at birth (with projections) + display: *display_w_proj + presentation: + attribution_short: Various sources + title_variant: *title_variant_w_proj + grapher_config: + selectedEntityNames: *sel_entities_americas + attribution: |- + UN WPP (2024); HMD (2024); Zijdeman et al. 
(2015); Riley (2005) + + # ONLY PROJECTIONS + life_expectancy_only_proj: + variables: + life_expectancy_only_proj: + title: |- + {definitions.global.title_base} (only projections) + description_key: + - |- + {definitions.global.point_1} + - |- + {definitions.global.point_2} + display: &display_o_proj + numDecimalPlaces: 1 + name: |- + {definitions.global.title_public} at << 'birth' if (age == 0) else age >><< ', ' + sex + 's' if (sex != 'total') >>, medium projection + presentation: + attribution_short: &attr_o_proj UN WPP + title_variant: &title_variant_o_proj << sex + 's, ' if sex != 'total' >>period tables, medium projection + + life_expectancy_0_only_proj: + title: |- + {definitions.global.title_base} at birth (only projections) + description_key: + - |- + {definitions.global.point_1} + - |- + {definitions.global.point_2} + display: *display_o_proj + presentation: + attribution_short: *attr_o_proj + title_variant: *title_variant_o_proj + grapher_config: + selectedEntityNames: *sel_entities_americas diff --git a/etl/steps/data/garden/demography/2024-12-03/life_expectancy.py b/etl/steps/data/garden/demography/2024-12-03/life_expectancy.py new file mode 100644 index 00000000000..322f8af2631 --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-03/life_expectancy.py @@ -0,0 +1,368 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import owid.catalog.processing as pr +from owid.catalog import Dataset, Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) +# Year of last estimate +YEAR_ESTIMATE_LAST = 2023 +YEAR_WPP_START = 1950 + +# Region mapping +# We will be using continent names without the '(Entity)' suffix. This way charts show continuity between lines from different datasets (e.g. Riley and UN) +REGION_MAPPING = { + "Africa (Riley 2005)": "Africa", + "Americas (Riley 2005)": "Americas", + "Asia (Riley 2005)": "Asia", + "Europe (Riley 2005)": "Europe", + "Oceania (Riley 2005)": "Oceania", + "Africa (UN)": "Africa", + "Northern America (UN)": "Northern America", + "Latin America and the Caribbean (UN)": "Latin America and the Caribbean", + "Asia (UN)": "Asia", + "Europe (UN)": "Europe", + "Oceania (UN)": "Oceania", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + ## Life tables + paths.log.info("reading dataset `life_tables`") + ds_lt = paths.load_dataset("life_tables") + tb_lt = ds_lt.read("life_tables") + ## zijdeman_et_al_2015 + paths.log.info("reading dataset `zijdeman_et_al_2015`") + ds_zi = paths.load_dataset("zijdeman_et_al_2015") + tb_zi = ds_zi.read("zijdeman_et_al_2015") + ## Riley + paths.log.info("reading dataset `riley_2005`") + ds_ri = paths.load_dataset("riley_2005") + tb_ri = ds_ri.read("riley_2005") + ## WPP + paths.log.info("reading dataset `un_wpp`") + ds_un = paths.load_dataset("un_wpp") + tb_un = ds_un.read("life_expectancy") + + # + # Process data.
+ # + paths.log.info("processing data") + tb_lt = process_lt(tb_lt) + tb_un = process_un(tb_un) + tb_zi = process_zi(tb_zi) + tb_ri = process_ri(tb_ri) + + paths.log.info("combining tables") + tb = combine_tables(tb_lt, tb_un, tb_zi, tb_ri) + + # Rename regions (e.g. 'Africa (UN)' -> 'Africa') + tb["country"] = tb["country"].replace(REGION_MAPPING) + + # Add Americas + # tb = add_americas(tb, ds_un) + + ## Check values + paths.log.info("final checks") + _check_column_values(tb, "sex", {"total", "male", "female"}) + _check_column_values(tb, "age", {0, 10, 15, 25, 45, 65, 80}) + + # Create three tables: (i) only historical values, (ii) only future values, (iii) all values + columns_index = ["country", "year", "sex", "age"] + + ## (i) Main table (historical values) + tb_main = tb.loc[tb["year"] <= YEAR_ESTIMATE_LAST].copy() + + ## (ii) Only projections + tb_only_proj = tb.loc[tb["year"] > YEAR_ESTIMATE_LAST].copy() + tb_only_proj = _add_suffix_to_indicators(tb_only_proj, "_only_proj", columns_index=columns_index) + ## Table only with projections should only contain UN as origin + origins_un = [origin for origin in tb_main["life_expectancy"].m.origins if origin.producer == "United Nations"] + for col in tb_only_proj.columns: + tb_only_proj[col].origins = origins_un + + ## (iii) All values + tb_with_proj = tb.copy() + # Only preserve ages that have projections (i.e. data after YEAR_ESTIMATE_LAST) + ages_with_projections = set(tb_with_proj.loc[tb_with_proj["year"] > YEAR_ESTIMATE_LAST, "age"].unique()) + tb_with_proj = tb_with_proj.loc[tb_with_proj["age"].isin(ages_with_projections)] + # Column names + tb_with_proj = _add_suffix_to_indicators(tb_with_proj, "_with_proj", columns_index=columns_index) + + # Format + tables = [ + tb_main.format(columns_index, short_name=paths.short_name), + tb_only_proj.format(columns_index, short_name=f"{paths.short_name}_only_proj"), + tb_with_proj.format(columns_index, short_name=f"{paths.short_name}_with_proj"), + ] + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset(dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_lt.metadata) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def _add_suffix_to_indicators(tb, suffix, columns_index=None): + if columns_index is None: + columns_index = [] + tb.columns = [f"{col}{suffix}" if col not in columns_index else col for col in tb.columns] + return tb + + +def process_lt(tb: Table) -> Table: + """Process LT data and output it in the desired format. + + Desired format is with columns country, year, sex, age | life_expectancy. + """ + tb = tb.loc[ + (tb["age"].isin(["0", "10", "15", "25", "45", "65", "80"])) & (tb["type"] == "period"), + ["country", "year", "sex", "age", "life_expectancy"], + ] + + # Assign dtype + tb["age"] = tb["age"].astype("Int64") + + # Convert remaining life expectancy at age x into total expected lifespan (x + e_x) + tb["life_expectancy"] = tb["life_expectancy"] + tb["age"] + + # Check latest year + assert ( + tb["year"].max() == YEAR_ESTIMATE_LAST + ), f"Last year was {tb['year'].max()}, but should be {YEAR_ESTIMATE_LAST}" + + # Check column values + ## sex + _check_column_values(tb, "sex", {"total", "female", "male"}) + ## age + _check_column_values(tb, "age", {0, 10, 15, 25, 45, 65, 80}) + + return tb + + +def process_un(tb: Table) -> Table: + """Process UN WPP data and output it in the desired format. + + Desired format is with columns country, year, sex, age | life_expectancy.
+ """ + # Sanity check + assert ( + tb["year"].min() == YEAR_WPP_START + ), f"Year of first estimate is different than {YEAR_WPP_START}, it is {tb['year'].min()}" + + # Filter + ## dimension values: metric=life_expectancy, variant=medium, year >= YEAR_ESTIMATE_LAST + ## columns: country, year, value, sex, age + tb = tb.loc[ + (tb["year"] > YEAR_ESTIMATE_LAST) & (tb["variant"] == "medium"), + ["country", "year", "sex", "age", "life_expectancy"], + ] + + # Rename column values + tb["age"] = tb["age"].replace({"at birth": "0"}).astype("Int64") + tb["sex"] = tb["sex"].replace({"all": "total"}) + + # Check column values + ## sex + _check_column_values(tb, "sex", {"total", "female", "male"}) + ## age + _check_column_values(tb, "age", {0, 15, 65, 80}) + + # Check minimum year + assert ( + tb.groupby("country", observed=True).year.min() == YEAR_ESTIMATE_LAST + 1 + ).all(), f"Some entry with latest year different than {YEAR_ESTIMATE_LAST}" + + return tb + + +def process_zi(tb: Table) -> Table: + """Process Zijdeman data and output it in the desired format. + + Desired format is with columns country, year, sex, age | life_expectancy. + """ + # Filter + ## dimension values: metric=life_expectancy, variant=medium, year >= YEAR_ESTIMATE_LAST + ## columns: country, year, value, sex, age + tb = tb.loc[(tb["year"] <= YEAR_ESTIMATE_LAST)] + + # Add columns + # tb["type"] = "period" + tb["age"] = 0 + tb["sex"] = "total" + + # Resolution + tb["life_expectancy"] = tb["life_expectancy"].astype("Float64").round(3) + + # Dtypes + tb = tb.astype( + { + "age": "Int64", + "sex": "string", + } + ) + + # Sanity check + assert tb["year"].max() == 2012, f"Last year was {tb['year'].max()}, but should be 2012" + + return tb + + +def process_ri(tb: Table) -> Table: + """Process Riley data and output it in the desired format. + + Desired format is with columns country, year, sex, age | life_expectancy. + """ + # Filter + ## dimension values: metric=life_expectancy, variant=medium, year >= YEAR_ESTIMATE_LAST + ## columns: country, year, value, sex, age + tb = tb.loc[(tb["year"] < 1950),] + + # Rename column names + tb = tb.rename(columns={"entity": "country"}) + + # Add columns + # tb["type"] = "period" + tb["sex"] = "total" + tb["age"] = 0 + + # Dtypes + tb = tb.astype( + { + "age": "Int64", + "sex": "string", + } + ) + + # Resolution + tb["life_expectancy"] = tb["life_expectancy"].astype("Float64").round(3) + + return tb + + +def combine_tables(tb_lt: Table, tb_un: Table, tb_zi: Table, tb_ri: Table) -> Table: + """Combine all LE tables. + + - Only HMD (within LT) contains cohort data. + - LE broken down by sex and age is available from LT and UN_WPP. + - LT already contains UN_WPP data, but without projections. That's why we also use UN WPP's + - RIL and ZIJ contain figures for all sexes and at birth. Only period. 
+ """ + tb = pr.concat([tb_lt, tb_un], ignore_index=True, short_name="life_expectancy") + + # Separate LE at birth from at different ages + mask = (tb["age"] == 0) & (tb["sex"] == "total") + tb_0 = tb.loc[mask] + tb = tb.loc[~mask] + + # Extend tb_0 (only for period) + ## Zijdeman: complement country data + tb_0 = tb_0.merge(tb_zi, how="outer", on=["country", "year", "sex", "age"], suffixes=("", "_zij")) + tb_0["life_expectancy"] = tb_0["life_expectancy"].fillna(tb_0["life_expectancy_zij"]) + tb_0 = tb_0.drop(columns=["life_expectancy_zij"]) + ## Riley: complement with continent data + tb_0 = pr.concat([tb_0, tb_ri], ignore_index=True) + + # Combine tb_0 with tb + tb = tb.merge(tb_0, on=["country", "year", "sex", "age"], how="outer", suffixes=("", "_0")) + + # For some reason, 'sex' is assigned type object + tb["sex"] = tb["sex"].astype("string") + + return tb + + +def _check_column_values(tb: Table, column: str, expected_values: set) -> None: + """Check that a column has only expected values.""" + unexpected_values = set(tb[column]) - expected_values + assert not unexpected_values, f"Unexpected values found in column {column}: {unexpected_values}" + + +def add_americas(tb: Table, ds_population: Dataset) -> Table: + """Estimate value for the Americas using North America and LATAM/Caribbean. + + Only performs this estimation for: + + sex = all + age = 0 + + It estimates it by doing the population-weighted average of life expectancies. + """ + # filter only member countries of the region + AMERICAS_MEMBERS = ["Northern America", "Latin America and the Caribbean"] + tb_am = tb.loc[(tb["country"].isin(AMERICAS_MEMBERS)) & (tb["sex"] == "total") & (tb["age"] == 0),].copy() + + # sanity check + assert ( + tb_am.groupby(["country", "year"]).size().max() == 1 + ), "There is more than one entry for a (country, year) tuple!" + + # add population for LATAM and Northern America (from WPP, hence since 1950) + assert tb_am["year"].min() == YEAR_WPP_START + tb_am = add_population_americas_from_wpp(tb_am, ds_population) + + # sanity check: ensure there are NO missing values. This way, we can safely do the groupby + assert (tb_am[["life_expectancy_0", "population"]].isna().sum() == 0).all() + + # estimate values for regions + # y(country) = weight(country) * metric(country) + tb_am["life_expectancy_0"] *= tb_am["population"] + + # z(region) = sum{ y(country) } for country in region + tb_am = tb_am.groupby("year", as_index=False)[["life_expectancy_0", "population"]].sum() + + # z(region) / sum{ population(country) } for country in region + tb_am["life_expectancy_0"] /= tb_am["population"] + + # assign region name + tb_am = tb_am.assign( + country="Americas", + sex="total", + age=0, + ) + + # drop unused column + tb_am = tb_am.drop(columns="population") + + # concatenate + tb = pr.concat([tb, tb_am], ignore_index=True) + return tb + + +def add_population_americas_from_wpp(tb: Table, ds_population: Dataset) -> Table: + """Add population values for LATAM and Northern America. + + Data is sourced from UN WPP, hence only available since 1950. + """ + pop = load_america_population_from_unwpp(ds_population) + tb = tb.merge(pop, on=["country", "year"]) + return tb + + +def load_america_population_from_unwpp(ds_population: Dataset) -> Table: + """Load population data from UN WPP for Northern America and Latin America and the Caribbean. + + We use this dataset instead of the long-run because we want the entities as defined by the UN. 
+ """ + # load population from WPP + countries = ["Latin America and the Caribbean (UN)", "Northern America (UN)"] + tb = ds_population.read("population") + tb = tb.loc[ + (tb["country"].isin(countries)) + & (tb["sex"] == "all") + & (tb["age"] == "all") + & (tb["variant"].isin(["estimates", "medium"])), + ["country", "year", "population"], + ] + assert len(set(tb["country"])) == 2, f"Check that all of {countries} are in df" + tb["country"] = tb["country"].replace(REGION_MAPPING).drop(columns="country") + + return tb diff --git a/etl/steps/data/garden/demography/2024-12-03/life_tables.meta.yml b/etl/steps/data/garden/demography/2024-12-03/life_tables.meta.yml new file mode 100644 index 00000000000..a5de8e6bd4b --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-03/life_tables.meta.yml @@ -0,0 +1,223 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + grapher_config: + selectedEntityNames: + - Italy + - England and Wales + - France + - Sweden + topic_tags: + - Life Expectancy + + global: + life_expectancy: + point_1: |- + <% if type == "period" %> + Period life expectancy is a metric that summarizes death rates across all age groups in one particular year. + <%- else %> + Cohort life expectancy is the average lifespan of a group of people, usually a birth cohort – people born in the same year. + <%- endif %> + point_2: |- + <% if type == "period" %> + <%- if age == '0' %> + For a given year, it represents the average lifespan for a hypothetical group of people, if they experienced the same age-specific death rates throughout their whole lives as the age-specific death rates seen in that particular year. + <%- else %> + For a given year, it represents the remaining average lifespan for a hypothetical group of people, if they experienced the same age-specific death rates throughout the rest of their lives as the age-specific death rates seen in that particular year. + <%- endif %> + <%- else %> + <%- if age == '0' %> + It is calculated by tracking individuals from that cohort throughout their lives until death, and calculating their average lifespan. + <%- else %> + It is calculated by tracking individuals from that cohort throughout the rest of their lives until death, and calculating their average remaining lifespan. + <%- endif %> + <%- endif %> + data_used: |- + Prior to 1950, we use HMD (2024) data. From 1950 onwards, we use UN WPP (2024) data. + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + title: Life tables + update_period_days: 365 + description: |- + Life tables from UN and HMD. + + For period data, data prior to 1950 is from HMD, and data from 1950 onwards is from UN WPP. + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + life_tables: + variables: + central_death_rate: + title: Central death rate + unit: deaths per 1,000 people + description_short: |- + The death rate, calculated as the number of deaths divided by the average number of people alive during the interval. + description_key: + - |- + The death rate is measured using the number of person-years lived during the interval. + - |- + Person-years refers to the combined total time that a group of people has lived. For example, if 10 people each live for 2 years, they collectively contribute 20 person-years. 
+ - |- + The death rate is slightly different from the 'probability of death' during the interval, because the 'probability of death' metric uses a different denominator: the number of people alive at that age at the start of the interval, while this indicator uses the average number of people alive during the interval. + - |- + {definitions.global.data_used} + description_processing: |- + The original metric is given as a fraction between 0 and 1 (i.e. per-capita). We multiply this by 1,000 to get a per-1,000 people rate. + processing_level: minor + display: + name: |- + Central death rate at << 'birth' if (age == '0') else age >><< ', ' + sex + 's' if (sex != 'both') >>, << type >> + presentation: + title_public: Central death rate at << age if age != '0' else 'birth'>> + title_variant: << sex + 's, ' if sex != 'both' >><< type + ' tables'>> + topic_tags: + - Life Expectancy + + probability_of_death: + title: Probability of death + unit: "%" + description_short: |- + The probability of dying in a given interval, among people who survived to the start of that interval. + description_key: + - |- + For example, the probability of death for a 50-year-old in a given year is found by dividing the number of deaths among 50-year-olds that year by the number of people alive at age 50 at the start of the year. + description_processing: |- + The original metric is given as a fraction between 0 and 1 (i.e. per-capita). We multiply this by 100 to get a percentage. + processing_level: minor + display: + name: |- + Probability of death at << 'birth' if (age == '0') else age >><< ', ' + sex + 's' if (sex != 'both') >>, << type >> + presentation: + title_public: Probability of death at << age if age != '0' else 'birth'>> + title_variant: << sex + 's, ' if sex != 'both' >><< type + ' tables'>> + topic_tags: + - Life Expectancy + - Causes of Death + + average_survival_length: + title: Average survival length + short_unit: years + unit: years + description_short: Average length of survival between ages x and x+n for persons dying in the interval. + + number_survivors: + title: Number of survivors + unit: survivors + description_short: Number of survivors at a given age, assuming 100,000 survivors at age 0. + + number_deaths: + title: Number of deaths + short_unit: deaths + unit: deaths + description_short: Number of deaths at a given age. + presentation: + topic_tags: + - Life Expectancy + - Causes of Death + + number_person_years_lived: + title: Number of person-years lived + unit: person-years + description_short: Number of person-years lived between a given age and one year later. + + number_person_years_remaining: + title: Number of person-years remaining + unit: person-years + description_short: Number of person-years remaining after a given age. + + life_expectancy: + title: Life expectancy + short_unit: years + unit: years + description_short: |- + <%- if age == '0' -%> + <%- if sex == 'both' -%> + The << type >> life expectancy at birth, in a given year. + <%- else -%> + The << type >> life expectancy at birth among << sex + 's' >>, in a given year. + <%- endif -%> + <%- else -%> + <%- if sex == 'both' -%> + The remaining << type >> life expectancy at age << age >>, in a given year. + <%- else -%> + The remaining << type >> life expectancy at age << age >> among << sex + 's' >>, in a given year.
+ <%- endif -%> + <%- endif -%> + description_key: + - |- + {definitions.global.life_expectancy.point_1} + - |- + {definitions.global.life_expectancy.point_2} + - |- + <%- if age != '0' -%> + <%- if type == "period" -%> + This shows the remaining life expectancy among people who have already reached the age << age >>, using death rates from their age group and older age groups. + <%- else -%> + This shows the remaining cohort life expectancy of people who have reached the age << age >>. + <%- endif -%> + <%- endif -%> + - |- + {definitions.global.data_used} + display: + numDecimalPlaces: 1 + name: |- + Life expectancy at << 'birth' if (age == '0') else age >><< ', ' + sex + 's' if (sex != 'both') >>, << type >> + presentation: + title_public: Life expectancy at << age if age != '0' else 'birth'>> + title_variant: << sex + 's, ' if sex != 'both' >><< type + ' tables'>> + + diff_ratios: + variables: + life_expectancy_fm_diff: + title: Life expectancy difference (f-m) + short_unit: years + unit: years + description_short: |- + The difference in << type >> life expectancy (females - males) at a given age. + description_key: + - Higher values indicate longer life expectancy among females than males. + - |- + {definitions.global.life_expectancy.point_1} + - |- + {definitions.global.life_expectancy.point_2} + - |- + {definitions.global.data_used} + display: + numDecimalPlaces: 1 + name: |- + Life expectancy (female-male difference) at << 'birth' if (age == '0') else age >>, << type >> + presentation: + title_public: Life expectancy at << age if age != '0' else 'birth'>> + title_variant: female-male difference, << type + ' tables'>> + topic_tags: + - Life Expectancy + - Gender Ratio + + life_expectancy_fm_ratio: + title: Life expectancy ratio (f/m) + unit: "" + short_unit: "" + description_short: |- + The ratio of << type >> life expectancy (females/males) at a given age. + description_key: + - Higher values indicate longer life expectancy among females than males. + - |- + {definitions.global.life_expectancy.point_1} + - |- + {definitions.global.life_expectancy.point_2} + - |- + {definitions.global.data_used} + display: + numDecimalPlaces: 1 + name: |- + Life expectancy (female-to-male ratio) at << 'birth' if (age == '0') else age >>, << type >> + presentation: + title_public: Life expectancy at << age if age != '0' else 'birth'>> + title_variant: female-to-male ratio, << type + ' tables'>> + topic_tags: + - Life Expectancy + - Gender Ratio diff --git a/etl/steps/data/garden/demography/2024-12-03/life_tables.py b/etl/steps/data/garden/demography/2024-12-03/life_tables.py new file mode 100644 index 00000000000..33fe750122c --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-03/life_tables.py @@ -0,0 +1,194 @@ +"""Load a meadow dataset and create a garden dataset. + + +Combines HMD and UN life tables. + +Some notes: + + - Time coverage: + - UN contains data on many more countries, but only since 1950. + - HMD contains data on fewer countries, but since 1676! + - We therefore use UN since 1950 for all countries, and HMD prior to that. We use the same source for all countries in each time period to ensure comparability across countries. + - Age groups: + - HMD contains single-age groups from 0 to 109 and 110+ (equivalent to >=110). It also contains data on wider age groups, but we discard these. 
+ - UN contains single-age groups from 0 to 99 and 100+ (equivalent to >=100) +""" + +import numpy as np +import owid.catalog.processing as pr +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) +# List of indicator columns +COLUMNS_INDICATORS = [ + "central_death_rate", + "probability_of_death", + "probability_of_survival", + "number_survivors", + "number_deaths", + "number_person_years_lived", + "survivorship_ratio", + "number_person_years_remaining", + "life_expectancy", + "average_survival_length", +] +COLUMN_INDICATORS_REL = [ + "life_expectancy_fm_diff", + "life_expectancy_fm_ratio", + "central_death_rate_mf_ratio", +] +COLUMNS_INDEX = [ + "country", + "year", + "sex", + "age", + "type", +] +COLUMNS_INDEX_REL = [ + "country", + "year", + "age", + "type", +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow datasets. + paths.log.info("load dataset, tables") + ds_hmd = paths.load_dataset("hmd") + ds_un = paths.load_dataset("un_wpp_lt") + + # Read table from meadow dataset. + tb_hmd = ds_hmd.read("life_tables") + tb_hmd_diff = ds_hmd.read("diff_ratios") + tb_un = ds_un.read("un_wpp_lt") + + # + # Process data. + # + tb_un = tb_un.rename( + columns={ + "location": "country", + } + ) + # Set type='period' for UN data + tb_un["type"] = "period" + + # Keep only single-years + ## Get only single-year, set dtype as int + flag = ~tb_hmd["age"].str.contains("-") + tb_hmd = tb_hmd.loc[flag] + flag = ~tb_hmd_diff["age"].str.contains("-") + tb_hmd_diff = tb_hmd_diff.loc[flag] + + # Add life expectancy differences and ratios + paths.log.info("calculating extra variables (ratio and difference in life expectancy for f and m).") + tb_un_rel = make_table_diffs_ratios(tb_un) + + # Combine HMD + UN + paths.log.info("concatenate tables") + tb = combine_tables(tb_hmd, tb_un, COLUMNS_INDEX, COLUMNS_INDICATORS) + tb_rel = combine_tables(tb_hmd_diff, tb_un_rel, COLUMNS_INDEX_REL, COLUMN_INDICATORS_REL) + + # Set DTypes + dtypes = { + "type": "string", + } + tb = tb.astype(dtypes) + tb_rel = tb_rel.astype(dtypes) + + # Set index + tb = tb.format(COLUMNS_INDEX, short_name=paths.short_name) + tb_rel = tb_rel.format(COLUMNS_INDEX_REL, short_name="diff_ratios") + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset(dest_dir, tables=[tb, tb_rel], check_variables_metadata=True) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def combine_tables(tb_hmd: Table, tb_un: Table, cols_index, cols_indicators) -> Table: + """Combine HMD and UN life tables. + + - UN only provides period data. + - We use UN data after 1950. Prior to that, we use HMD. + - We considered using HMD over UN after 1950 if data was available for a given country for all years, ages and sexes. + - However, this is only the case for very few countries: Australia, Germany, Hungary, Lithuania, Northern Ireland, Scotland, United Kingdom. + - We decided against this to ensure comparability across countries (i.e. all countries use same source after 1950). 
+ """ + # HMD + ## Sanity check years + assert tb_hmd["year"].max() == 2023, "HMD data should end in 2023" + assert tb_hmd["year"].min() == 1751, "HMD data should start in 1751" + ## Keep only period HMD data prior to 1950 (UN data starts in 1950) + tb_hmd = tb_hmd.loc[((tb_hmd["year"] < 1950) & (tb_hmd["type"] == "period")) | (tb_hmd["type"] == "cohort")] + ## Filter relevant columns (UN has two columns that HMD doesn't: 'probability_of_survival', 'survivorship_ratio') + columns_indicators_hmd = [col for col in tb_hmd.columns if col in cols_indicators] + tb_hmd = tb_hmd.loc[:, cols_index + columns_indicators_hmd] + + # UN + ## Sanity check years + assert tb_un["year"].max() == 2023, "UN data should end in 2023" + assert tb_un["year"].min() == 1950, "UN data should start in 1950" + assert (tb_un["year"].drop_duplicates().diff().dropna() == 1).all(), "UN data should be yearly" + ## Filter relevant columns + tb_un = tb_un.loc[:, cols_index + cols_indicators] + + # Combine tables + tb = pr.concat([tb_hmd, tb_un], short_name=paths.short_name) + + # Remove all-NaN rows + tb = tb.dropna(subset=cols_indicators, how="all") + + return tb + + +def make_table_diffs_ratios(tb: Table) -> Table: + """Create table with metric differences and ratios. + + Currently, we estimate: + + - female - male: Life expectancy + - male/female: Life Expectancy, Central Death Rate + """ + # Pivot & obtain differences and ratios + cols_index = ["country", "year", "age", "type"] + tb_new = ( + tb.pivot_table( + index=cols_index, + columns="sex", + values=["life_expectancy", "central_death_rate"], + ) + .assign( + life_expectancy_fm_diff=lambda df: df[("life_expectancy", "female")] - df[("life_expectancy", "male")], + life_expectancy_fm_ratio=lambda df: df[("life_expectancy", "female")] / df[("life_expectancy", "male")], + central_death_rate_mf_ratio=lambda df: df[("central_death_rate", "male")] + / df[("central_death_rate", "female")], + ) + .reset_index() + ) + + # Keep relevant columns + cols = [col for col in tb_new.columns if col[1] == ""] + tb_new = tb_new.loc[:, cols] + + # Rename columns + tb_new.columns = [col[0] for col in tb_new.columns] + + # Add metadata back + for col in tb_new.columns: + if col not in cols_index: + tb_new[col].metadata.origins = tb["life_expectancy"].m.origins.copy() + tb_new[col] = tb_new[col].replace([np.inf, -np.inf], np.nan) + + return tb_new diff --git a/etl/steps/data/garden/demography/2024-12-03/phi_gender_le.meta.yml b/etl/steps/data/garden/demography/2024-12-03/phi_gender_le.meta.yml new file mode 100644 index 00000000000..424fe96f873 --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-03/phi_gender_le.meta.yml @@ -0,0 +1,40 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Life Expectancy + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 365 + title: Outsurvival statistic + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + phi_gender_le: + variables: + phi: + title: Outsurvival statistic at birth + unit: "%" + short_unit: "%" + description_short: |- + The probability that a male will live longer than a female if both are randomly selected from the population at birth. + description_processing: |- + This was calculated using scripts from Bergeron-Boucher et al. (2022). 
+ + Citation: Bergeron-Boucher, M.-P., Alvarez, J.-A., Kashnitsky, I., & Zarulli, V. (2022). Probability of males to outlive females: An international comparison from 1751 to 2020. BMJ Open, 12(8), e059964. https://doi.org/10.1136/bmjopen-2021-059964 + + Code available at: https://github.com/CPop-SDU/outsurvival-in-perspective + description_key: + - The probability that a male will live longer than a female, in a given population during a given interval. + - This is calculated for random pairs of one male and one female at age 0. + display: + numDecimalPlaces: 1 + presentation: + grapher_config: + hasMapTab: true + attribution: |- + Human Mortality Database (2024); UN, World Population Prospects (2024); Bergeron-Boucher et al. (2022) diff --git a/etl/steps/data/garden/demography/2024-12-03/phi_gender_le.py b/etl/steps/data/garden/demography/2024-12-03/phi_gender_le.py new file mode 100644 index 00000000000..a8acfd9f94c --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-03/phi_gender_le.py @@ -0,0 +1,106 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("life_tables") + + # Read table from meadow dataset. + tb = ds_meadow.read("life_tables") + + # + # Process data. + # + paths.log.info("replace 110+ -> 110, 100+ -> 100") + tb["age"] = ( + tb["age"] + .replace( + { + "110+": "110", + "100+": "100", + } + ) + .astype(int) + ) + + # Keep only period data broken down by sex + paths.log.info("keep only type='period' and sex in {'male', 'female'}") + tb = tb.loc[(tb["type"] == "period") & (tb["sex"].isin(["female", "male"]))].drop(columns=["type"]) + + # Add phi + paths.log.info("add phi parameter") + tb = make_table_phi(tb) + + # Set index + tb = tb.format(["country", "year"], short_name=paths.short_name) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def make_table_phi(tb: Table) -> Table: + """Estimate phi. + + Phi is defined as the outsurvival probability of males (i.e. probability that a male will live longer than a female in a given population). + + This is estimated using Equation 2 from https://bmjopen.bmj.com/content/bmjopen/12/8/e059964.full.pdf. + + Inspired by code: + - https://github.com/CPop-SDU/sex-gap-e0-pnas/tree/main + - https://github.com/CPop-SDU/outsurvival-in-perspective + """ + # Copy original metadata + origins = tb["number_deaths"].metadata.origins + + # Scale survivors and deaths from a radix of 100,000 down to 1 + tb["number_survivors"] = tb["number_survivors"] / 1e5 + tb["number_deaths"] = tb["number_deaths"] / 1e5 + + # Pivot table to align males and females for the different metrics + tb = tb.pivot( + index=["country", "year", "age"], columns="sex", values=["number_survivors", "number_deaths"] + ).reset_index() + + # Order + tb = tb.sort_values(["country", "year", "age"]) + + # Shift one up (note the subindex in the equation 'x-n', in our case n=1 (age group width)) + column = ("number_survivors", "male") + tb[column] = tb.groupby(["country", "year"])[[column]].shift(-1).squeeze() + + # Estimate phi_i (i.e.
Eq 2 for a specific age group, without the summation) + tb["phi"] = ( + tb["number_deaths"]["female"] * tb["number_survivors"]["male"] + + tb["number_deaths"]["female"] * tb["number_deaths"]["male"] / 2 + ) + # Apply the summation from Eq 2 + tb = tb.groupby(["country", "year"], as_index=False, observed=True)[[("phi", "")]].sum() + + # Scale + tb["phi"] = (tb["phi"] * 100).round(2) + + # Fix column names (remove multiindex) + tb.columns = [col[0] for col in tb.columns] + + # Copy metadata + tb["phi"].metadata.origins = origins + + return tb diff --git a/etl/steps/data/garden/education/2017-09-30/public_expenditure.py b/etl/steps/data/garden/education/2017-09-30/public_expenditure.py index a4968d65180..02d9f00c5ef 100644 --- a/etl/steps/data/garden/education/2017-09-30/public_expenditure.py +++ b/etl/steps/data/garden/education/2017-09-30/public_expenditure.py @@ -11,7 +11,7 @@ def run(dest_dir: str) -> None: # Load data from snapshot. # snap = paths.load_snapshot() - tb = snap.read().set_index(["country", "year"]) + tb = snap.read(safe_types=False).set_index(["country", "year"]) # # Save outputs. diff --git a/etl/steps/data/garden/education/2018-04-18/literacy_rates.py b/etl/steps/data/garden/education/2018-04-18/literacy_rates.py index a4968d65180..02d9f00c5ef 100644 --- a/etl/steps/data/garden/education/2018-04-18/literacy_rates.py +++ b/etl/steps/data/garden/education/2018-04-18/literacy_rates.py @@ -11,7 +11,7 @@ def run(dest_dir: str) -> None: # Load data from snapshot. # snap = paths.load_snapshot() - tb = snap.read().set_index(["country", "year"]) + tb = snap.read(safe_types=False).set_index(["country", "year"]) # # Save outputs. diff --git a/etl/steps/data/garden/education/2023-07-17/education_lee_lee.meta.yml b/etl/steps/data/garden/education/2023-07-17/education_lee_lee.meta.yml index 65a6494b20f..9b64f07c630 100644 --- a/etl/steps/data/garden/education/2023-07-17/education_lee_lee.meta.yml +++ b/etl/steps/data/garden/education/2023-07-17/education_lee_lee.meta.yml @@ -11,7 +11,7 @@ definitions: For the period before 1985, regional aggregates were computed by Our World in Data through yearly population-weighted averages, where annual values are proportionally adjusted to emphasize the influence of larger populations. dataset: - title: Human Capital in the Long Run (Lee and Lee 2016) and WDI (World Bank) + title: Human Capital in the Long Run (Lee and Lee 2016), WDI (World Bank) and UNESCO description: > This dataset is based on 'Human Capital in the Long Run' by Lee and Lee (2016) and World Development Indicators by World Bank. It includes indicators relating to enrollment rates in various levels of education, historic data on average years of education and educational attainment, specifically share of people with no education. 
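A numeric sketch of the outsurvival statistic that phi_gender_le.py above assembles (Eq 2 of Bergeron-Boucher et al. 2022): a male outlives a female if he survives beyond her interval of death, plus half of the ties where both die in the same interval. All values below are invented and already scaled to a radix of 1:

import numpy as np

d_f = np.array([0.02, 0.03, 0.95])  # female deaths per age interval
d_m = np.array([0.03, 0.05, 0.92])  # male deaths per age interval
l_m = np.array([1.00, 0.97, 0.92])  # male survivors at the start of each interval

# Survivors at the end of each interval (the shift(-1) in the step above).
l_m_next = np.append(l_m[1:], 0.0)

phi = np.sum(d_f * l_m_next + d_f * d_m / 2)
print(f"{phi:.1%}")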
diff --git a/etl/steps/data/garden/education/2023-07-17/education_lee_lee.py b/etl/steps/data/garden/education/2023-07-17/education_lee_lee.py index 0e09ed1ebcf..31c33ace7d7 100644 --- a/etl/steps/data/garden/education/2023-07-17/education_lee_lee.py +++ b/etl/steps/data/garden/education/2023-07-17/education_lee_lee.py @@ -58,11 +58,15 @@ def run(dest_dir: str) -> None: ds_garden_wdi = paths.load_dataset("wdi") tb_wdi = ds_garden_wdi["wdi"] - # Extract enrollment rates from the World Bank Education Dataset starting from 2010 - enrolment_wb = extract_related_world_bank_data(tb_wdi) + # Load the UNESCO Education Dataset + ds_garden_unesco = paths.load_dataset("enrolment_rates") + tb_unesco = ds_garden_unesco["enrolment_rates"] + + # Extract enrollment rates from the World Bank and UNESCO datasets starting from 1985 + enrolment_recent = extract_recent_data(tb_wdi, tb_unesco) # Get the list of columns from the World Bank dataset - world_bank_indicators = enrolment_wb.columns + world_bank_indicators = enrolment_recent.columns # # Data Processing @@ -101,7 +105,7 @@ def run(dest_dir: str) -> None: # Concatenate historical and more recent enrollment data hist_1985_tb = merged_tb[merged_tb["year"] < 1985] - tb_merged_enrollment = pr.concat([enrolment_wb, hist_1985_tb[world_bank_indicators]]) + tb_merged_enrollment = pr.concat([enrolment_recent, hist_1985_tb[world_bank_indicators]]) tb_merged_enrollment.set_index(["country", "year"], inplace=True) # Differentiate these columns from the original data tb_merged_enrollment.columns = tb_merged_enrollment.columns + "_combined_wb" @@ -160,7 +164,7 @@ def run(dest_dir: str) -> None: ds_garden.save() -def extract_related_world_bank_data(tb_wb: Table): +def extract_recent_data(tb_wb: Table, tb_unesco: Table) -> Table: """ Extracts enrollment rate indicators from the World Bank dataset. The function specifically extracts net enrollment rates up to secondary education and gross enrollment rates for tertiary education.
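The combination pattern introduced by this change can be sketched with plain pandas (the column names below are the WDI/UNESCO identifiers from the diff; the frames themselves are invented): an outer merge keeps (country, year) rows present in either source, and a rename maps both vocabularies onto the Lee and Lee naming scheme.

import pandas as pd

wdi = pd.DataFrame({"country": ["Spain"], "year": [2015], "se_ter_enrr": [89.0]})
unesco = pd.DataFrame(
    {"country": ["Spain"], "year": [2015], "total_net_enrolment_rate__primary__both_sexes__pct": [97.0]}
)

# Outer merge: keep rows from either source, then rename to the common scheme.
combined = wdi.merge(unesco, on=["country", "year"], how="outer").rename(
    columns={
        "se_ter_enrr": "mf_tertiary_enrollment_rates",
        "total_net_enrolment_rate__primary__both_sexes__pct": "mf_primary_enrollment_rates",
    }
)
print(combined)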
@@ -171,10 +175,6 @@ def extract_related_world_bank_data(tb_wb: Table): # Define columns to select for enrolment rates select_enrolment_cols = [ - # Primary enrollment columns - "se_prm_nenr", - "se_prm_nenr_fe", - "se_prm_nenr_ma", # Tertiary enrollment columns "se_ter_enrr", "se_ter_enrr_fe", @@ -185,11 +185,18 @@ def extract_related_world_bank_data(tb_wb: Table): "se_sec_nenr_ma", ] + select_primary = [ + # Primary enrollment columns + "total_net_enrolment_rate__primary__both_sexes__pct", + "total_net_enrolment_rate__primary__female__pct", + "total_net_enrolment_rate__primary__male__pct", + ] + # Dictionary to rename columns to be consistent with Lee dataset dictionary_to_rename_and_combine = { - "se_prm_nenr": "mf_primary_enrollment_rates", - "se_prm_nenr_fe": "f_primary_enrollment_rates", - "se_prm_nenr_ma": "m_primary_enrollment_rates", + "total_net_enrolment_rate__primary__both_sexes__pct": "mf_primary_enrollment_rates", + "total_net_enrolment_rate__primary__female__pct": "f_primary_enrollment_rates", + "total_net_enrolment_rate__primary__male__pct": "m_primary_enrollment_rates", "se_ter_enrr": "mf_tertiary_enrollment_rates", "se_ter_enrr_fe": "f_tertiary_enrollment_rates", "se_ter_enrr_ma": "m_tertiary_enrollment_rates", @@ -199,14 +206,15 @@ def extract_related_world_bank_data(tb_wb: Table): } # Select and rename columns - enrolment_wb = tb_wb[select_enrolment_cols] - enrolment_wb = enrolment_wb.rename(columns=dictionary_to_rename_and_combine) + enrolment_wb = tb_wb[select_enrolment_cols].reset_index() + enrolment_unesco = tb_unesco[select_primary].reset_index() + enrolment_recent = pr.merge(enrolment_wb, enrolment_unesco, on=["country", "year"], how="outer") + enrolment_recent = enrolment_recent.rename(columns=dictionary_to_rename_and_combine) # Select data above 1985 - enrolment_wb = enrolment_wb[(enrolment_wb.index.get_level_values("year") >= 1985)] - enrolment_wb = enrolment_wb.reset_index() + enrolment_recent = enrolment_recent[enrolment_recent["year"] >= 1985] - return enrolment_wb + return enrolment_recent def melt_and_pivot(tb, id_vars: List[str], value_vars: List[str], index_vars: List[str], columns_vars: List[str]): diff --git a/etl/steps/data/garden/education/2023-08-14/oecd_education.py b/etl/steps/data/garden/education/2023-08-14/oecd_education.py index 62aa1f91c70..0f55dd9cea9 100644 --- a/etl/steps/data/garden/education/2023-08-14/oecd_education.py +++ b/etl/steps/data/garden/education/2023-08-14/oecd_education.py @@ -23,8 +23,8 @@ def run(dest_dir: str) -> None: tb = ds_meadow["oecd_education"].reset_index() # Load the World Bank Education Dataset - ds_garden_wb = paths.load_dataset("education") - tb_wb = ds_garden_wb["education"].reset_index() + ds_garden_wb = paths.load_dataset("edstats") + tb_wb = ds_garden_wb["edstats"].reset_index() # Harmonize country names tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) diff --git a/etl/steps/data/garden/ember/2024-05-08/yearly_electricity.py b/etl/steps/data/garden/ember/2024-05-08/yearly_electricity.py index 80667817ae8..d196ec6f757 100644 --- a/etl/steps/data/garden/ember/2024-05-08/yearly_electricity.py +++ b/etl/steps/data/garden/ember/2024-05-08/yearly_electricity.py @@ -7,6 +7,7 @@ from typing import Dict import owid.catalog.processing as pr +import pandas as pd from owid.catalog import Dataset, Table, utils from structlog import get_logger @@ -452,10 +453,11 @@ def run(dest_dir: str) -> None: # Therefore, we make nan all aggregate data in all yearly electricity tables prior to 2000. 
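The fix just below replaces whole-frame boolean assignment with an explicit .loc row mask. A stand-alone sketch of the pattern (toy MultiIndex frame; assumes pandas coerces pd.NA to NaN in float columns, as recent versions do):

import pandas as pd

idx = pd.MultiIndex.from_tuples([("Africa", 1999), ("Africa", 2001)], names=["country", "year"])
df = pd.DataFrame({"generation": [1.0, 2.0]}, index=idx)

# Boolean mask over the index levels: aggregate rows before 2000.
mask = (df.index.get_level_values(0) == "Africa") & (df.index.get_level_values(1) < 2000)

# df[mask] = None routes through DataFrame.__setitem__; .loc[mask, :] makes the
# row/column selection explicit and nulls only the selected rows.
df.loc[mask, :] = pd.NA
print(df)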
for table_name in tables: for column in tables[table_name].columns: - tables[table_name][ + tables[table_name].loc[ (tables[table_name].index.get_level_values(0).isin(geo.REGIONS)) - & (tables[table_name].index.get_level_values(1) < 2000) - ] = None + & (tables[table_name].index.get_level_values(1) < 2000), + :, + ] = pd.NA #################################################################################################################### # Combine all tables into one. diff --git a/etl/steps/data/garden/ember/2024-11-20/european_wholesale_electricity_prices.countries.json b/etl/steps/data/garden/ember/2024-11-20/european_wholesale_electricity_prices.countries.json new file mode 100644 index 00000000000..48aa385d8ee --- /dev/null +++ b/etl/steps/data/garden/ember/2024-11-20/european_wholesale_electricity_prices.countries.json @@ -0,0 +1,31 @@ +{ + "Austria": "Austria", + "Belgium": "Belgium", + "Bulgaria": "Bulgaria", + "Croatia": "Croatia", + "Czechia": "Czechia", + "Denmark": "Denmark", + "Estonia": "Estonia", + "Finland": "Finland", + "France": "France", + "Germany": "Germany", + "Greece": "Greece", + "Hungary": "Hungary", + "Ireland": "Ireland", + "Italy": "Italy", + "Latvia": "Latvia", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Netherlands": "Netherlands", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Poland": "Poland", + "Portugal": "Portugal", + "Romania": "Romania", + "Serbia": "Serbia", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Spain": "Spain", + "Sweden": "Sweden", + "Switzerland": "Switzerland" +} diff --git a/etl/steps/data/garden/ember/2024-11-20/european_wholesale_electricity_prices.meta.yml b/etl/steps/data/garden/ember/2024-11-20/european_wholesale_electricity_prices.meta.yml new file mode 100644 index 00000000000..98ad5c38e4e --- /dev/null +++ b/etl/steps/data/garden/ember/2024-11-20/european_wholesale_electricity_prices.meta.yml @@ -0,0 +1,27 @@ +definitions: + common: + presentation: + topic_tags: + - Energy + processing_level: minor + +dataset: + update_period_days: 365 + +tables: + european_wholesale_electricity_prices_monthly: + variables: + price: + title: Electricity wholesale monthly price + unit: "current euros per megawatt-hour" + short_unit: "€/MWh" + description_short: |- + Monthly average day-ahead spot prices per [megawatt-hour](#dod:watt-hours) of electricity sold. + european_wholesale_electricity_prices_annual: + variables: + price: + title: Electricity wholesale annual price + unit: "current euros per megawatt-hour" + short_unit: "€/MWh" + description_short: |- + Annual average day-ahead spot prices per [megawatt-hour](#dod:watt-hours) of electricity sold. diff --git a/etl/steps/data/garden/ember/2024-11-20/european_wholesale_electricity_prices.py b/etl/steps/data/garden/ember/2024-11-20/european_wholesale_electricity_prices.py new file mode 100644 index 00000000000..e6a27829c94 --- /dev/null +++ b/etl/steps/data/garden/ember/2024-11-20/european_wholesale_electricity_prices.py @@ -0,0 +1,57 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Select and rename columns. +COLUMNS = { + "country": "country", + "date": "date", + "price__eur_mwhe": "price", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. 
+    ds_meadow = paths.load_dataset("european_wholesale_electricity_prices")
+
+    # Read table from meadow dataset.
+    tb_monthly = ds_meadow.read("european_wholesale_electricity_prices")
+
+    #
+    # Process data.
+    #
+    # Select and rename columns.
+    tb_monthly = tb_monthly[list(COLUMNS)].rename(columns=COLUMNS, errors="raise")
+
+    # Harmonize country names.
+    tb_monthly = geo.harmonize_countries(df=tb_monthly, countries_file=paths.country_mapping_path)
+
+    # Ember provides monthly data, so we can create a monthly table of wholesale electricity prices.
+    # But we also need to create an annual table of average wholesale electricity prices.
+    tb_annual = tb_monthly.copy()
+    tb_annual["year"] = tb_annual["date"].str[:4].astype("Int64")
+    # NOTE: Only complete years are included, so the latest (partial) year is dropped; this also discards country-years like Ireland 2022, which for some reason only has data for a few months.
+    n_months = tb_annual.groupby(["country", "year"], observed=True, as_index=False)["date"].transform("count")
+    tb_annual = (
+        tb_annual[n_months == 12].groupby(["country", "year"], observed=True, as_index=False).agg({"price": "mean"})
+    )
+
+    # Improve table formats.
+    tb_monthly = tb_monthly.format(["country", "date"], short_name="european_wholesale_electricity_prices_monthly")
+    tb_annual = tb_annual.format(short_name="european_wholesale_electricity_prices_annual")
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset.
+    ds_garden = create_dataset(dest_dir, tables=[tb_monthly, tb_annual], check_variables_metadata=True)
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
diff --git a/etl/steps/data/garden/emissions/2024-02-26/gdp_and_co2_decoupling.py b/etl/steps/data/garden/emissions/2024-02-26/gdp_and_co2_decoupling.py
index 0499c02132b..22250f94c7f 100644
--- a/etl/steps/data/garden/emissions/2024-02-26/gdp_and_co2_decoupling.py
+++ b/etl/steps/data/garden/emissions/2024-02-26/gdp_and_co2_decoupling.py
@@ -5,8 +5,8 @@
 for per capita GDP and per capita, consumption-based CO2 emissions:
 https://ourworldindata.org/grapher/co2-emissions-and-gdp
 
-The data in the current step is not used by any grapher step, but will be used by the following static chart:
-TODO: Include link to the updated static chart once it is created.
+The data from this step is used in this static chart: +https://drive.google.com/file/d/1PflfQpr4mceVWRSGEqMP6Gbo1tFQZzOp/view?usp=sharing """ diff --git a/etl/steps/data/garden/emissions/2024-04-08/national_contributions.meta.yml b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.meta.yml index 8d6fd94bf5e..e6b59d5f14d 100644 --- a/etl/steps/data/garden/emissions/2024-04-08/national_contributions.meta.yml +++ b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.meta.yml @@ -75,7 +75,7 @@ tables: short_unit: t description_short: *measured-in-tonnes presentation: - title_public: Annual CO₂ emissions + title_public: Annual CO₂ emissions including land use annual_emissions_n2o_fossil: title: Annual nitrous oxide emissions from fossil fuels and industry unit: tonnes @@ -121,7 +121,7 @@ tables: description_short: *ghg-emissions description_processing: *processing-greenhouse-gases presentation: - title_public: Annual greenhouse gas emissions + title_public: Annual greenhouse gas emissions including land use annual_emissions_ch4_fossil_co2eq: title: Annual methane emissions from fossil fuels and industry in CO₂ equivalents unit: tonnes of CO₂ equivalents @@ -145,7 +145,7 @@ tables: description_short: *measured-in-co2-eq description_processing: *processing-methane presentation: - title_public: Annual methane emissions + title_public: Annual methane emissions including land use annual_emissions_n2o_fossil_co2eq: title: Annual nitrous oxide emissions from fossil fuels and industry in CO₂ equivalents unit: tonnes of CO₂ equivalents @@ -169,7 +169,7 @@ tables: description_short: *measured-in-co2-eq description_processing: *processing-nitrous-oxide presentation: - title_public: Annual nitrous oxide emissions + title_public: Annual nitrous oxide emissions including land use # Cumulative emissions of CH4, CO2, N2O and GHG, in tonnes of CO2eq (as originally given in the data). cumulative_emissions_ghg_fossil: title: Cumulative greenhouse gas emissions from fossil fuels and industry @@ -396,33 +396,49 @@ tables: title_public: Share of contribution to global warming # Per capita emissions (calculated by OWID). 
annual_emissions_co2_total_per_capita: - title: Per-capita CO₂ emissions - unit: tonnes - short_unit: t + title: Per capita CO₂ emissions + unit: tonnes per person + short_unit: t/person description_short: *measured-in-tonnes-per-person presentation: - title_public: Per-capita CO₂ emissions + title_public: Per capita CO₂ emissions including land use + annual_emissions_co2_fossil_per_capita: + title: Per capita CO₂ emissions from fossil fuels and industry + unit: tonnes per person + short_unit: t/person + description_short: *measured-in-tonnes-per-person + presentation: + title_public: Per capita CO₂ emissions from fossil fuels and industry annual_emissions_ch4_total_co2eq_per_capita: - title: Per-capita methane emissions in CO₂ equivalents - unit: tonnes of CO₂ equivalents - short_unit: t + title: Per capita methane emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents per person + short_unit: t/person description_short: *measured-in-co2-eq-per-person description_processing: *processing-methane presentation: - title_public: Per-capita methane emissions + title_public: Per capita methane emissions including land use annual_emissions_n2o_total_co2eq_per_capita: - title: Per-capita nitrous oxide emissions in CO₂ equivalents - unit: tonnes of CO₂ equivalents - short_unit: t + title: Per capita nitrous oxide emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents per person + short_unit: t/person description_short: *measured-in-co2-eq-per-person description_processing: *processing-nitrous-oxide presentation: - title_public: Per-capita nitrous oxide emissions + title_public: Per capita nitrous oxide emissions including land use annual_emissions_ghg_total_co2eq_per_capita: - title: Per-capita greenhouse gas emissions in CO₂ equivalents - unit: tonnes of CO₂ equivalents - short_unit: t + title: Per capita greenhouse gas emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents per person + short_unit: t/person description_short: *ghg-emissions-per-person description_processing: *processing-greenhouse-gases presentation: - title_public: Per-capita greenhouse gas emissions + title_public: Per capita greenhouse gas emissions including land use + annual_emissions_ghg_fossil_co2eq_per_capita: + title: Per capita greenhouse gas emissions from fossil fuels and industry in CO₂ equivalents + unit: tonnes of CO₂ equivalents per person + short_unit: t/person + description_short: *ghg-emissions-per-person + description_processing: *processing-greenhouse-gases + presentation: + title_public: Per capita greenhouse gas emissions from fossil fuels and industry + diff --git a/etl/steps/data/garden/emissions/2024-04-08/national_contributions.py b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.py index 6ac00bafe70..548ea63802c 100644 --- a/etl/steps/data/garden/emissions/2024-04-08/national_contributions.py +++ b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.py @@ -49,8 +49,10 @@ PER_CAPITA_VARIABLES = [ "annual_emissions_ch4_total_co2eq", "annual_emissions_co2_total", + "annual_emissions_co2_fossil", "annual_emissions_n2o_total_co2eq", "annual_emissions_ghg_total_co2eq", + "annual_emissions_ghg_fossil_co2eq", ] # Regions to be added by aggregating data from their member countries. 
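The `national_contributions.py` hunk above only registers the two new fossil columns in PER_CAPITA_VARIABLES; the division by population happens elsewhere in the step and is not part of this diff. As an illustration of the usual pattern, with toy data and made-up values:

import pandas as pd

PER_CAPITA_VARIABLES = ["annual_emissions_co2_fossil"]

tb = pd.DataFrame(
    {
        "country": ["France", "France"],
        "year": [2020, 2021],
        "annual_emissions_co2_fossil": [3.0e8, 3.1e8],  # tonnes (made-up numbers)
        "population": [6.70e7, 6.75e7],
    }
)
# Each listed column gains a "<column>_per_capita" counterpart.
for column in PER_CAPITA_VARIABLES:
    tb[f"{column}_per_capita"] = tb[column] / tb["population"]
print(tb.filter(like="per_capita"))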
diff --git a/etl/steps/data/garden/emissions/2024-06-20/gdp_and_co2_decoupling.py b/etl/steps/data/garden/emissions/2024-06-20/gdp_and_co2_decoupling.py
index 0499c02132b..22250f94c7f 100644
--- a/etl/steps/data/garden/emissions/2024-06-20/gdp_and_co2_decoupling.py
+++ b/etl/steps/data/garden/emissions/2024-06-20/gdp_and_co2_decoupling.py
@@ -5,8 +5,8 @@
 for per capita GDP and per capita, consumption-based CO2 emissions:
 https://ourworldindata.org/grapher/co2-emissions-and-gdp
 
-The data in the current step is not used by any grapher step, but will be used by the following static chart:
-TODO: Include link to the updated static chart once it is created.
+The data from this step is used in this static chart:
+https://drive.google.com/file/d/1PflfQpr4mceVWRSGEqMP6Gbo1tFQZzOp/view?usp=sharing
 """
diff --git a/etl/steps/data/garden/emissions/2024-11-13/gdp_and_co2_decoupling.meta.yml b/etl/steps/data/garden/emissions/2024-11-13/gdp_and_co2_decoupling.meta.yml
new file mode 100644
index 00000000000..7d8a0eb20e4
--- /dev/null
+++ b/etl/steps/data/garden/emissions/2024-11-13/gdp_and_co2_decoupling.meta.yml
@@ -0,0 +1,3 @@
+dataset:
+  title: Decoupling of GDP and CO2 emissions
+  update_period_days: 365
diff --git a/etl/steps/data/garden/emissions/2024-11-13/gdp_and_co2_decoupling.py b/etl/steps/data/garden/emissions/2024-11-13/gdp_and_co2_decoupling.py
new file mode 100644
index 00000000000..c0de7bc239f
--- /dev/null
+++ b/etl/steps/data/garden/emissions/2024-11-13/gdp_and_co2_decoupling.py
@@ -0,0 +1,156 @@
+"""This step takes the Global Carbon Budget and GDP data from World Bank's World Development Indicators, and creates a
+dataset with the changes in emissions and GDP over time.
+
+We already have an interactive chart showing similar data,
+for per capita GDP and per capita, consumption-based CO2 emissions:
+https://ourworldindata.org/grapher/co2-emissions-and-gdp
+
+The data from this step is used in this static chart:
+https://drive.google.com/file/d/1PflfQpr4mceVWRSGEqMP6Gbo1tFQZzOp/view?usp=sharing
+
+"""
+
+from structlog import get_logger
+
+from etl.helpers import PathFinder, create_dataset
+
+log = get_logger()
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+# First and final years whose (per capita) GDP and emissions will be compared.
+START_YEAR = 2006
+END_YEAR = 2021
+
+# Columns to select from WDI, and how to rename them.
+COLUMNS_WDI = {
+    "country": "country",
+    "year": "year",
+    # GDP, PPP (constant 2017 international $)
+    # "ny_gdp_mktp_pp_kd": "gdp",
+    # GDP per capita, PPP (constant 2017 international $)
+    "ny_gdp_pcap_pp_kd": "gdp_per_capita",
+}
+
+# Columns to select from GCB, and how to rename them.
+COLUMNS_GCB = {
+    "country": "country",
+    "year": "year",
+    # "emissions_total": "production_emissions",
+    # "emissions_total_per_capita": "production_emissions_per_capita",
+    # "consumption_emissions": "consumption_emissions",
+    "consumption_emissions_per_capita": "consumption_emissions_per_capita",
+    # 'emissions_total_including_land_use_change': "",
+    # 'emissions_total_including_land_use_change_per_capita': "",
+}
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load Global Carbon Budget dataset and read its main table.
+    ds_gcb = paths.load_dataset("global_carbon_budget")
+    tb_gcb = ds_gcb["global_carbon_budget"].reset_index()
+
+    # Load WDI dataset, read its main table.
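Aside on the processing that follows: the step snapshots the table at START_YEAR and END_YEAR, merges the two snapshots with column suffixes, and derives percent changes. A compact, runnable illustration of the same idiom with toy numbers:

import pandas as pd

START_YEAR, END_YEAR = 2006, 2021
tb = pd.DataFrame(
    {
        "country": ["Ireland", "Ireland"],
        "year": [START_YEAR, END_YEAR],
        "gdp_per_capita": [40000.0, 52000.0],  # toy values
    }
)
tb_start = tb[tb["year"] == START_YEAR].reset_index(drop=True)
tb_end = tb[tb["year"] == END_YEAR].reset_index(drop=True)
merged = tb_start.merge(tb_end, on="country", how="left", suffixes=("_start_year", "_final_year"))
merged["gdp_per_capita_change"] = (
    (merged["gdp_per_capita_final_year"] - merged["gdp_per_capita_start_year"])
    / merged["gdp_per_capita_start_year"]
    * 100
)
print(merged["gdp_per_capita_change"].iloc[0])  # 30.0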
+    ds_wdi = paths.load_dataset("wdi")
+    tb_wdi = ds_wdi["wdi"].reset_index()
+
+    #
+    # Process data.
+    #
+    # Select and rename the required variables from GCB.
+    tb_gcb = tb_gcb[list(COLUMNS_GCB)].rename(columns=COLUMNS_GCB, errors="raise")
+
+    # Select and rename the required variables from WDI.
+    tb_wdi = tb_wdi[list(COLUMNS_WDI)].rename(columns=COLUMNS_WDI, errors="raise")
+
+    # Combine both tables.
+    tb = tb_gcb.merge(tb_wdi, on=["country", "year"], how="outer", short_name=paths.short_name)
+
+    # Define list of non-index columns.
+    data_columns = [column for column in tb.columns if column not in ["country", "year"]]
+
+    # Remove empty rows.
+    tb = tb.dropna(subset=data_columns, how="all").reset_index(drop=True)
+
+    # Select data for all countries at the start year.
+    tb_start = tb[(tb["year"] == START_YEAR)].reset_index(drop=True)
+
+    # Select data for all countries at the final year.
+    tb_end = tb[tb["year"] == END_YEAR].reset_index(drop=True)
+
+    # Add columns for data on the final year to the main table.
+    tb = tb_start.merge(tb_end, on="country", how="left", suffixes=("_start_year", "_final_year"))
+
+    # Add percent changes.
+    for column in data_columns:
+        tb[f"{column}_change"] = (
+            (tb[f"{column}_final_year"] - tb[f"{column}_start_year"]) / tb[f"{column}_start_year"] * 100
+        )
+
+    # Remove unnecessary columns.
+    tb = tb.drop(columns=[column for column in tb.columns if "year" in column])
+
+    # Drop rows that miss any of the main columns.
+    tb = tb.dropna(how="any").reset_index(drop=True)
+
+    # Set an appropriate index and sort conveniently.
+    tb = tb.set_index(["country"], verify_integrity=True).sort_index()
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset.
+    ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb], check_variables_metadata=True, formats=["csv"])
+    ds_garden.save()
+
+
+# To quickly inspect the decoupling of GDP per capita vs consumption-based emissions per capita, use this function.
+# def plot_decoupling(tb, countries=None):
+#     import plotly.express as px
+#     import owid.catalog.processing as pr
+#     from tqdm.auto import tqdm

+#     column = "gdp_per_capita_change"
+#     emissions_column = "consumption_emissions_per_capita_change"
+#     _tb = tb.reset_index().astype({"country": str})[["country", column, emissions_column]]
+#     _tb["year"] = START_YEAR
+#     if countries is None:
+#         countries = sorted(set(_tb["country"]))
+#     for country in tqdm(countries):
+#         tb_old = _tb[_tb["country"] == country].reset_index(drop=True)
+#         if (tb_old[emissions_column].isna().all()) or (tb_old[column].isna().all()):
+#             continue
+#         title = tb_old[column].metadata.title or column
+#         tb_new = tb_old.copy()
+#         tb_new["year"] = END_YEAR
+#         tb_old[column] = 0
+#         tb_old[emissions_column] = 0
+#         tb_plot = pr.concat([tb_old, tb_new], ignore_index=True)
+#         tb_plot = tb_plot.melt(id_vars=["country", "year"], var_name="Indicator")
+#         plot = px.line(tb_plot, x="year", y="value", color="Indicator", title=f"{country} - {title}")
+#         plot.show()

+# List of countries currently considered for the static chart:
+# countries = ["Ireland", "Finland", "Sweden", "Denmark", "Netherlands", "Estonia", "United States", "Canada", "Germany",
+# "Belgium", "New Zealand", "Israel", "Japan", "Singapore", "Dominican Republic", "Hungary", "Australia", "Zimbabwe",
+# "Ukraine", "Bulgaria", "Switzerland", "Hong Kong", "Slovakia", "Romania", "Czechia", "Nicaragua", "Nigeria",
+# "Azerbaijan", "Slovenia", "Croatia"]
+# Check that the chosen countries still fulfil the expected conditions.
+# print("Countries in the list where GDP has increased less than 5% or emissions have decreased less than 5%:") +# for c in countries: +# if not tb.loc[c]["consumption_emissions_per_capita_change"] < -5: +# print("emissions", c, tb.loc[c]["consumption_emissions_per_capita_change"]) +# if not tb.loc[c]["gdp_per_capita_change"] > 5: +# print("gdp", c, tb.loc[c]["gdp_per_capita_change"]) + +# If not, print other countries that do fulfil the conditions and are not in the chart. +# other_countries = sorted(set(tb.index) - set(countries)) +# for c in other_countries: +# if (tb.loc[c]["consumption_emissions_per_capita_change"] < -5) and (tb.loc[c]["gdp_per_capita_change"] > 5): +# print(c, f' -> GDP: {tb.loc[c]["gdp_per_capita_change"]: .1f}%, Emissions: {tb.loc[c]["consumption_emissions_per_capita_change"]:.1f}%') + +# plot_decoupling(tb, countries=countries) diff --git a/etl/steps/data/garden/emissions/2024-11-13/owid_co2.py b/etl/steps/data/garden/emissions/2024-11-13/owid_co2.py new file mode 100644 index 00000000000..c93ad47b92b --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-11-13/owid_co2.py @@ -0,0 +1,490 @@ +"""Garden step that combines various datasets related to greenhouse emissions and produces the OWID CO2 dataset. + +Datasets combined: +* Global Carbon Budget - Global Carbon Project. +* National contributions to climate change - Jones et al. +* Greenhouse gas emissions by sector - Climate Watch. +* Primary energy consumption - EI & EIA. + +Additionally, OWID's regions dataset, population dataset and Maddison Project Database (Bolt and van Zanden, 2023) on +GDP are included. + +""" + +import re + +import numpy as np +import pandas as pd +from owid.catalog import Dataset, Origin, Table +from structlog import get_logger + +from etl.helpers import PathFinder, create_dataset + +# Initialize logger. +log = get_logger() + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Conversion factor from tonnes to million tonnes. +TONNES_TO_MILLION_TONNES = 1e-6 + +# Select columns to use from each dataset, and how to rename them. 
+GCP_COLUMNS = { + "country": "country", + "year": "year", + "emissions_total": "co2", + "emissions_total_per_capita": "co2_per_capita", + "traded_emissions": "trade_co2", + "emissions_from_cement": "cement_co2", + "emissions_from_cement_per_capita": "cement_co2_per_capita", + "emissions_from_coal": "coal_co2", + "emissions_from_coal_per_capita": "coal_co2_per_capita", + "emissions_from_flaring": "flaring_co2", + "emissions_from_flaring_per_capita": "flaring_co2_per_capita", + "emissions_from_gas": "gas_co2", + "emissions_from_gas_per_capita": "gas_co2_per_capita", + "emissions_from_oil": "oil_co2", + "emissions_from_oil_per_capita": "oil_co2_per_capita", + "emissions_from_other_industry": "other_industry_co2", + "emissions_from_other_industry_per_capita": "other_co2_per_capita", + "pct_growth_emissions_total": "co2_growth_prct", + "growth_emissions_total": "co2_growth_abs", + "emissions_total_per_gdp": "co2_per_gdp", + "emissions_total_per_unit_energy": "co2_per_unit_energy", + "consumption_emissions": "consumption_co2", + "consumption_emissions_per_capita": "consumption_co2_per_capita", + "consumption_emissions_per_gdp": "consumption_co2_per_gdp", + "cumulative_emissions_total": "cumulative_co2", + "cumulative_emissions_from_cement": "cumulative_cement_co2", + "cumulative_emissions_from_coal": "cumulative_coal_co2", + "cumulative_emissions_from_flaring": "cumulative_flaring_co2", + "cumulative_emissions_from_gas": "cumulative_gas_co2", + "cumulative_emissions_from_oil": "cumulative_oil_co2", + "cumulative_emissions_from_other_industry": "cumulative_other_co2", + "pct_traded_emissions": "trade_co2_share", + "emissions_total_as_share_of_global": "share_global_co2", + "emissions_from_cement_as_share_of_global": "share_global_cement_co2", + "emissions_from_coal_as_share_of_global": "share_global_coal_co2", + "emissions_from_flaring_as_share_of_global": "share_global_flaring_co2", + "emissions_from_gas_as_share_of_global": "share_global_gas_co2", + "emissions_from_oil_as_share_of_global": "share_global_oil_co2", + "emissions_from_other_industry_as_share_of_global": "share_global_other_co2", + "cumulative_emissions_total_as_share_of_global": "share_global_cumulative_co2", + "cumulative_emissions_from_cement_as_share_of_global": "share_global_cumulative_cement_co2", + "cumulative_emissions_from_coal_as_share_of_global": "share_global_cumulative_coal_co2", + "cumulative_emissions_from_flaring_as_share_of_global": "share_global_cumulative_flaring_co2", + "cumulative_emissions_from_gas_as_share_of_global": "share_global_cumulative_gas_co2", + "cumulative_emissions_from_oil_as_share_of_global": "share_global_cumulative_oil_co2", + "cumulative_emissions_from_other_industry_as_share_of_global": "share_global_cumulative_other_co2", + # New variables, related to land-use change emissions. 
+ "cumulative_emissions_from_land_use_change": "cumulative_luc_co2", + "cumulative_emissions_from_land_use_change_as_share_of_global": "share_global_cumulative_luc_co2", + "cumulative_emissions_total_including_land_use_change": "cumulative_co2_including_luc", + "cumulative_emissions_total_including_land_use_change_as_share_of_global": "share_global_cumulative_co2_including_luc", + "emissions_from_land_use_change": "land_use_change_co2", + "emissions_from_land_use_change_as_share_of_global": "share_global_luc_co2", + "emissions_from_land_use_change_per_capita": "land_use_change_co2_per_capita", + "emissions_total_including_land_use_change": "co2_including_luc", + "emissions_total_including_land_use_change_as_share_of_global": "share_global_co2_including_luc", + "emissions_total_including_land_use_change_per_capita": "co2_including_luc_per_capita", + "emissions_total_including_land_use_change_per_gdp": "co2_including_luc_per_gdp", + "emissions_total_including_land_use_change_per_unit_energy": "co2_including_luc_per_unit_energy", + "growth_emissions_total_including_land_use_change": "co2_including_luc_growth_abs", + "pct_growth_emissions_total_including_land_use_change": "co2_including_luc_growth_prct", +} +JONES_COLUMNS = { + "country": "country", + "year": "year", + "temperature_response_co2_total": "temperature_change_from_co2", + "temperature_response_ghg_total": "temperature_change_from_ghg", + "temperature_response_ch4_total": "temperature_change_from_ch4", + "temperature_response_n2o_total": "temperature_change_from_n2o", + "share_of_temperature_response_ghg_total": "share_of_temperature_change_from_ghg", +} +CLIMATE_WATCH_GHG_COLUMNS = { + "country": "country", + "year": "year", + "total_ghg_emissions_excluding_lucf": "total_ghg_excluding_lucf", + "total_ghg_emissions_excluding_lucf_per_capita": "ghg_excluding_lucf_per_capita", + "total_ghg_emissions_including_lucf": "total_ghg", + "total_ghg_emissions_including_lucf_per_capita": "ghg_per_capita", +} +CLIMATE_WATCH_CH4_COLUMNS = { + "country": "country", + "year": "year", + "total_ch4_emissions_including_lucf": "methane", + "total_ch4_emissions_including_lucf_per_capita": "methane_per_capita", +} +CLIMATE_WATCH_N2O_COLUMNS = { + "country": "country", + "year": "year", + "total_n2o_emissions_including_lucf": "nitrous_oxide", + "total_n2o_emissions_including_lucf_per_capita": "nitrous_oxide_per_capita", +} +PRIMARY_ENERGY_COLUMNS = { + "country": "country", + "year": "year", + "primary_energy_consumption__twh": "primary_energy_consumption", + "primary_energy_consumption_per_capita__kwh": "energy_per_capita", + "primary_energy_consumption_per_gdp__kwh_per_dollar": "energy_per_gdp", +} +REGIONS_COLUMNS = { + "name": "country", + "iso_alpha3": "iso_code", +} +POPULATION_COLUMNS = { + "country": "country", + "year": "year", + "population": "population", +} +GDP_COLUMNS = { + "country": "country", + "year": "year", + "gdp": "gdp", +} + +UNITS = {"tonnes": {"conversion": TONNES_TO_MILLION_TONNES, "new_unit": "million tonnes", "new_short_unit": "Mt"}} + + +def convert_units(table: Table) -> Table: + """Convert units of table. + + Parameters + ---------- + table : Table + Data with its original units. + + Returns + ------- + Table + Data after converting units of specific columns. + + """ + table = table.copy() + # Check units and convert to more convenient ones. 
+    for column in table.columns:
+        unit = table[column].metadata.unit
+        title = table[column].metadata.title
+        description_short = table[column].metadata.description or table[column].metadata.description_short
+        if unit in list(UNITS):
+            table[column] *= UNITS[unit]["conversion"]
+            table[column].metadata.unit = UNITS[unit]["new_unit"]
+            table[column].metadata.short_unit = UNITS[unit]["new_short_unit"]
+            table[column].metadata.title = title.replace(unit, UNITS[unit]["new_unit"])
+            table[column].metadata.description_short = description_short.replace(unit, UNITS[unit]["new_unit"])
+
+    return table
+
+
+def combine_tables(
+    tb_gcp: Table,
+    tb_jones: Table,
+    tb_climate_watch_ghg: Table,
+    tb_climate_watch_ch4: Table,
+    tb_climate_watch_n2o: Table,
+    tb_energy: Table,
+    tb_gdp: Table,
+    tb_population: Table,
+    tb_regions: Table,
+) -> Table:
+    """Combine tables.
+
+    Parameters
+    ----------
+    tb_gcp : Table
+        Global Carbon Budget table (from Global Carbon Project).
+    tb_jones : Table
+        National contributions to climate change (from Jones et al. 2023).
+    tb_climate_watch_ghg : Table
+        Greenhouse gas emissions table (from Climate Watch).
+    tb_climate_watch_ch4 : Table
+        CH4 emissions table (from Climate Watch).
+    tb_climate_watch_n2o : Table
+        N2O emissions table (from Climate Watch).
+    tb_energy : Table
+        Primary energy consumption table (from EI & EIA).
+    tb_gdp : Table
+        Maddison GDP table (from GGDC).
+    tb_population : Table
+        OWID population table (from various sources).
+    tb_regions : Table
+        OWID regions table.
+
+    Returns
+    -------
+    combined : Table
+        Combined table with metadata and variables metadata.
+
+    """
+    # Combine main tables (with an outer join, to gather all entities from all tables).
+    combined = tb_gcp.copy()
+    for table in [tb_jones, tb_climate_watch_ghg, tb_climate_watch_ch4, tb_climate_watch_n2o]:
+        combined = combined.merge(table, on=["country", "year"], how="outer", short_name=paths.short_name)
+
+    # Add secondary tables (with a left join, to keep only entities for which we have emissions data).
+    for table in [tb_energy, tb_gdp, tb_population]:
+        combined = combined.merge(table, on=["country", "year"], how="left")
+
+    # Countries-regions dataset does not have a year column, so it has to be merged on country.
+    combined = combined.merge(tb_regions, on="country", how="left")
+
+    # Check that there are no repeated column names.
+    error = "Repeated columns in combined data."
+    assert len([column for column in set(combined.columns) if "_x" in column]) == 0, error
+
+    # Adjust units.
+    combined = convert_units(combined)
+
+    return combined
+
+
+def prepare_outputs(combined: Table, ds_regions: Dataset) -> Table:
+    """Clean and prepare output table.
+
+    Parameters
+    ----------
+    combined : Table
+        Combined table.
+    ds_regions : Dataset
+        Regions dataset, only used to get its version.
+
+    Returns
+    -------
+    combined: Table
+        Cleaned combined table.
+
+    """
+    # Remove rows where all data columns are nan (ignoring whether country, year, iso_code, population and gdp have data).
+    columns_that_must_have_data = [
+        column for column in combined.columns if column not in ["country", "year", "iso_code", "population", "gdp"]
+    ]
+    combined = combined.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True)
+
+    # Add metadata to the ISO column (loaded from the regions dataset).
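The join strategy in combine_tables above, reduced to plain pandas with toy data: outer joins gather every entity present in any emissions table, while left joins prevent the secondary tables (energy, GDP, population) from introducing new country-years. The trailing assertion mirrors the "_x" check that guards against colliding column names:

import pandas as pd

emissions = pd.DataFrame({"country": ["France", "Chad"], "year": [2020, 2020], "co2": [1.0, 2.0]})
methane = pd.DataFrame({"country": ["France"], "year": [2020], "methane": [3.0]})
population = pd.DataFrame({"country": ["France", "Spain"], "year": [2020, 2020], "population": [6.7e7, 4.7e7]})

combined = emissions.merge(methane, on=["country", "year"], how="outer")  # keeps Chad
combined = combined.merge(population, on=["country", "year"], how="left")  # does not add Spain

# Overlapping non-key columns would come back as "<name>_x"/"<name>_y".
assert not [column for column in combined.columns if column.endswith("_x")]
print(combined)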
+ combined["iso_code"].m.origins = [ + Origin( + producer="International Organization for Standardization", + title="Regions", + date_published=ds_regions.version, + ) + ] + combined["iso_code"].metadata.title = "ISO code" + combined["iso_code"].metadata.description_short = "ISO 3166-1 alpha-3 three-letter country codes." + combined["iso_code"].metadata.unit = "" + + # Sanity check. + columns_with_inf = [column for column in combined.columns if len(combined[combined[column] == np.inf]) > 0] + assert len(columns_with_inf) == 0, f"Infinity values detected in columns: {columns_with_inf}" + + # Sort rows and columns conveniently. + first_columns = ["country", "year", "iso_code", "population", "gdp"] + combined = combined[first_columns + [column for column in sorted(combined.columns) if column not in first_columns]] + + # Improve table format. + combined = combined.format() + + return combined + + +def remove_details_on_demand(text: str) -> str: + # Remove references to details on demand from a text. + # Example: "This is a [description](#dod:something)." -> "This is a description." + regex = r"\(\#dod\:.*\)" + if "(#dod:" in text: + text = re.sub(regex, "", text).replace("[", "").replace("]", "") + + return text + + +def prepare_codebook(tb: Table) -> pd.DataFrame: + table = tb.reset_index() + + # Manually create an origin for the regions dataset. + regions_origin = [Origin(producer="Our World in Data", title="Regions", date_published=str(table["year"].max()))] + + # Manually edit some of the metadata fields. + table["country"].metadata.title = "Country" + table["country"].metadata.description_short = "Geographic location." + table["country"].metadata.description = None + table["country"].metadata.unit = "" + table["country"].metadata.origins = regions_origin + table["year"].metadata.title = "Year" + table["year"].metadata.description_short = "Year of observation." + table["year"].metadata.description = None + table["year"].metadata.unit = "" + table["year"].metadata.origins = regions_origin + + #################################################################################################################### + if table["population"].metadata.description is None: + print("WARNING: Column population has no longer a description field. Remove this part of the code") + else: + table["population"].metadata.description = None + + #################################################################################################################### + + # Gather column names, titles, short descriptions, unit and origins from the indicators' metadata. + metadata = {"column": [], "description": [], "unit": [], "source": []} + for column in table.columns: + metadata["column"].append(column) + + if hasattr(table[column].metadata, "description") and table[column].metadata.description is not None: + print(f"WARNING: Column {column} still has a 'description' field.") + # Prepare indicator's description. + description = "" + if ( + hasattr(table[column].metadata.presentation, "title_public") + and table[column].metadata.presentation.title_public is not None + ): + description += table[column].metadata.presentation.title_public + else: + description += table[column].metadata.title + if table[column].metadata.description_short: + description += f" - {table[column].metadata.description_short}" + description = remove_details_on_demand(description) + metadata["description"].append(description) + + # Prepare indicator's unit. 
+ if table[column].metadata.unit is None: + print(f"WARNING: Column {column} does not have a unit.") + unit = "" + else: + unit = table[column].metadata.unit + metadata["unit"].append(unit) + + # Gather unique origins of current variable. + unique_sources = [] + for origin in table[column].metadata.origins: + # Construct the source name from the origin's attribution. + # If not defined, build it using the default format "Producer - Data product (year)". + source_name = ( + origin.attribution + or f"{origin.producer} - {origin.title or origin.title_snapshot} ({origin.date_published.split('-')[0]})" + ) + + # Add url at the end of the source. + if origin.url_main: + source_name += f" [{origin.url_main}]" + + # Add the source to the list of unique sources. + if source_name not in unique_sources: + unique_sources.append(source_name) + + # Concatenate all sources. + sources_combined = "; ".join(unique_sources) + metadata["source"].append(sources_combined) + + # Create a dataframe with the gathered metadata and sort conveniently by column name. + codebook = pd.DataFrame(metadata).set_index("column").sort_index() + # For clarity, ensure column descriptions are in the same order as the columns in the data. + first_columns = ["country", "year", "iso_code", "population", "gdp"] + codebook = pd.concat([codebook.loc[first_columns], codebook.drop(first_columns, errors="raise")]).reset_index() + # Create a table with the appropriate metadata. + codebook = Table(codebook).format( + keys=["column"], sort_rows=False, sort_columns=False, short_name="owid_co2_codebook" + ) + codebook_origin = [ + Origin(producer="Our World in Data", title="CO2-data codebook", date_published=str(table["year"].max())) + ] + for column in ["description", "unit", "source"]: + codebook[column].metadata.origins = codebook_origin + + return codebook + + +def sanity_check_outputs(tb: Table, tb_codebook: Table) -> None: + error = "Dataset columns should coincide with the codebook 'columns'." + assert set(tb_codebook.reset_index()["column"]) == set(tb.reset_index().columns), error + + error = "All rows in dataset should contain at least one non-NaN value." + assert not tb.isnull().all(axis=1).any(), error + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load the global carbon budget dataset from the Global Carbon Project (GCP). + ds_gcp = paths.load_dataset("global_carbon_budget") + + # Load the Jones et al. (2023) dataset on national contributions to climate change. + ds_jones = paths.load_dataset("national_contributions") + + # Load the greenhouse gas emissions by sector dataset by Climate Watch. + ds_climate_watch = paths.load_dataset("emissions_by_sector") + + # Load the GDP dataset by GGDC Maddison. + ds_gdp = paths.load_dataset("maddison_project_database") + + # Load primary energy consumption dataset (by different sources in our 'energy' namespace). + ds_energy = paths.load_dataset("primary_energy_consumption") + + # Load population dataset. + ds_population = paths.load_dataset("population") + + # Load countries-regions dataset (required to get ISO codes). + ds_regions = paths.load_dataset("regions") + + # Gather all required tables from all datasets. 
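The source names in the codebook loop above come from a fallback chain: an origin's explicit attribution wins; otherwise a "Producer - Title (year)" string is built, with the main URL appended at the end. A sketch with a hypothetical stand-in for owid.catalog.Origin:

from dataclasses import dataclass
from typing import Optional

@dataclass
class Origin:  # minimal stand-in for owid.catalog.Origin
    producer: str
    title: str
    date_published: str
    attribution: Optional[str] = None
    url_main: Optional[str] = None

origin = Origin(producer="Global Carbon Project", title="Global Carbon Budget", date_published="2024-11-13")
source_name = origin.attribution or f"{origin.producer} - {origin.title} ({origin.date_published.split('-')[0]})"
if origin.url_main:
    source_name += f" [{origin.url_main}]"
print(source_name)  # Global Carbon Project - Global Carbon Budget (2024)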
+    tb_gcp = ds_gcp["global_carbon_budget"]
+    tb_jones = ds_jones["national_contributions"]
+    tb_climate_watch_ghg = ds_climate_watch["greenhouse_gas_emissions_by_sector"]
+    tb_climate_watch_ch4 = ds_climate_watch["methane_emissions_by_sector"]
+    tb_climate_watch_n2o = ds_climate_watch["nitrous_oxide_emissions_by_sector"]
+    tb_energy = ds_energy["primary_energy_consumption"]
+    tb_gdp = ds_gdp["maddison_project_database"]
+    tb_population = ds_population["population"]
+    tb_regions = ds_regions["regions"]
+
+    #
+    # Process data.
+    #
+    # Choose required columns and rename them.
+    tb_gcp = tb_gcp.reset_index()[list(GCP_COLUMNS)].rename(columns=GCP_COLUMNS, errors="raise")
+    tb_jones = tb_jones.reset_index()[list(JONES_COLUMNS)].rename(columns=JONES_COLUMNS, errors="raise")
+    tb_climate_watch_ghg = tb_climate_watch_ghg.reset_index()[list(CLIMATE_WATCH_GHG_COLUMNS)].rename(
+        columns=CLIMATE_WATCH_GHG_COLUMNS, errors="raise"
+    )
+    tb_climate_watch_ch4 = tb_climate_watch_ch4.reset_index()[list(CLIMATE_WATCH_CH4_COLUMNS)].rename(
+        columns=CLIMATE_WATCH_CH4_COLUMNS, errors="raise"
+    )
+    tb_climate_watch_n2o = tb_climate_watch_n2o.reset_index()[list(CLIMATE_WATCH_N2O_COLUMNS)].rename(
+        columns=CLIMATE_WATCH_N2O_COLUMNS, errors="raise"
+    )
+    tb_energy = tb_energy.reset_index()[list(PRIMARY_ENERGY_COLUMNS)].rename(
+        columns=PRIMARY_ENERGY_COLUMNS, errors="raise"
+    )
+    tb_gdp = tb_gdp.reset_index()[list(GDP_COLUMNS)].rename(columns=GDP_COLUMNS, errors="raise")
+    tb_population = tb_population.reset_index()[list(POPULATION_COLUMNS)].rename(
+        columns=POPULATION_COLUMNS, errors="raise"
+    )
+    tb_regions = tb_regions.reset_index()[list(REGIONS_COLUMNS)].rename(columns=REGIONS_COLUMNS, errors="raise")
+
+    # Combine tables.
+    combined = combine_tables(
+        tb_gcp=tb_gcp,
+        tb_jones=tb_jones,
+        tb_climate_watch_ghg=tb_climate_watch_ghg,
+        tb_climate_watch_ch4=tb_climate_watch_ch4,
+        tb_climate_watch_n2o=tb_climate_watch_n2o,
+        tb_energy=tb_energy,
+        tb_gdp=tb_gdp,
+        tb_population=tb_population,
+        tb_regions=tb_regions,
+    )
+
+    # Prepare output data table.
+    tb = prepare_outputs(combined=combined, ds_regions=ds_regions)
+
+    # Prepare codebook.
+    tb_codebook = prepare_codebook(tb=tb)
+
+    # Sanity check.
+    sanity_check_outputs(tb=tb, tb_codebook=tb_codebook)
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset.
+    ds_garden = create_dataset(dest_dir, tables=[tb, tb_codebook], check_variables_metadata=True)
+    ds_garden.save()
diff --git a/etl/steps/data/garden/emissions/2024-11-21/gdp_and_co2_decoupling.meta.yml b/etl/steps/data/garden/emissions/2024-11-21/gdp_and_co2_decoupling.meta.yml
new file mode 100644
index 00000000000..7d8a0eb20e4
--- /dev/null
+++ b/etl/steps/data/garden/emissions/2024-11-21/gdp_and_co2_decoupling.meta.yml
@@ -0,0 +1,3 @@
+dataset:
+  title: Decoupling of GDP and CO2 emissions
+  update_period_days: 365
diff --git a/etl/steps/data/garden/emissions/2024-11-21/gdp_and_co2_decoupling.py b/etl/steps/data/garden/emissions/2024-11-21/gdp_and_co2_decoupling.py
new file mode 100644
index 00000000000..c0de7bc239f
--- /dev/null
+++ b/etl/steps/data/garden/emissions/2024-11-21/gdp_and_co2_decoupling.py
@@ -0,0 +1,156 @@
+"""This step takes the Global Carbon Budget and GDP data from World Bank's World Development Indicators, and creates a
+dataset with the changes in emissions and GDP over time.
+
+We already have an interactive chart showing similar data,
+for per capita GDP and per capita, consumption-based CO2 emissions:
+https://ourworldindata.org/grapher/co2-emissions-and-gdp
+
+The data from this step is used in this static chart:
+https://drive.google.com/file/d/1PflfQpr4mceVWRSGEqMP6Gbo1tFQZzOp/view?usp=sharing
+
+"""
+
+from structlog import get_logger
+
+from etl.helpers import PathFinder, create_dataset
+
+log = get_logger()
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+# First and final years whose (per capita) GDP and emissions will be compared.
+START_YEAR = 2006
+END_YEAR = 2021
+
+# Columns to select from WDI, and how to rename them.
+COLUMNS_WDI = {
+    "country": "country",
+    "year": "year",
+    # GDP, PPP (constant 2017 international $)
+    # "ny_gdp_mktp_pp_kd": "gdp",
+    # GDP per capita, PPP (constant 2017 international $)
+    "ny_gdp_pcap_pp_kd": "gdp_per_capita",
+}
+
+# Columns to select from GCB, and how to rename them.
+COLUMNS_GCB = {
+    "country": "country",
+    "year": "year",
+    # "emissions_total": "production_emissions",
+    # "emissions_total_per_capita": "production_emissions_per_capita",
+    # "consumption_emissions": "consumption_emissions",
+    "consumption_emissions_per_capita": "consumption_emissions_per_capita",
+    # 'emissions_total_including_land_use_change': "",
+    # 'emissions_total_including_land_use_change_per_capita': "",
+}
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load Global Carbon Budget dataset and read its main table.
+    ds_gcb = paths.load_dataset("global_carbon_budget")
+    tb_gcb = ds_gcb["global_carbon_budget"].reset_index()
+
+    # Load WDI dataset, read its main table.
+    ds_wdi = paths.load_dataset("wdi")
+    tb_wdi = ds_wdi["wdi"].reset_index()
+
+    #
+    # Process data.
+    #
+    # Select and rename the required variables from GCB.
+    tb_gcb = tb_gcb[list(COLUMNS_GCB)].rename(columns=COLUMNS_GCB, errors="raise")
+
+    # Select and rename the required variables from WDI.
+    tb_wdi = tb_wdi[list(COLUMNS_WDI)].rename(columns=COLUMNS_WDI, errors="raise")
+
+    # Combine both tables.
+    tb = tb_gcb.merge(tb_wdi, on=["country", "year"], how="outer", short_name=paths.short_name)
+
+    # Define list of non-index columns.
+    data_columns = [column for column in tb.columns if column not in ["country", "year"]]
+
+    # Remove empty rows.
+    tb = tb.dropna(subset=data_columns, how="all").reset_index(drop=True)
+
+    # Select data for all countries at the start year.
+    tb_start = tb[(tb["year"] == START_YEAR)].reset_index(drop=True)
+
+    # Select data for all countries at the final year.
+    tb_end = tb[tb["year"] == END_YEAR].reset_index(drop=True)
+
+    # Add columns for data on the final year to the main table.
+    tb = tb_start.merge(tb_end, on="country", how="left", suffixes=("_start_year", "_final_year"))
+
+    # Add percent changes.
+    for column in data_columns:
+        tb[f"{column}_change"] = (
+            (tb[f"{column}_final_year"] - tb[f"{column}_start_year"]) / tb[f"{column}_start_year"] * 100
+        )
+
+    # Remove unnecessary columns.
+    tb = tb.drop(columns=[column for column in tb.columns if "year" in column])
+
+    # Drop rows that miss any of the main columns.
+    tb = tb.dropna(how="any").reset_index(drop=True)
+
+    # Set an appropriate index and sort conveniently.
+    tb = tb.set_index(["country"], verify_integrity=True).sort_index()
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset.
+ ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb], check_variables_metadata=True, formats=["csv"]) + ds_garden.save() + + +# To quickly inspect the decoupling of GDP per capita vs consumption-based emissions per capita, use this function. +# def plot_decoupling(tb, countries=None): +# import plotly.express as px +# import owid.catalog.processing as pr +# from tqdm.auto import tqdm + +# column = "gdp_per_capita_change" +# emissions_column = "consumption_emissions_per_capita_change" +# _tb = tb.reset_index().astype({"country": str})[["country", column, emissions_column]] +# _tb["year"] = START_YEAR +# if countries is None: +# countries = sorted(set(_tb["country"])) +# for country in tqdm(countries): +# tb_old = _tb[_tb["country"] == country].reset_index(drop=True) +# if (tb_old[emissions_column].isna().all()) or (tb_old[column].isna().all()): +# continue +# title = tb_old[column].metadata.title or column +# tb_new = tb_old.copy() +# tb_new["year"] = END_YEAR +# tb_old[column] = 0 +# tb_old[emissions_column] = 0 +# tb_plot = pr.concat([tb_old, tb_new], ignore_index=True) +# tb_plot = tb_plot.melt(id_vars=["country", "year"], var_name="Indicator") +# plot = px.line(tb_plot, x="year", y="value", color="Indicator", title=f"{country} - {title}") +# plot.show() + +# List of countries currently considered for the static chart: +# countries = ["Ireland", "Finland", "Sweden", "Denmark", "Netherlands", "Estonia", "United States", "Canada", "Germany", +# "Belgium", "New Zealand", "Israel", "Japan", "Singapore", "Dominican Republic", "Hungary", "Australia", "Zimbabwe", +# "Ukraine", "Bulgaria", "Switzerland", "Hong Kong", "Slovakia", "Romania", "Czechia", "Nicaragua", "Nigeria", +# "Azerbaijan", "Slovenia", "Croatia"] +# Check that the chosen countries still fulfil the expected conditions. +# print("Countries in the list where GDP has increased less than 5% or emissions have decreased less than 5%:") +# for c in countries: +# if not tb.loc[c]["consumption_emissions_per_capita_change"] < -5: +# print("emissions", c, tb.loc[c]["consumption_emissions_per_capita_change"]) +# if not tb.loc[c]["gdp_per_capita_change"] > 5: +# print("gdp", c, tb.loc[c]["gdp_per_capita_change"]) + +# If not, print other countries that do fulfil the conditions and are not in the chart. 
+# other_countries = sorted(set(tb.index) - set(countries)) +# for c in other_countries: +# if (tb.loc[c]["consumption_emissions_per_capita_change"] < -5) and (tb.loc[c]["gdp_per_capita_change"] > 5): +# print(c, f' -> GDP: {tb.loc[c]["gdp_per_capita_change"]: .1f}%, Emissions: {tb.loc[c]["consumption_emissions_per_capita_change"]:.1f}%') + +# plot_decoupling(tb, countries=countries) diff --git a/etl/steps/archive/garden/emissions/2023-05-02/national_contributions.countries.json b/etl/steps/data/garden/emissions/2024-11-21/national_contributions.countries.json similarity index 89% rename from etl/steps/archive/garden/emissions/2023-05-02/national_contributions.countries.json rename to etl/steps/data/garden/emissions/2024-11-21/national_contributions.countries.json index 10202052c29..f426d423849 100644 --- a/etl/steps/archive/garden/emissions/2023-05-02/national_contributions.countries.json +++ b/etl/steps/data/garden/emissions/2024-11-21/national_contributions.countries.json @@ -31,7 +31,6 @@ "British Virgin Islands": "British Virgin Islands", "Brunei Darussalam": "Brunei", "Bulgaria": "Bulgaria", - "Bunkers": "International transport", "Burkina Faso": "Burkina Faso", "Burundi": "Burundi", "Cambodia": "Cambodia", @@ -52,7 +51,7 @@ "Cuba": "Cuba", "Cura\u00e7ao": "Curacao", "Cyprus": "Cyprus", - "Czech Republic": "Czechia", + "Czechia": "Czechia", "C\u00f4te d'Ivoire": "Cote d'Ivoire", "Democratic Republic of the Congo": "Democratic Republic of Congo", "Denmark": "Denmark", @@ -158,7 +157,6 @@ "Philippines": "Philippines", "Poland": "Poland", "Portugal": "Portugal", - "Puerto Rico": "Puerto Rico", "Qatar": "Qatar", "Romania": "Romania", "Russia": "Russia", @@ -201,7 +199,7 @@ "Tonga": "Tonga", "Trinidad and Tobago": "Trinidad and Tobago", "Tunisia": "Tunisia", - "Turkey": "Turkey", + "Türkiye": "Turkey", "Turkmenistan": "Turkmenistan", "Turks and Caicos Islands": "Turks and Caicos Islands", "Tuvalu": "Tuvalu", @@ -219,13 +217,8 @@ "Yemen": "Yemen", "Zambia": "Zambia", "Zimbabwe": "Zimbabwe", - "LDC": "Least developed countries (Jones et al. 2023)", - "OECD": "OECD (Jones et al. 2023)", - "French Equatorial Africa": "French Equatorial Africa (Jones et al. 2023)", - "French West Africa": "French West Africa (Jones et al. 2023)", - "Kuwaiti Oil Fires": "Kuwaiti Oil Fires (Jones et al. 2023)", - "Leeward Islands": "Leeward Islands (Jones et al. 2023)", - "Panama Canal Zone": "Panama Canal Zone (Jones et al. 2023)", - "Ryukyu Islands": "Ryukyu Islands (Jones et al. 2023)", - "St. Kitts-Nevis-Anguilla": "St. Kitts-Nevis-Anguilla (Jones et al. 
2023)" + "Kuwaiti Oil Fires": "Kuwaiti Oil Fires", + "Ryukyu Islands": "Ryukyu Islands", + "LDC": "Least developed countries (Jones et al.)", + "OECD": "OECD (Jones et al.)" } diff --git a/etl/steps/archive/garden/emissions/2023-05-02/national_contributions.excluded_countries.json b/etl/steps/data/garden/emissions/2024-11-21/national_contributions.excluded_countries.json similarity index 100% rename from etl/steps/archive/garden/emissions/2023-05-02/national_contributions.excluded_countries.json rename to etl/steps/data/garden/emissions/2024-11-21/national_contributions.excluded_countries.json diff --git a/etl/steps/data/garden/emissions/2024-11-21/national_contributions.meta.yml b/etl/steps/data/garden/emissions/2024-11-21/national_contributions.meta.yml new file mode 100644 index 00000000000..e6b59d5f14d --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-11-21/national_contributions.meta.yml @@ -0,0 +1,444 @@ +definitions: + measured_in_celsius: &measured-in-celsius |- + Measured in °C. + measured_in_tonnes: &measured-in-tonnes |- + Measured in tonnes. + measured_in_tonnes_per_person: &measured-in-tonnes-per-person |- + Measured in tonnes per person. + measured_in_co2_eq: &measured-in-co2-eq |- + Measured in tonnes of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) over a 100-year timescale. + measured_in_co2_eq_per_person: &measured-in-co2-eq-per-person |- + Measured in tonnes per person of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) over a 100-year timescale. + ghg_emissions: &ghg-emissions |- + [Greenhouse gas emissions](#dod:ghgemissions) are measured in tonnes of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) over a 100-year timescale. + ghg_emissions_per_person: &ghg-emissions-per-person |- + [Greenhouse gas emissions](#dod:ghgemissions) are measured in tonnes per person of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) over a 100-year timescale. + processing_methane: &processing-methane |- + Methane emissions in tonnes have been converted to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 29.8 for fossil sources and 27.2 for agricultural and land use sources. These factors are taken from the 6th Assessment Report (AR6) of the Intergovernmental Panel on Climate Change (IPCC). + processing_nitrous_oxide: &processing-nitrous-oxide |- + Nitrous oxide emissions in tonnes have been converted to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273. This factor is taken from the 6th Assessment Report (AR6) of the Intergovernmental Panel on Climate Change (IPCC). + processing_greenhouse_gases: &processing-greenhouse-gases |- + Emissions given in tonnes have been converted to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 for nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane from agricultural and land use sources. These factors are taken from the 6th Assessment Report (AR6) of the Intergovernmental Panel on Climate Change (IPCC). + common: + processing_level: major + presentation: + topic_tags: + - CO2 & Greenhouse Gas Emissions + +dataset: + update_period_days: 365 + description: |- + Jones et al. quantify national and regional contributions to the increase of global mean surface temperature over the last few centuries. + +tables: + national_contributions: + variables: + # Emissions of CH4, CO2, N2O in tonnes (as originally given in the data). 
+ annual_emissions_ch4_fossil: + title: Annual methane emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual methane emissions from fossil fuels and industry + annual_emissions_ch4_land: + title: Annual methane emissions from agriculture and land use + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual methane emissions from agriculture and land use + annual_emissions_ch4_total: + title: Annual methane emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual methane emissions + annual_emissions_co2_fossil: + title: Annual CO₂ emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual CO₂ emissions from fossil fuels and industry + annual_emissions_co2_land: + title: Annual CO₂ emissions from agriculture and land use + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual CO₂ emissions from agriculture and land use + annual_emissions_co2_total: + title: Annual CO₂ emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual CO₂ emissions including land use + annual_emissions_n2o_fossil: + title: Annual nitrous oxide emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual nitrous oxide emissions from fossil fuels and industry + annual_emissions_n2o_land: + title: Annual nitrous oxide emissions from agriculture and land use + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual nitrous oxide emissions from agriculture and land use + annual_emissions_n2o_total: + title: Annual nitrous oxide emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual nitrous oxide emissions + # Emissions (calculated by OWID) of CH4, CO2, N2O in tonnes of CO2eq, as well as combined GHG emissions in CO2eq. 
+ annual_emissions_ghg_fossil_co2eq: + title: Annual greenhouse gas emissions from fossil fuels and industry in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + description_processing: *processing-greenhouse-gases + presentation: + title_public: Annual greenhouse gas emissions from fossil fuels and industry + annual_emissions_ghg_land_co2eq: + title: Annual greenhouse gas emissions from agriculture and land use in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + description_processing: *processing-greenhouse-gases + presentation: + title_public: Annual greenhouse gas emissions from agriculture and land use + annual_emissions_ghg_total_co2eq: + title: Annual greenhouse gas emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + description_processing: *processing-greenhouse-gases + presentation: + title_public: Annual greenhouse gas emissions including land use + annual_emissions_ch4_fossil_co2eq: + title: Annual methane emissions from fossil fuels and industry in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-methane + presentation: + title_public: Annual methane emissions from fossil fuels and industry + annual_emissions_ch4_land_co2eq: + title: Annual methane emissions from agriculture and land use in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-methane + presentation: + title_public: Annual methane emissions from agriculture and land use + annual_emissions_ch4_total_co2eq: + title: Annual methane emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-methane + presentation: + title_public: Annual methane emissions including land use + annual_emissions_n2o_fossil_co2eq: + title: Annual nitrous oxide emissions from fossil fuels and industry in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-nitrous-oxide + presentation: + title_public: Annual nitrous oxide emissions from fossil fuels and industry + annual_emissions_n2o_land_co2eq: + title: Annual nitrous oxide emissions from agriculture and land use in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-nitrous-oxide + presentation: + title_public: Annual nitrous oxide emissions from agriculture and land use + annual_emissions_n2o_total_co2eq: + title: Annual nitrous oxide emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-nitrous-oxide + presentation: + title_public: Annual nitrous oxide emissions including land use + # Cumulative emissions of CH4, CO2, N2O and GHG, in tonnes of CO2eq (as originally given in the data). 
+ cumulative_emissions_ghg_fossil: + title: Cumulative greenhouse gas emissions from fossil fuels and industry + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + presentation: + title_public: Cumulative greenhouse gas emissions from fossil fuels and industry + cumulative_emissions_ghg_land: + title: Cumulative greenhouse gas emissions from agriculture and land use + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + presentation: + title_public: Cumulative greenhouse gas emissions from agriculture and land use + cumulative_emissions_ghg_total: + title: Cumulative greenhouse gas emissions + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + presentation: + title_public: Cumulative greenhouse gas emissions + cumulative_emissions_ch4_fossil: + title: Cumulative methane emissions from fossil fuels and industry + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative methane emissions from fossil fuels and industry + cumulative_emissions_ch4_land: + title: Cumulative methane emissions from agriculture and land use + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative methane emissions from agriculture and land use + cumulative_emissions_ch4_total: + title: Cumulative methane emissions + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative methane emissions + cumulative_emissions_co2_fossil: + title: Cumulative CO₂ emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Cumulative CO₂ emissions from fossil fuels and industry + cumulative_emissions_co2_land: + title: Cumulative CO₂ emissions from agriculture and land use + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Cumulative CO₂ emissions from agriculture and land use + cumulative_emissions_co2_total: + title: Cumulative CO₂ emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Cumulative CO₂ emissions + cumulative_emissions_n2o_fossil: + title: Cumulative nitrous oxide emissions from fossil fuels and industry + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative nitrous oxide emissions from fossil fuels and industry + cumulative_emissions_n2o_land: + title: Cumulative nitrous oxide emissions from agriculture and land use + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative nitrous oxide emissions from agriculture and land use + cumulative_emissions_n2o_total: + title: Cumulative nitrous oxide emissions + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative nitrous oxide emissions + # Temperature response to emissions of CH4, CO2, N2O and GHG, in °C (as originally given in the data). 
+ temperature_response_ghg_fossil: + title: Change in global mean surface temperature caused by greenhouse gas emissions from fossil fuels and industry + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by greenhouse gas emissions from fossil fuels and industry + temperature_response_ghg_land: + title: Change in global mean surface temperature caused by greenhouse gas emissions from agriculture and land use + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by greenhouse gas emissions from agriculture and land use + temperature_response_ghg_total: + title: Change in global mean surface temperature caused by greenhouse gas emissions + unit: °C + short_unit: °C + description_short: *measured-in-celsius + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide, methane and nitrous oxide. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. + presentation: + title_public: Change in global mean surface temperature caused by greenhouse gas emissions + temperature_response_ch4_fossil: + title: Change in global mean surface temperature caused by methane emissions from fossil fuels and industry + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by methane emissions from fossil fuels and industry + temperature_response_ch4_land: + title: Change in global mean surface temperature caused by methane emissions from agriculture and land use + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by methane emissions from agriculture and land use + temperature_response_ch4_total: + title: Change in global mean surface temperature caused by methane emissions + unit: °C + short_unit: °C + description_short: *measured-in-celsius + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of methane. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. 
+ presentation: + title_public: Change in global mean surface temperature caused by methane emissions + temperature_response_co2_fossil: + title: Change in global mean surface temperature caused by CO₂ emissions from fossil fuels and industry + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by CO₂ emissions from fossil fuels and industry + temperature_response_co2_land: + title: Change in global mean surface temperature caused by CO₂ emissions from agriculture and land use + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by CO₂ emissions from agriculture and land use + temperature_response_co2_total: + title: Change in global mean surface temperature caused by CO₂ emissions + unit: °C + short_unit: °C + description_short: *measured-in-celsius + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. + presentation: + title_public: Change in global mean surface temperature caused by CO₂ emissions + temperature_response_n2o_fossil: + title: Change in global mean surface temperature caused by nitrous oxide emissions from fossil fuels and industry + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by nitrous oxide emissions from fossil fuels and industry + temperature_response_n2o_land: + title: Change in global mean surface temperature caused by nitrous oxide emissions from agriculture and land use + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by nitrous oxide emissions from agriculture and land use + temperature_response_n2o_total: + title: Change in global mean surface temperature caused by nitrous oxide emissions + unit: °C + short_unit: °C + description_short: *measured-in-celsius + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of nitrous oxide. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. + presentation: + title_public: Change in global mean surface temperature caused by nitrous oxide emissions + # Share of emissions (calculated by OWID), e.g. methane emissions as a percentage of global methane emissions. + # NOTE: Using CO2eq or tonnes of the original gas is irrelevant when calculated as a share of global emissions. + share_of_annual_emissions_ghg_total: + title: Share of global greenhouse gas emissions + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's greenhouse gas emissions." + description_processing: *processing-greenhouse-gases + presentation: + title_public: Share of global greenhouse gas emissions + share_of_annual_emissions_ch4_total: + title: Share of global methane emissions + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's methane emissions." 
+ presentation: + title_public: Share of global methane emissions + share_of_annual_emissions_co2_total: + title: Share of global CO₂ emissions + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's carbon dioxide emissions." + presentation: + title_public: Share of global CO₂ emissions + share_of_annual_emissions_n2o_total: + title: Share of global nitrous oxide emissions + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's nitrous oxide emissions." + presentation: + title_public: Share of global nitrous oxide emissions + # Share of global temperature change caused by greenhouse gas emissions from each country (calculated by OWID). + share_of_temperature_response_ghg_total: + title: Share of contribution to global warming + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's temperature change." + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide, methane and nitrous oxide. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. + presentation: + title_public: Share of contribution to global warming + # Per capita emissions (calculated by OWID). + annual_emissions_co2_total_per_capita: + title: Per capita CO₂ emissions + unit: tonnes per person + short_unit: t/person + description_short: *measured-in-tonnes-per-person + presentation: + title_public: Per capita CO₂ emissions including land use + annual_emissions_co2_fossil_per_capita: + title: Per capita CO₂ emissions from fossil fuels and industry + unit: tonnes per person + short_unit: t/person + description_short: *measured-in-tonnes-per-person + presentation: + title_public: Per capita CO₂ emissions from fossil fuels and industry + annual_emissions_ch4_total_co2eq_per_capita: + title: Per capita methane emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents per person + short_unit: t/person + description_short: *measured-in-co2-eq-per-person + description_processing: *processing-methane + presentation: + title_public: Per capita methane emissions including land use + annual_emissions_n2o_total_co2eq_per_capita: + title: Per capita nitrous oxide emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents per person + short_unit: t/person + description_short: *measured-in-co2-eq-per-person + description_processing: *processing-nitrous-oxide + presentation: + title_public: Per capita nitrous oxide emissions including land use + annual_emissions_ghg_total_co2eq_per_capita: + title: Per capita greenhouse gas emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents per person + short_unit: t/person + description_short: *ghg-emissions-per-person + description_processing: *processing-greenhouse-gases + presentation: + title_public: Per capita greenhouse gas emissions including land use + annual_emissions_ghg_fossil_co2eq_per_capita: + title: Per capita greenhouse gas emissions from fossil fuels and industry in CO₂ equivalents + unit: tonnes of CO₂ equivalents per person + short_unit: t/person + description_short: *ghg-emissions-per-person + description_processing: *processing-greenhouse-gases + presentation: + title_public: Per capita greenhouse gas emissions from fossil fuels and industry + diff --git a/etl/steps/data/garden/emissions/2024-11-21/national_contributions.py 
b/etl/steps/data/garden/emissions/2024-11-21/national_contributions.py new file mode 100644 index 00000000000..ebe2fb4ae8c --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-11-21/national_contributions.py @@ -0,0 +1,356 @@ +"""Load a meadow dataset and create a garden dataset.""" + + +import owid.catalog.processing as pr +from owid.catalog import Dataset, Table, Variable +from owid.datautils.dataframes import map_series + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Conversion factor to change from teragrams to tonnes. +TERAGRAMS_TO_TONNES = 1e6 +# Conversion factor to change from petagrams to tonnes. +PETAGRAMS_TO_TONNES = 1e9 + +# Conversion factors to change from tonnes of gases emitted to tonnes of CO2 equivalents (taken from IPCC AR6). +CH4_FOSSIL_EMISSIONS_TO_CO2_EQUIVALENTS = 29.8 +CH4_LAND_EMISSIONS_TO_CO2_EQUIVALENTS = 27.2 +N2O_EMISSIONS_TO_CO2_EQUIVALENTS = 273 + +# Gases and components expected to be in the data, and how to rename them. +GASES_RENAMING = { + "3-GHG": "ghg", + "CH[4]": "ch4", + "CO[2]": "co2", + "N[2]*O": "n2o", +} +COMPONENTS_RENAMING = { + "Fossil": "fossil", + "LULUCF": "land", + "Total": "total", +} + +# Columns for which we will create "share" variables, e.g. the percentage of methane emissions that a country produces +# in a year with respect to the world's methane emissions on the same year. +# NOTE: For this calculation, it doesn't matter if we use the total or the CO2-equivalent emissions. +SHARE_VARIABLES = [ + "annual_emissions_ch4_total", + "annual_emissions_co2_total", + "annual_emissions_n2o_total", + "annual_emissions_ghg_total_co2eq", + "temperature_response_ghg_total", +] + +# Columns for which a per-capita variable will be created. +PER_CAPITA_VARIABLES = [ + "annual_emissions_ch4_total_co2eq", + "annual_emissions_co2_total", + "annual_emissions_co2_fossil", + "annual_emissions_n2o_total_co2eq", + "annual_emissions_ghg_total_co2eq", + "annual_emissions_ghg_fossil_co2eq", +] + +# Regions to be added by aggregating data from their member countries. +REGIONS = { + # Default continents. + "Africa": {}, + "Asia": {}, + "Europe": {}, + "North America": {}, + "Oceania": {}, + "South America": {}, + # Income groups. + "Low-income countries": {}, + "Upper-middle-income countries": {}, + "Lower-middle-income countries": {}, + "High-income countries": {}, + # Additional composite regions. + "Asia (excl. China and India)": { + "additional_regions": ["Asia"], + "excluded_members": ["China", "India"], + }, + "Europe (excl. EU-27)": {"additional_regions": ["Europe"], "excluded_regions": ["European Union (27)"]}, + "Europe (excl. EU-28)": { + "additional_regions": ["Europe"], + "excluded_regions": ["European Union (27)"], + "excluded_members": ["United Kingdom"], + }, + "European Union (28)": { + "additional_regions": ["European Union (27)"], + "additional_members": ["United Kingdom"], + }, + "North America (excl. USA)": { + "additional_regions": ["North America"], + "excluded_members": ["United States"], + }, + # EU27 is already included in the original data. + # "European Union (27)": {}, +} + + +def run_sanity_checks_on_inputs(tb): + # Sanity checks. + error = "Names of gases have changed." + assert set(tb["gas"]) == set(GASES_RENAMING), error + error = "Names of components have changed." + assert set(tb["component"]) == set(COMPONENTS_RENAMING), error + error = "Units have changed." 
+ assert set(tb["unit"]) == set( + ["Tg~CH[4]~year^-1", "Pg~CO[2]~year^-1", "Tg~N[2]*O~year^-1", "Pg~CO[2]*-e[100]", "°C"] + ), error + + +def add_kuwaiti_oil_fires_to_kuwait(tb: Table) -> Table: + tb = tb.copy() + + # NOTE: Use this function before harmonizing country names. Otherwise adapt the following definitions. + kuwait = "Kuwait" + oil_fires = "Kuwaiti Oil Fires" + + # Sanity check. + error = f"'{kuwait}' or '{oil_fires}' not found in the data." + assert kuwait in set(tb["country"]), error + assert oil_fires in set(tb["country"]), error + + # Add the emissions from the Kuwaiti oil fires (in 1991) to Kuwait. + tb_kuwait = tb[tb["country"] == kuwait].drop(columns="country").set_index("year") + tb_oil_fires = tb[tb["country"] == oil_fires].drop(columns="country").fillna(0).set_index(["year"]) + tb_combined = (tb_kuwait + tb_oil_fires).reset_index().assign(**{"country": kuwait}) + + # Replace the original data for Kuwait by the combined data. + tb_updated = pr.concat([tb[tb["country"] != kuwait].reset_index(drop=True), tb_combined], ignore_index=True) + + # Sort conveniently. + tb_updated = tb_updated.sort_values(["country", "year"]).reset_index(drop=True) + + return tb_updated + + +def add_emissions_in_co2_equivalents(tb: Table) -> Table: + # Add columns for fossil/land/total emissions of CH4 in terms of CO2 equivalents. + # NOTE: For methane, we apply different conversion factors for fossil and land-use emissions. + tb["annual_emissions_ch4_fossil_co2eq"] = ( + tb["annual_emissions_ch4_fossil"] * CH4_FOSSIL_EMISSIONS_TO_CO2_EQUIVALENTS + ) + tb["annual_emissions_ch4_land_co2eq"] = tb["annual_emissions_ch4_land"] * CH4_LAND_EMISSIONS_TO_CO2_EQUIVALENTS + tb["annual_emissions_ch4_total_co2eq"] = ( + tb["annual_emissions_ch4_fossil_co2eq"] + tb["annual_emissions_ch4_land_co2eq"] + ) + + # Add columns for fossil/land/total emissions of N2O in terms of CO2 equivalents. + # NOTE: For nitrous oxide, we apply the same conversion factors for fossil and land-use emissions. + for component in ["fossil", "land", "total"]: + tb[f"annual_emissions_n2o_{component}_co2eq"] = ( + tb[f"annual_emissions_n2o_{component}"] * N2O_EMISSIONS_TO_CO2_EQUIVALENTS + ) + + # Add columns for fossil/land/total emissions of all GHG in terms of CO2 equivalents. + # NOTE: The file of annual emissions does not include GHG emissions, which is why we need to add them now. + # However, the files of temperature response and cumulative emissions do include GHG emissions. + for component in ["fossil", "land", "total"]: + tb[f"annual_emissions_ghg_{component}_co2eq"] = ( + tb[f"annual_emissions_co2_{component}"] + + tb[f"annual_emissions_ch4_{component}_co2eq"] + + tb[f"annual_emissions_n2o_{component}_co2eq"] + ) + + return tb + + +def add_share_variables(tb: Table) -> Table: + tb = tb.copy() + + # Create "share" variables (percentages with respect to global). + # To do that, first create a separate table for global data, and add it to the main table. + tb_global = tb[tb["country"] == "World"][["year"] + SHARE_VARIABLES].reset_index(drop=True) + + tb = tb.merge(tb_global, on=["year"], how="left", suffixes=("", "_global")) + # For a list of variables, add the percentage with respect to global. + for variable in SHARE_VARIABLES: + new_variable = f"share_of_{variable.replace('_co2eq', '')}" + tb[new_variable] = 100 * tb[variable] / tb[f"{variable}_global"] + + # Drop unnecessary columns for global data. 
+ tb = tb.drop(columns=[column for column in tb.columns if column.endswith("_global")], errors="raise") + + return tb + + +def add_per_capita_variables(tb: Table, ds_population: Dataset) -> Table: + tb = tb.copy() + + # Add population to data. + tb = geo.add_population_to_table( + tb=tb, + ds_population=ds_population, + warn_on_missing_countries=False, + ) + + # Add per-capita variables. + for variable in PER_CAPITA_VARIABLES: + tb[f"{variable}_per_capita"] = tb[variable] / tb["population"] + + # Drop population column. + tb = tb.drop(columns="population", errors="raise") + + return tb + + +def fix_emissions_jump_in_1850(tb: Table) -> Table: + # There is data from 1830 for some variables and from 1850 for others. + # However, when inspecting data between 1830 and 1850 (e.g. annual_emissions_co2_total) there is an abrupt jump + # between 1849 and 1850, which happens for many countries (e.g. Spain, or World). + # This jump seems to be spurious, and therefore we start all time series from 1850. + + # First check that the jump is still in the data. + emissions_before_jump = tb[(tb["country"] == "World") & (tb["year"] == 1849)]["annual_emissions_co2_total"].item() + emissions_after_jump = tb[(tb["country"] == "World") & (tb["year"] == 1850)]["annual_emissions_co2_total"].item() + error = "Spurious jump between 1849 and 1850 is not in the data anymore. Remove this part of the code." + assert emissions_after_jump / emissions_before_jump > 10, error + + # Visually inspect the jump. + # import plotly.express as px + # px.line(tb[tb["country"]=="World"], x="year", y="annual_emissions_co2_total", markers=True) + + # Start all data after the jump. + tb = tb[tb["year"] >= 1850].reset_index(drop=True) + + return tb + + +def run_sanity_checks_on_outputs(tb: Table) -> None: + error = "Share of global emissions cannot be larger than 101%" + assert (tb[[column for column in tb.columns if "share" in column]].max() < 101).all(), error + error = "Share of global emissions was not expected to be smaller than -1%" + # Some countries did contribute negatively to CO2 emissions, however overall the negative contribution is always + # smaller than 1% in absolute value. + assert (tb[[column for column in tb.columns if "share" in column]].min() > -1).all(), error + + # Ensure that no country contributes to emissions more than the entire world. + columns_that_should_be_smaller_than_global = [ + column for column in tb.drop(columns=["country", "year"]).columns if "capita" not in column + ] + tb_global = tb[tb["country"] == "World"].drop(columns="country") + check = pr.merge( + tb[tb["country"] != "World"].reset_index(drop=True), tb_global, on="year", how="left", suffixes=("", "_global") + ) + for column in columns_that_should_be_smaller_than_global: + # It is in principle possible that some region would emit more than the world, if the rest of regions + # were contributing with negative CO2 emissions (e.g. High-income countries in 1854). + # However, the difference should be very small. + error = f"Region contributed to {column} more than the entire world." + assert check[(check[column] - check[f"{column}_global"]) / check[f"{column}_global"] > 0.00001].empty, error + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("national_contributions") + tb = ds_meadow.read("national_contributions") + + # Load regions dataset. + ds_regions = paths.load_dataset("regions") + + # Load income groups dataset. 
+ ds_income_groups = paths.load_dataset("income_groups") + + # Load population dataset. + ds_population = paths.load_dataset("population") + + # + # Process data. + # + # Sanity checks. + run_sanity_checks_on_inputs(tb=tb) + + # Rename gases and components. + tb["gas"] = Variable( + map_series( + series=tb["gas"], mapping=GASES_RENAMING, warn_on_missing_mappings=True, warn_on_unused_mappings=True + ) + ).copy_metadata(tb["gas"]) + tb["component"] = Variable( + map_series( + series=tb["component"], + mapping=COMPONENTS_RENAMING, + warn_on_missing_mappings=True, + warn_on_unused_mappings=True, + ) + ).copy_metadata(tb["component"]) + + # Convert units from teragrams and petagrams to tonnes. + tb.loc[tb["unit"].str.startswith("Tg"), "data"] *= TERAGRAMS_TO_TONNES + tb.loc[tb["unit"].str.startswith("Pg"), "data"] *= PETAGRAMS_TO_TONNES + + # Transpose data. + tb = tb.pivot( + index=["country", "year"], columns=["file", "gas", "component"], values="data", join_column_levels_with="_" + ) + + # We add the emissions from the Kuwaiti oil fires in 1991 (which are also included as a separate country) as part + # of the emissions of Kuwait. + # This ensures that these emissions will be included in aggregates of regions that include Kuwait. + tb = add_kuwaiti_oil_fires_to_kuwait(tb=tb) + + # Harmonize country names. + tb = geo.harmonize_countries( + tb, + countries_file=paths.country_mapping_path, + excluded_countries_file=paths.excluded_countries_path, + ) + + # Replace spurious negative values with zeros (and ensure they are small numbers, within the uncertainty). + columns_that_cannot_be_negative = [column for column in tb.columns if "fossil" in column] + #################################################################################################################### + # TODO: For some reason, cumulative_emissions_ch4_fossil (and therefore cumulative_emissions_ghg_fossil) have + # big negative values. For example for Ireland's value in 2022 is of -2.93e+08! + # I will look into this, but, for now, I'll ignore those negative values (we are not using these indicators in + # any chart). + columns_that_cannot_be_negative = [ + column + for column in columns_that_cannot_be_negative + if column not in ["cumulative_emissions_ch4_fossil", "cumulative_emissions_ghg_fossil"] + ] + #################################################################################################################### + for column in columns_that_cannot_be_negative: + # Ensure all negative values are just numerical noise. + assert (tb[column].fillna(0) >= -2e-4).all() + # Replace those values by zero. + tb[column] = tb[column].clip(lower=0) + + # Add region aggregates. + tb = geo.add_regions_to_table( + tb=tb, ds_regions=ds_regions, ds_income_groups=ds_income_groups, regions=REGIONS, min_num_values_per_year=1 + ) + + # Add columns for emissions in terms of CO2 equivalents. + tb = add_emissions_in_co2_equivalents(tb=tb) + + # Add "share" variables (percentages with respect to global emissions). + tb = add_share_variables(tb=tb) + + # Add per-capita variables. + tb = add_per_capita_variables(tb=tb, ds_population=ds_population) + + # Fix spurious jump in the data in 1850. + tb = fix_emissions_jump_in_1850(tb=tb) + + # Sanity checks. + run_sanity_checks_on_outputs(tb=tb) + + # Set an appropriate index and sort conveniently. + tb = tb.format() + + # + # Save outputs. + # + # Create a new garden dataset. 
+ ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/emissions/2024-11-21/owid_co2.py b/etl/steps/data/garden/emissions/2024-11-21/owid_co2.py new file mode 100644 index 00000000000..a816d3ec30a --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-11-21/owid_co2.py @@ -0,0 +1,508 @@ +"""Garden step that combines various datasets related to greenhouse emissions and produces the OWID CO2 dataset. + +Datasets combined: +* Global Carbon Budget - Global Carbon Project. +* National contributions to climate change - Jones et al. +* Greenhouse gas emissions by sector - Climate Watch. +* Primary energy consumption - EI & EIA. + +Additionally, OWID's regions dataset, population dataset and Maddison Project Database (Bolt and van Zanden, 2023) on +GDP are included. + +""" + +import re + +import numpy as np +import pandas as pd +from owid.catalog import Dataset, Origin, Table +from structlog import get_logger + +from etl.helpers import PathFinder, create_dataset + +# Initialize logger. +log = get_logger() + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Conversion factor from tonnes to million tonnes. +TONNES_TO_MILLION_TONNES = 1e-6 + +# Select columns to use from each dataset, and how to rename them. +GCP_COLUMNS = { + "country": "country", + "year": "year", + "emissions_total": "co2", + "emissions_total_per_capita": "co2_per_capita", + "traded_emissions": "trade_co2", + "emissions_from_cement": "cement_co2", + "emissions_from_cement_per_capita": "cement_co2_per_capita", + "emissions_from_coal": "coal_co2", + "emissions_from_coal_per_capita": "coal_co2_per_capita", + "emissions_from_flaring": "flaring_co2", + "emissions_from_flaring_per_capita": "flaring_co2_per_capita", + "emissions_from_gas": "gas_co2", + "emissions_from_gas_per_capita": "gas_co2_per_capita", + "emissions_from_oil": "oil_co2", + "emissions_from_oil_per_capita": "oil_co2_per_capita", + "emissions_from_other_industry": "other_industry_co2", + "emissions_from_other_industry_per_capita": "other_co2_per_capita", + "pct_growth_emissions_total": "co2_growth_prct", + "growth_emissions_total": "co2_growth_abs", + "emissions_total_per_gdp": "co2_per_gdp", + "emissions_total_per_unit_energy": "co2_per_unit_energy", + "consumption_emissions": "consumption_co2", + "consumption_emissions_per_capita": "consumption_co2_per_capita", + "consumption_emissions_per_gdp": "consumption_co2_per_gdp", + "cumulative_emissions_total": "cumulative_co2", + "cumulative_emissions_from_cement": "cumulative_cement_co2", + "cumulative_emissions_from_coal": "cumulative_coal_co2", + "cumulative_emissions_from_flaring": "cumulative_flaring_co2", + "cumulative_emissions_from_gas": "cumulative_gas_co2", + "cumulative_emissions_from_oil": "cumulative_oil_co2", + "cumulative_emissions_from_other_industry": "cumulative_other_co2", + "pct_traded_emissions": "trade_co2_share", + "emissions_total_as_share_of_global": "share_global_co2", + "emissions_from_cement_as_share_of_global": "share_global_cement_co2", + "emissions_from_coal_as_share_of_global": "share_global_coal_co2", + "emissions_from_flaring_as_share_of_global": "share_global_flaring_co2", + "emissions_from_gas_as_share_of_global": "share_global_gas_co2", + "emissions_from_oil_as_share_of_global": "share_global_oil_co2", + "emissions_from_other_industry_as_share_of_global": "share_global_other_co2", + "cumulative_emissions_total_as_share_of_global": "share_global_cumulative_co2", + 
"cumulative_emissions_from_cement_as_share_of_global": "share_global_cumulative_cement_co2", + "cumulative_emissions_from_coal_as_share_of_global": "share_global_cumulative_coal_co2", + "cumulative_emissions_from_flaring_as_share_of_global": "share_global_cumulative_flaring_co2", + "cumulative_emissions_from_gas_as_share_of_global": "share_global_cumulative_gas_co2", + "cumulative_emissions_from_oil_as_share_of_global": "share_global_cumulative_oil_co2", + "cumulative_emissions_from_other_industry_as_share_of_global": "share_global_cumulative_other_co2", + # New variables, related to land-use change emissions. + "cumulative_emissions_from_land_use_change": "cumulative_luc_co2", + "cumulative_emissions_from_land_use_change_as_share_of_global": "share_global_cumulative_luc_co2", + "cumulative_emissions_total_including_land_use_change": "cumulative_co2_including_luc", + "cumulative_emissions_total_including_land_use_change_as_share_of_global": "share_global_cumulative_co2_including_luc", + "emissions_from_land_use_change": "land_use_change_co2", + "emissions_from_land_use_change_as_share_of_global": "share_global_luc_co2", + "emissions_from_land_use_change_per_capita": "land_use_change_co2_per_capita", + "emissions_total_including_land_use_change": "co2_including_luc", + "emissions_total_including_land_use_change_as_share_of_global": "share_global_co2_including_luc", + "emissions_total_including_land_use_change_per_capita": "co2_including_luc_per_capita", + "emissions_total_including_land_use_change_per_gdp": "co2_including_luc_per_gdp", + "emissions_total_including_land_use_change_per_unit_energy": "co2_including_luc_per_unit_energy", + "growth_emissions_total_including_land_use_change": "co2_including_luc_growth_abs", + "pct_growth_emissions_total_including_land_use_change": "co2_including_luc_growth_prct", +} +JONES_COLUMNS = { + "country": "country", + "year": "year", + "temperature_response_co2_total": "temperature_change_from_co2", + "temperature_response_ghg_total": "temperature_change_from_ghg", + "temperature_response_ch4_total": "temperature_change_from_ch4", + "temperature_response_n2o_total": "temperature_change_from_n2o", + "share_of_temperature_response_ghg_total": "share_of_temperature_change_from_ghg", + # NOTE: The following columns used to come from climate watch. But Jones et al. provides a much wider coverage, and it's more up-to-date. + "annual_emissions_ghg_fossil_co2eq": "total_ghg_excluding_lucf", + "annual_emissions_ghg_fossil_co2eq_per_capita": "ghg_excluding_lucf_per_capita", + "annual_emissions_ghg_total_co2eq": "total_ghg", + "annual_emissions_ghg_total_co2eq_per_capita": "ghg_per_capita", + "annual_emissions_ch4_total_co2eq": "methane", + "annual_emissions_ch4_total_co2eq_per_capita": "methane_per_capita", + "annual_emissions_n2o_total_co2eq": "nitrous_oxide", + "annual_emissions_n2o_total_co2eq_per_capita": "nitrous_oxide_per_capita", +} +# NOTE: All climate watch indicators now come from Jones et al. 
+# CLIMATE_WATCH_GHG_COLUMNS = { +# "country": "country", +# "year": "year", +# "total_ghg_emissions_excluding_lucf": "total_ghg_excluding_lucf", +# "total_ghg_emissions_excluding_lucf_per_capita": "ghg_excluding_lucf_per_capita", +# "total_ghg_emissions_including_lucf": "total_ghg", +# "total_ghg_emissions_including_lucf_per_capita": "ghg_per_capita", +# } +# CLIMATE_WATCH_CH4_COLUMNS = { +# "country": "country", +# "year": "year", +# "total_ch4_emissions_including_lucf": "methane", +# "total_ch4_emissions_including_lucf_per_capita": "methane_per_capita", +# } +# CLIMATE_WATCH_N2O_COLUMNS = { +# "country": "country", +# "year": "year", +# "total_n2o_emissions_including_lucf": "nitrous_oxide", +# "total_n2o_emissions_including_lucf_per_capita": "nitrous_oxide_per_capita", +# } +PRIMARY_ENERGY_COLUMNS = { + "country": "country", + "year": "year", + "primary_energy_consumption__twh": "primary_energy_consumption", + "primary_energy_consumption_per_capita__kwh": "energy_per_capita", + "primary_energy_consumption_per_gdp__kwh_per_dollar": "energy_per_gdp", +} +REGIONS_COLUMNS = { + "name": "country", + "iso_alpha3": "iso_code", +} +POPULATION_COLUMNS = { + "country": "country", + "year": "year", + "population": "population", +} +GDP_COLUMNS = { + "country": "country", + "year": "year", + "gdp": "gdp", +} + +UNITS = { + "tonnes": {"conversion": TONNES_TO_MILLION_TONNES, "new_unit": "million tonnes", "new_short_unit": "Mt"}, + "tonnes of CO₂ equivalents": { + "conversion": TONNES_TO_MILLION_TONNES, + "new_unit": "million tonnes", + "new_short_unit": "Mt", + }, +} + + +def convert_units(table: Table) -> Table: + """Convert units of table. + + Parameters + ---------- + table : Table + Data with its original units. + + Returns + ------- + Table + Data after converting units of specific columns. + + """ + table = table.copy() + # Check units and convert to more convenient ones. + for column in table.columns: + unit = table[column].metadata.unit + title = table[column].metadata.title + description_short = table[column].metadata.description or table[column].metadata.description_short + if unit in list(UNITS): + table[column] *= UNITS[unit]["conversion"] + table[column].metadata.unit = UNITS[unit]["new_unit"] + table[column].metadata.short_unit = UNITS[unit]["new_short_unit"] + table[column].metadata.title = title.replace(unit, UNITS[unit]["new_unit"]) + table[column].metadata.description_short = description_short.replace(unit, UNITS[unit]["new_unit"]) + + return table + + +def combine_tables( + tb_gcp: Table, + tb_jones: Table, + # tb_climate_watch_ghg: Table, + # tb_climate_watch_ch4: Table, + # tb_climate_watch_n2o: Table, + tb_energy: Table, + tb_gdp: Table, + tb_population: Table, + tb_regions: Table, +) -> Table: + """Combine tables. + + Parameters + ---------- + tb_gcp : Table + Global Carbon Budget table (from Global Carbon Project). + tb_jones : Table + National contributions to climate change (from Jones et al. (2023)). + # tb_climate_watch_ghg : Table + # Greenhouse gas emissions table (from Climate Watch). + # tb_climate_watch_ch4 : Table + # CH4 emissions table (from Climate Watch). + # tb_climate_watch_n2o : Table + # N2O emissions table (from Climate Watch). + tb_energy : Table + Primary energy consumption table (from BP & EIA). + tb_gdp : Table + Maddison GDP table (from GGDC). + tb_population : Table + OWID population table (from various sources). + tb_regions : Table + OWID regions table. + + Returns + ------- + combined : Table + Combined table with metadata and variables metadata. 
+ + """ + # Combine main tables (with an outer join, to gather all entities from all tables). + combined = tb_gcp.copy() + # for table in [tb_jones, tb_climate_watch_ghg, tb_climate_watch_ch4, tb_climate_watch_n2o]: + for table in [tb_jones]: + combined = combined.merge(table, on=["country", "year"], how="outer", short_name=paths.short_name) + + # Add secondary tables (with a left join, to keep only entities for which we have emissions data). + for table in [tb_energy, tb_gdp, tb_population]: + combined = combined.merge(table, on=["country", "year"], how="left") + + # Countries-regions dataset does not have a year column, so it has to be merged on country. + combined = combined.merge(tb_regions, on="country", how="left") + + # Check that there were no repetition in column names. + error = "Repeated columns in combined data." + assert len([column for column in set(combined.columns) if "_x" in column]) == 0, error + + # Adjust units. + combined = convert_units(combined) + + return combined + + +def prepare_outputs(combined: Table, ds_regions: Dataset) -> Table: + """Clean and prepare output table. + + Parameters + ---------- + combined : Table + Combined table. + ds_regions : Dataset + Regions dataset, only used to get its version. + + Returns + ------- + combined: Table + Cleaned combined table. + + """ + # Remove rows that only have nan (ignoring if country, year, iso_code, population and gdp do have data). + columns_that_must_have_data = [ + column for column in combined.columns if column not in ["country", "year", "iso_code", "population", "gdp"] + ] + combined = combined.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) + + # Add metadata to the ISO column (loaded from the regions dataset). + combined["iso_code"].m.origins = [ + Origin( + producer="International Organization for Standardization", + title="Regions", + date_published=ds_regions.version, + ) + ] + combined["iso_code"].metadata.title = "ISO code" + combined["iso_code"].metadata.description_short = "ISO 3166-1 alpha-3 three-letter country codes." + combined["iso_code"].metadata.unit = "" + + # Sanity check. + columns_with_inf = [column for column in combined.columns if len(combined[combined[column] == np.inf]) > 0] + assert len(columns_with_inf) == 0, f"Infinity values detected in columns: {columns_with_inf}" + + # Sort rows and columns conveniently. + first_columns = ["country", "year", "iso_code", "population", "gdp"] + combined = combined[first_columns + [column for column in sorted(combined.columns) if column not in first_columns]] + + # Improve table format. + combined = combined.format() + + return combined + + +def remove_details_on_demand(text: str) -> str: + # Remove references to details on demand from a text. + # Example: "This is a [description](#dod:something)." -> "This is a description." + regex = r"\(\#dod\:.*\)" + if "(#dod:" in text: + text = re.sub(regex, "", text).replace("[", "").replace("]", "") + + return text + + +def prepare_codebook(tb: Table) -> pd.DataFrame: + table = tb.reset_index() + + # Manually create an origin for the regions dataset. + regions_origin = [Origin(producer="Our World in Data", title="Regions", date_published=str(table["year"].max()))] + + # Manually edit some of the metadata fields. + table["country"].metadata.title = "Country" + table["country"].metadata.description_short = "Geographic location." 
+ table["country"].metadata.description = None + table["country"].metadata.unit = "" + table["country"].metadata.origins = regions_origin + table["year"].metadata.title = "Year" + table["year"].metadata.description_short = "Year of observation." + table["year"].metadata.description = None + table["year"].metadata.unit = "" + table["year"].metadata.origins = regions_origin + + #################################################################################################################### + if table["population"].metadata.description is None: + print("WARNING: Column population has no longer a description field. Remove this part of the code") + else: + table["population"].metadata.description = None + + #################################################################################################################### + + # Gather column names, titles, short descriptions, unit and origins from the indicators' metadata. + metadata = {"column": [], "description": [], "unit": [], "source": []} + for column in table.columns: + metadata["column"].append(column) + + if hasattr(table[column].metadata, "description") and table[column].metadata.description is not None: + print(f"WARNING: Column {column} still has a 'description' field.") + # Prepare indicator's description. + description = "" + if ( + hasattr(table[column].metadata.presentation, "title_public") + and table[column].metadata.presentation.title_public is not None + ): + description += table[column].metadata.presentation.title_public + else: + description += table[column].metadata.title + if table[column].metadata.description_short: + description += f" - {table[column].metadata.description_short}" + description = remove_details_on_demand(description) + metadata["description"].append(description) + + # Prepare indicator's unit. + if table[column].metadata.unit is None: + print(f"WARNING: Column {column} does not have a unit.") + unit = "" + else: + unit = table[column].metadata.unit + metadata["unit"].append(unit) + + # Gather unique origins of current variable. + unique_sources = [] + for origin in table[column].metadata.origins: + # Construct the source name from the origin's attribution. + # If not defined, build it using the default format "Producer - Data product (year)". + source_name = ( + origin.attribution + or f"{origin.producer} - {origin.title or origin.title_snapshot} ({origin.date_published.split('-')[0]})" + ) + + # Add url at the end of the source. + if origin.url_main: + source_name += f" [{origin.url_main}]" + + # Add the source to the list of unique sources. + if source_name not in unique_sources: + unique_sources.append(source_name) + + # Concatenate all sources. + sources_combined = "; ".join(unique_sources) + metadata["source"].append(sources_combined) + + # Create a dataframe with the gathered metadata and sort conveniently by column name. + codebook = pd.DataFrame(metadata).set_index("column").sort_index() + # For clarity, ensure column descriptions are in the same order as the columns in the data. + first_columns = ["country", "year", "iso_code", "population", "gdp"] + codebook = pd.concat([codebook.loc[first_columns], codebook.drop(first_columns, errors="raise")]).reset_index() + # Create a table with the appropriate metadata. 
+ codebook = Table(codebook).format( + keys=["column"], sort_rows=False, sort_columns=False, short_name="owid_co2_codebook" + ) + codebook_origin = [ + Origin(producer="Our World in Data", title="CO2-data codebook", date_published=str(table["year"].max())) + ] + for column in ["description", "unit", "source"]: + codebook[column].metadata.origins = codebook_origin + + return codebook + + +def sanity_check_outputs(tb: Table, tb_codebook: Table) -> None: + error = "Dataset columns should coincide with the codebook 'columns'." + assert set(tb_codebook.reset_index()["column"]) == set(tb.reset_index().columns), error + + error = "All rows in dataset should contain at least one non-NaN value." + assert not tb.isnull().all(axis=1).any(), error + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load the global carbon budget dataset from the Global Carbon Project (GCP). + ds_gcp = paths.load_dataset("global_carbon_budget") + + # Load the Jones et al. dataset on national contributions to climate change. + ds_jones = paths.load_dataset("national_contributions") + + # Load the greenhouse gas emissions by sector dataset by Climate Watch. + # ds_climate_watch = paths.load_dataset("emissions_by_sector") + + # Load the GDP dataset by GGDC Maddison. + ds_gdp = paths.load_dataset("maddison_project_database") + + # Load primary energy consumption dataset (by different sources in our 'energy' namespace). + ds_energy = paths.load_dataset("primary_energy_consumption") + + # Load population dataset. + ds_population = paths.load_dataset("population") + + # Load countries-regions dataset (required to get ISO codes). + ds_regions = paths.load_dataset("regions") + + # Gather all required tables from all datasets. + tb_gcp = ds_gcp["global_carbon_budget"] + tb_jones = ds_jones["national_contributions"] + # tb_climate_watch_ghg = ds_climate_watch["greenhouse_gas_emissions_by_sector"] + # tb_climate_watch_ch4 = ds_climate_watch["methane_emissions_by_sector"] + # tb_climate_watch_n2o = ds_climate_watch["nitrous_oxide_emissions_by_sector"] + tb_energy = ds_energy["primary_energy_consumption"] + tb_gdp = ds_gdp["maddison_project_database"] + tb_population = ds_population["population"] + tb_regions = ds_regions["regions"] + + # + # Process data. + # + # Choose required columns and rename them. + tb_gcp = tb_gcp.reset_index()[list(GCP_COLUMNS)].rename(columns=GCP_COLUMNS, errors="raise") + tb_jones = tb_jones.reset_index()[list(JONES_COLUMNS)].rename(columns=JONES_COLUMNS, errors="raise") + # tb_climate_watch_ghg = tb_climate_watch_ghg.reset_index()[list(CLIMATE_WATCH_GHG_COLUMNS)].rename( + # columns=CLIMATE_WATCH_GHG_COLUMNS, errors="raise" + # ) + # tb_climate_watch_ch4 = tb_climate_watch_ch4.reset_index()[list(CLIMATE_WATCH_CH4_COLUMNS)].rename( + # columns=CLIMATE_WATCH_CH4_COLUMNS, errors="raise" + # ) + # tb_climate_watch_n2o = tb_climate_watch_n2o.reset_index()[list(CLIMATE_WATCH_N2O_COLUMNS)].rename( + # columns=CLIMATE_WATCH_N2O_COLUMNS, errors="raise" + # ) + tb_energy = tb_energy.reset_index()[list(PRIMARY_ENERGY_COLUMNS)].rename( + columns=PRIMARY_ENERGY_COLUMNS, errors="raise" + ) + tb_gdp = tb_gdp.reset_index()[list(GDP_COLUMNS)].rename(columns=GDP_COLUMNS, errors="raise") + tb_population = tb_population.reset_index()[list(POPULATION_COLUMNS)].rename( + columns=POPULATION_COLUMNS, errors="raise" + ) + tb_regions = tb_regions.reset_index()[list(REGIONS_COLUMNS)].rename(columns=REGIONS_COLUMNS, errors="raise") + + # Combine tables. 
+ combined = combine_tables( + tb_gcp=tb_gcp, + tb_jones=tb_jones, + # tb_climate_watch_ghg=tb_climate_watch_ghg, + # tb_climate_watch_ch4=tb_climate_watch_ch4, + # tb_climate_watch_n2o=tb_climate_watch_n2o, + tb_energy=tb_energy, + tb_gdp=tb_gdp, + tb_population=tb_population, + tb_regions=tb_regions, + ) + + # Prepare output data table. + tb = prepare_outputs(combined=combined, ds_regions=ds_regions) + + # Prepare codebook. + tb_codebook = prepare_codebook(tb=tb) + + # Sanity check. + sanity_check_outputs(tb=tb, tb_codebook=tb_codebook) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb, tb_codebook], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/garden/energy/2024-11-01/photovoltaic_cost_and_capacity.meta.yml b/etl/steps/data/garden/energy/2024-11-01/photovoltaic_cost_and_capacity.meta.yml new file mode 100644 index 00000000000..3d39a1eee89 --- /dev/null +++ b/etl/steps/data/garden/energy/2024-11-01/photovoltaic_cost_and_capacity.meta.yml @@ -0,0 +1,30 @@ +definitions: + common: + processing_level: major + presentation: + topic_tags: + - Energy + +dataset: + update_period_days: 365 + +tables: + # NOTE: Some metadata fields are filled out programmatically in the garden step. + photovoltaic_cost_and_capacity: + title: Solar photovoltaic cost and capacity + variables: + cost: + title: Solar photovoltaic module price + presentation: + title_public: Solar photovoltaic module price + cost_source: + title: Source for each value of cost data + unit: '' + cumulative_capacity: + title: Solar photovoltaic cumulative capacity + presentation: + # NOTE: A title_public is needed, since a display.name is defined (it's propagated from one of the combined datasets, namely farmer_lafond_2016). + title_public: Solar photovoltaic cumulative capacity + cumulative_capacity_source: + title: Source for each value of cumulative capacity data + unit: '' diff --git a/etl/steps/data/garden/energy/2024-11-01/photovoltaic_cost_and_capacity.py b/etl/steps/data/garden/energy/2024-11-01/photovoltaic_cost_and_capacity.py new file mode 100644 index 00000000000..b5d5ae925b6 --- /dev/null +++ b/etl/steps/data/garden/energy/2024-11-01/photovoltaic_cost_and_capacity.py @@ -0,0 +1,171 @@ +"""Combine data from Nemet (2009), Farmer & Lafond (2016) and IRENA on photovoltaic cost and capacity. + +Data content: +* Nemet (2009) provides cumulative capacity data between 1975 and 2003. +* Nemet (2009) provides cost data between 1975 and 2003. +* IRENA provides cumulative capacity data from 2000 onwards. +* IRENA provides cost data from 2010 onwards. +* Farmer & Lafond (2016) provide cost data between 1980 and 2013. + +For each informed year, we need to combine these sources with the following two constraints: +* Having data from the most recent source. +* Avoid (as much as possible) having cost and capacity data on a given year from different sources. + +Therefore, for capacity data, we use Nemet (2009) between 1975 and 2003, and IRENA from 2004 onwards. +For cost data, we use Nemet (2009) between 1975 and 2003, Farmer & Lafond (2016) between 2004 and 2009, and IRENA +from 2010 onwards. + +""" + +import owid.catalog.processing as pr +from owid.catalog import Table +from owid.datautils.dataframes import combine_two_overlapping_dataframes + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current data step. 
+paths = PathFinder(__file__) + +# Conversion factors. +# IRENA costs are given in the latest year's USD, so we convert other costs to the same currency. +LATEST_YEAR = 2023 +# Convert 2004 USD and 2013 USD to LATEST_YEAR USD , using +# https://www.usinflationcalculator.com/ +USD2004_TO_USDLATEST = 1.61 +USD2013_TO_USDLATEST = 1.31 + + +def prepare_capacity_data(tb_nemet: Table, tb_irena_capacity: Table) -> Table: + # Column "previous_capacity" is equivalent to tb_nemet["yearly_capacity"].shift(1).cumsum() + # As they explain in the paper, "Following Epple et al. (1991), cumulative capacity is lagged one year to account + # for the time it takes to incorporate new techniques obtained as a result of learning from experience." + tb_nemet_capacity = tb_nemet[["year", "cost", "previous_capacity"]].rename( + columns={"previous_capacity": "cumulative_capacity"}, errors="raise" + )[["year", "cumulative_capacity"]] + # Add column of origin of the data. + tb_nemet_capacity["cumulative_capacity_source"] = "Nemet (2009)" + + # Select solar PV cumulative capacity from IRENA's dataset. + tb_irena_capacity = ( + tb_irena_capacity[tb_irena_capacity["country"] == "World"][["year", "solar_photovoltaic"]] + .rename(columns={"solar_photovoltaic": "cumulative_capacity"}, errors="raise") + .reset_index(drop=True) + ) + tb_irena_capacity["cumulative_capacity_source"] = "IRENA" + + # Combine cumulative capacity from Nemet (2009) and IRENA, prioritizing the former on overlapping years. + cumulative_capacity = ( + combine_two_overlapping_dataframes(df1=tb_nemet_capacity, df2=tb_irena_capacity, index_columns=["year"]) + .astype({"year": int}) + .sort_values("year") + .reset_index(drop=True) + ) + + # Improve metadata. + cumulative_capacity[ + "cumulative_capacity" + ].metadata.description_processing = "Photovoltaic capacity data between 1975 and 2003 has been taken from Nemet (2009). Data since 2004 has been taken from IRENA." + + # Since sources column has been manually created, it does not have metadata. Copy origins from another column. + cumulative_capacity["cumulative_capacity_source"].metadata.origins = cumulative_capacity[ + "cumulative_capacity" + ].metadata.origins.copy() + + return cumulative_capacity + + +def prepare_cost_data(tb_nemet: Table, tb_irena_cost: Table, tb_farmer_lafond: Table) -> Table: + tb_nemet = tb_nemet.copy() + tb_irena_cost = tb_irena_cost.copy() + tb_farmer_lafond = tb_farmer_lafond.copy() + + # Prepare solar photovoltaic cost data from Nemet (2009). + tb_nemet_cost = tb_nemet[["year", "cost"]].copy() + tb_nemet_cost["cost_source"] = "Nemet (2009)" + # Costs are given in "2004 USD/watt", so we need to convert them to the latest year USD. + tb_nemet_cost["cost"] *= USD2004_TO_USDLATEST + tb_nemet_cost["cost"].metadata.unit = f"constant {LATEST_YEAR} US$ per watt" + + # Prepare solar photovoltaic cost data from Farmer & Lafond (2016). + tb_farmer_lafond = ( + tb_farmer_lafond[["year", "photovoltaics"]] + .dropna() + .reset_index(drop=True) + .rename(columns={"photovoltaics": "cost"}, errors="raise") + ) + tb_farmer_lafond["cost_source"] = "Farmer & Lafond (2016)" + # Costs are given in "2013 USD/Wp", so we need to convert them to the latest year USD. + tb_farmer_lafond["cost"] *= USD2013_TO_USDLATEST + tb_farmer_lafond["cost"].metadata.unit = f"constant {LATEST_YEAR} US$ per watt" + + # Prepare solar photovoltaic cost data from IRENA. 
+ tb_irena_cost = tb_irena_cost.drop(columns="country", errors="raise") + + tb_irena_cost["cost_source"] = "IRENA" + # Costs are given in latest year "USD/W", so we do not need to correct them. + + # Combine Nemet (2009) and Farmer & Lafond (2016), prioritizing the former. + combined = combine_two_overlapping_dataframes(df1=tb_nemet_cost, df2=tb_farmer_lafond, index_columns="year") + + # Combine the previous with IRENA, prioritizing the latter. + combined = combine_two_overlapping_dataframes(df1=tb_irena_cost, df2=combined, index_columns="year") + + # Improve metadata. + combined[ + "cost" + ].metadata.description_processing = f"Photovoltaic cost data between 1975 and 2003 has been taken from Nemet (2009), between 2004 and 2009 from Farmer & Lafond (2016), and since 2010 from IRENA. Prices from Nemet (2009) and Farmer & Lafond (2016) have been converted to {LATEST_YEAR} US$ using: https://www.usinflationcalculator.com/" + + # Since sources column has been manually created, it does not have metadata. Copy origins from another column. + combined["cost_source"].metadata.origins = combined["cost"].metadata.origins.copy() + + return combined + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load Nemet (2009) dataset from garden and read its main table. + ds_nemet = paths.load_dataset("nemet_2009") + tb_nemet = ds_nemet["nemet_2009"].reset_index() + + # Load Farmer & Lafond (2016) dataset from garden and read its main table. + ds_farmer_lafond = paths.load_dataset("farmer_lafond_2016") + tb_farmer_lafond = ds_farmer_lafond["farmer_lafond_2016"].reset_index() + + # Load IRENA dataset on capacity from garden and read its main table. + ds_irena_capacity = paths.load_dataset("renewable_capacity_statistics") + tb_irena_capacity = ds_irena_capacity["renewable_capacity_statistics"].reset_index() + + # Load IRENA dataset on cost from garden and read its main table. + ds_irena_cost = paths.load_dataset("renewable_power_generation_costs") + tb_irena_cost = ds_irena_cost["solar_photovoltaic_module_prices"].reset_index() + + # + # Process data. + # + # Create a table of cumulative solar photovoltaic capacity, by combining Nemet (2009) and IRENA data. + cumulative_capacity = prepare_capacity_data(tb_nemet=tb_nemet, tb_irena_capacity=tb_irena_capacity) + + # Sanity check. + error = "IRENA data has changed, prices may need to be deflated to the latest year." + assert tb_irena_cost["year"].max() == LATEST_YEAR, error + + # Create a table of solar photovoltaic cost, by combining Nemet (2009), Farmer & Lafond (2016) and IRENA data. + cost = prepare_cost_data(tb_nemet=tb_nemet, tb_irena_cost=tb_irena_cost, tb_farmer_lafond=tb_farmer_lafond) + + # Combine capacity and cost data. + tb_combined = pr.merge(cost, cumulative_capacity, on="year", how="outer") + + # Add column for region. + tb_combined = tb_combined.assign(**{"country": "World"}) + + # Format table conveniently. + tb_combined = tb_combined.format(short_name=paths.short_name) + + # + # Save outputs. 
+    #
+    # Create a new garden dataset.
+    ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_combined], check_variables_metadata=True)
+    ds_garden.save()
diff --git a/etl/steps/data/garden/energy/2024-11-15/photovoltaic_cost_and_capacity.meta.yml b/etl/steps/data/garden/energy/2024-11-15/photovoltaic_cost_and_capacity.meta.yml
new file mode 100644
index 00000000000..3d39a1eee89
--- /dev/null
+++ b/etl/steps/data/garden/energy/2024-11-15/photovoltaic_cost_and_capacity.meta.yml
@@ -0,0 +1,30 @@
+definitions:
+  common:
+    processing_level: major
+    presentation:
+      topic_tags:
+        - Energy
+
+dataset:
+  update_period_days: 365
+
+tables:
+  # NOTE: Some metadata fields are filled out programmatically in the garden step.
+  photovoltaic_cost_and_capacity:
+    title: Solar photovoltaic cost and capacity
+    variables:
+      cost:
+        title: Solar photovoltaic module price
+        presentation:
+          title_public: Solar photovoltaic module price
+      cost_source:
+        title: Source for each value of cost data
+        unit: ''
+      cumulative_capacity:
+        title: Solar photovoltaic cumulative capacity
+        presentation:
+          # NOTE: A title_public is needed, since a display.name is defined (it's propagated from one of the combined datasets, namely farmer_lafond_2016).
+          title_public: Solar photovoltaic cumulative capacity
+      cumulative_capacity_source:
+        title: Source for each value of cumulative capacity data
+        unit: ''
diff --git a/etl/steps/data/garden/energy/2024-11-15/photovoltaic_cost_and_capacity.py b/etl/steps/data/garden/energy/2024-11-15/photovoltaic_cost_and_capacity.py
new file mode 100644
index 00000000000..b5d5ae925b6
--- /dev/null
+++ b/etl/steps/data/garden/energy/2024-11-15/photovoltaic_cost_and_capacity.py
@@ -0,0 +1,171 @@
+"""Combine data from Nemet (2009), Farmer & Lafond (2016) and IRENA on photovoltaic cost and capacity.
+
+Data content:
+* Nemet (2009) provides cumulative capacity data between 1975 and 2003.
+* Nemet (2009) provides cost data between 1975 and 2003.
+* IRENA provides cumulative capacity data from 2000 onwards.
+* IRENA provides cost data from 2010 onwards.
+* Farmer & Lafond (2016) provide cost data between 1980 and 2013.
+
+For each year with data, we combine these sources under two constraints:
+* Use data from the most recent source available.
+* Avoid (as much as possible) mixing cost and capacity data from different sources in the same year.
+
+Therefore, for capacity data, we use Nemet (2009) between 1975 and 2003, and IRENA from 2004 onwards.
+For cost data, we use Nemet (2009) between 1975 and 2003, Farmer & Lafond (2016) between 2004 and 2009, and IRENA
+from 2010 onwards.
+
+"""
+
+import owid.catalog.processing as pr
+from owid.catalog import Table
+from owid.datautils.dataframes import combine_two_overlapping_dataframes
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current data step.
+paths = PathFinder(__file__)
+
+# Conversion factors.
+# IRENA costs are given in the latest year's USD, so we convert other costs to the same currency.
+LATEST_YEAR = 2023
+# Convert 2004 USD and 2013 USD to LATEST_YEAR USD, using
+# https://www.usinflationcalculator.com/
+USD2004_TO_USDLATEST = 1.61
+USD2013_TO_USDLATEST = 1.31
+
+
+def prepare_capacity_data(tb_nemet: Table, tb_irena_capacity: Table) -> Table:
+    # Column "previous_capacity" is equivalent to tb_nemet["yearly_capacity"].shift(1).cumsum()
+    # As they explain in the paper, "Following Epple et al. (1991), cumulative capacity is lagged one year to account
+    # for the time it takes to incorporate new techniques obtained as a result of learning from experience."
+    tb_nemet_capacity = tb_nemet[["year", "previous_capacity"]].rename(
+        columns={"previous_capacity": "cumulative_capacity"}, errors="raise"
+    )
+    # Add column of origin of the data.
+    tb_nemet_capacity["cumulative_capacity_source"] = "Nemet (2009)"
+
+    # Select solar PV cumulative capacity from IRENA's dataset.
+    tb_irena_capacity = (
+        tb_irena_capacity[tb_irena_capacity["country"] == "World"][["year", "solar_photovoltaic"]]
+        .rename(columns={"solar_photovoltaic": "cumulative_capacity"}, errors="raise")
+        .reset_index(drop=True)
+    )
+    tb_irena_capacity["cumulative_capacity_source"] = "IRENA"
+
+    # Combine cumulative capacity from Nemet (2009) and IRENA, prioritizing the former on overlapping years.
+    cumulative_capacity = (
+        combine_two_overlapping_dataframes(df1=tb_nemet_capacity, df2=tb_irena_capacity, index_columns=["year"])
+        .astype({"year": int})
+        .sort_values("year")
+        .reset_index(drop=True)
+    )
+
+    # Improve metadata.
+    cumulative_capacity[
+        "cumulative_capacity"
+    ].metadata.description_processing = "Photovoltaic capacity data between 1975 and 2003 has been taken from Nemet (2009). Data since 2004 has been taken from IRENA."
+
+    # Since the sources column has been created manually, it does not have metadata. Copy origins from another column.
+    cumulative_capacity["cumulative_capacity_source"].metadata.origins = cumulative_capacity[
+        "cumulative_capacity"
+    ].metadata.origins.copy()
+
+    return cumulative_capacity
+
+
+def prepare_cost_data(tb_nemet: Table, tb_irena_cost: Table, tb_farmer_lafond: Table) -> Table:
+    tb_nemet = tb_nemet.copy()
+    tb_irena_cost = tb_irena_cost.copy()
+    tb_farmer_lafond = tb_farmer_lafond.copy()
+
+    # Prepare solar photovoltaic cost data from Nemet (2009).
+    tb_nemet_cost = tb_nemet[["year", "cost"]].copy()
+    tb_nemet_cost["cost_source"] = "Nemet (2009)"
+    # Costs are given in "2004 USD/watt", so we need to convert them to the latest year USD.
+    tb_nemet_cost["cost"] *= USD2004_TO_USDLATEST
+    tb_nemet_cost["cost"].metadata.unit = f"constant {LATEST_YEAR} US$ per watt"
+
+    # Prepare solar photovoltaic cost data from Farmer & Lafond (2016).
+    tb_farmer_lafond = (
+        tb_farmer_lafond[["year", "photovoltaics"]]
+        .dropna()
+        .reset_index(drop=True)
+        .rename(columns={"photovoltaics": "cost"}, errors="raise")
+    )
+    tb_farmer_lafond["cost_source"] = "Farmer & Lafond (2016)"
+    # Costs are given in "2013 USD/Wp", so we need to convert them to the latest year USD.
+    tb_farmer_lafond["cost"] *= USD2013_TO_USDLATEST
+    tb_farmer_lafond["cost"].metadata.unit = f"constant {LATEST_YEAR} US$ per watt"
+
+    # Prepare solar photovoltaic cost data from IRENA.
+    tb_irena_cost = tb_irena_cost.drop(columns="country", errors="raise")
+
+    tb_irena_cost["cost_source"] = "IRENA"
+    # Costs are already given in the latest year's USD per watt, so no conversion is needed.
+
+    # Combine Nemet (2009) and Farmer & Lafond (2016), prioritizing the former.
+    combined = combine_two_overlapping_dataframes(df1=tb_nemet_cost, df2=tb_farmer_lafond, index_columns="year")
+
+    # Combine the result with IRENA, prioritizing the latter.
+    combined = combine_two_overlapping_dataframes(df1=tb_irena_cost, df2=combined, index_columns="year")
+
+    # Improve metadata.
+    combined[
+        "cost"
+    ].metadata.description_processing = f"Photovoltaic cost data between 1975 and 2003 has been taken from Nemet (2009), between 2004 and 2009 from Farmer & Lafond (2016), and since 2010 from IRENA. Prices from Nemet (2009) and Farmer & Lafond (2016) have been converted to {LATEST_YEAR} US$ using: https://www.usinflationcalculator.com/"
+
+    # Since the sources column has been created manually, it does not have metadata. Copy origins from another column.
+    combined["cost_source"].metadata.origins = combined["cost"].metadata.origins.copy()
+
+    return combined
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load data.
+    #
+    # Load Nemet (2009) dataset from garden and read its main table.
+    ds_nemet = paths.load_dataset("nemet_2009")
+    tb_nemet = ds_nemet["nemet_2009"].reset_index()
+
+    # Load Farmer & Lafond (2016) dataset from garden and read its main table.
+    ds_farmer_lafond = paths.load_dataset("farmer_lafond_2016")
+    tb_farmer_lafond = ds_farmer_lafond["farmer_lafond_2016"].reset_index()
+
+    # Load IRENA dataset on capacity from garden and read its main table.
+    ds_irena_capacity = paths.load_dataset("renewable_capacity_statistics")
+    tb_irena_capacity = ds_irena_capacity["renewable_capacity_statistics"].reset_index()
+
+    # Load IRENA dataset on cost from garden and read its main table.
+    ds_irena_cost = paths.load_dataset("renewable_power_generation_costs")
+    tb_irena_cost = ds_irena_cost["solar_photovoltaic_module_prices"].reset_index()
+
+    #
+    # Process data.
+    #
+    # Create a table of cumulative solar photovoltaic capacity by combining Nemet (2009) and IRENA data.
+    cumulative_capacity = prepare_capacity_data(tb_nemet=tb_nemet, tb_irena_capacity=tb_irena_capacity)
+
+    # Sanity check.
+    error = "IRENA data has changed, prices may need to be deflated to the latest year."
+    assert tb_irena_cost["year"].max() == LATEST_YEAR, error
+
+    # Create a table of solar photovoltaic cost by combining Nemet (2009), Farmer & Lafond (2016) and IRENA data.
+    cost = prepare_cost_data(tb_nemet=tb_nemet, tb_irena_cost=tb_irena_cost, tb_farmer_lafond=tb_farmer_lafond)
+
+    # Combine capacity and cost data.
+    tb_combined = pr.merge(cost, cumulative_capacity, on="year", how="outer")
+
+    # Add a country column (all data refers to the World).
+    tb_combined = tb_combined.assign(**{"country": "World"})
+
+    # Format table conveniently.
+    tb_combined = tb_combined.format(short_name=paths.short_name)
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset.
+    ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_combined], check_variables_metadata=True)
+    ds_garden.save()
diff --git a/etl/steps/data/garden/energy/2024-11-20/energy_prices.meta.yml b/etl/steps/data/garden/energy/2024-11-20/energy_prices.meta.yml
new file mode 100644
index 00000000000..6a934bc8bd5
--- /dev/null
+++ b/etl/steps/data/garden/energy/2024-11-20/energy_prices.meta.yml
@@ -0,0 +1,11 @@
+definitions:
+  common:
+    processing_level: major
+    presentation:
+      topic_tags:
+        - Energy
+
+dataset:
+  title: European Energy Prices
+  update_period_days: 365
+
diff --git a/etl/steps/data/garden/energy/2024-11-20/energy_prices.py b/etl/steps/data/garden/energy/2024-11-20/energy_prices.py
new file mode 100644
index 00000000000..f40c3b4fb44
--- /dev/null
+++ b/etl/steps/data/garden/energy/2024-11-20/energy_prices.py
@@ -0,0 +1,61 @@
+"""Compilation of energy prices datasets.
+
+"""
+import owid.catalog.processing as pr
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current data step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load data.
+    #
+    # Load Eurostat data on European gas and electricity prices.
+    ds_eurostat = paths.load_dataset("gas_and_electricity_prices")
+    tb_eurostat_euro = ds_eurostat.read("gas_and_electricity_price_components_euro_flat")
+    tb_eurostat_pps = ds_eurostat.read("gas_and_electricity_price_components_pps_flat")
+
+    # Load Ember data on European wholesale electricity prices.
+    ds_ember = paths.load_dataset("european_wholesale_electricity_prices")
+    tb_ember_monthly = ds_ember.read("european_wholesale_electricity_prices_monthly")
+    tb_ember_annual = ds_ember.read("european_wholesale_electricity_prices_annual")
+
+    #
+    # Process data.
+    #
+    # Rename columns in all tables to have consistent dimensions.
+    tb_eurostat_euro = tb_eurostat_euro.rename(
+        columns={
+            column: f"annual_{column}" for column in tb_eurostat_euro.columns if column not in ["country", "year"]
+        },
+        errors="raise",
+    )
+    tb_eurostat_pps = tb_eurostat_pps.rename(
+        columns={column: f"annual_{column}" for column in tb_eurostat_pps.columns if column not in ["country", "year"]},
+        errors="raise",
+    )
+    tb_ember_monthly = tb_ember_monthly.rename(
+        columns={"price": "monthly_electricity_all_wholesale_euro"}, errors="raise"
+    )
+    tb_ember_annual = tb_ember_annual.rename(columns={"price": "annual_electricity_all_wholesale_euro"}, errors="raise")
+
+    # Create a combined annual table.
+    tb_annual = pr.multi_merge(
+        tables=[tb_eurostat_euro, tb_eurostat_pps, tb_ember_annual], on=["country", "year"], how="outer"
+    )
+    tb_annual = tb_annual.format(short_name="energy_prices_annual")
+
+    # Create a combined monthly table.
+    # For now, only Ember has monthly data.
+    tb_monthly = tb_ember_monthly.copy()
+    tb_monthly = tb_monthly.format(keys=["country", "date"], short_name="energy_prices_monthly")
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset.
+    ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_annual, tb_monthly], check_variables_metadata=True)
+    ds_garden.save()
diff --git a/etl/steps/data/garden/eth/2023-03-15/ethnic_power_relations.py b/etl/steps/data/garden/eth/2023-03-15/ethnic_power_relations.py
index 2fc6db04590..9eda9a1728a 100644
--- a/etl/steps/data/garden/eth/2023-03-15/ethnic_power_relations.py
+++ b/etl/steps/data/garden/eth/2023-03-15/ethnic_power_relations.py
@@ -251,10 +251,7 @@ def run(dest_dir: str) -> None:
     ds_meadow: Dataset = paths.load_dependency("growup")
 
     # Read table from meadow dataset.
-    tb_meadow = ds_meadow["growup"]
-
-    # Create a dataframe with data from the table.
-    df = pd.DataFrame(tb_meadow).reset_index()
+    df = pd.DataFrame(ds_meadow.read("growup", safe_types=False))
 
     #
     # Process data.
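The photovoltaic step above resolves overlapping years with combine_two_overlapping_dataframes, where df1 takes priority wherever both inputs report a value. Below is a minimal sketch of that behavior, runnable outside the ETL; the function and its keyword arguments come from the diff above, while the toy tables and numbers are hypothetical.

import pandas as pd

from owid.datautils.dataframes import combine_two_overlapping_dataframes

# Hypothetical inputs: both sources report a cost for 2003, so the year overlaps.
nemet = pd.DataFrame({"year": [2002, 2003], "cost": [10.0, 9.0]})
farmer = pd.DataFrame({"year": [2003, 2004], "cost": [8.5, 8.0]})

# df1 is prioritized on overlapping years: 2003 keeps the value from `nemet`,
# while 2004, which `nemet` does not cover, is filled from `farmer`.
combined = combine_two_overlapping_dataframes(df1=nemet, df2=farmer, index_columns="year")
print(combined.sort_values("year"))
# Expected (schematically): 2002 -> 10.0, 2003 -> 9.0, 2004 -> 8.0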
diff --git a/etl/steps/data/garden/eurostat/2024-11-05/gas_and_electricity_prices.countries.json b/etl/steps/data/garden/eurostat/2024-11-05/gas_and_electricity_prices.countries.json new file mode 100644 index 00000000000..f11ceb2730a --- /dev/null +++ b/etl/steps/data/garden/eurostat/2024-11-05/gas_and_electricity_prices.countries.json @@ -0,0 +1,44 @@ +{ + "BE": "Belgium", + "BG": "Bulgaria", + "CZ": "Czechia", + "DK": "Denmark", + "DE": "Germany", + "EE": "Estonia", + "IE": "Ireland", + "EL": "Greece", + "ES": "Spain", + "FR": "France", + "HR": "Croatia", + "IT": "Italy", + "CY": "Cyprus", + "LV": "Latvia", + "LT": "Lithuania", + "LU": "Luxembourg", + "HU": "Hungary", + "MT": "Malta", + "NL": "Netherlands", + "AT": "Austria", + "PL": "Poland", + "PT": "Portugal", + "RO": "Romania", + "SI": "Slovenia", + "SK": "Slovakia", + "FI": "Finland", + "SE": "Sweden", + "IS": "Iceland", + "LI": "Liechtenstein", + "NO": "Norway", + "BA": "Bosnia and Herzegovina", + "ME": "Montenegro", + "MD": "Moldova", + "GE": "Georgia", + "MK": "North Macedonia", + "AL": "Albania", + "RS": "Serbia", + "TR": "Turkey", + "XK": "Kosovo", + "EU27_2020": "European Union (27)", + "UA": "Ukraine", + "UK": "United Kingdom" +} diff --git a/etl/steps/data/garden/eurostat/2024-11-05/gas_and_electricity_prices.excluded_countries.json b/etl/steps/data/garden/eurostat/2024-11-05/gas_and_electricity_prices.excluded_countries.json new file mode 100644 index 00000000000..7745db9ff44 --- /dev/null +++ b/etl/steps/data/garden/eurostat/2024-11-05/gas_and_electricity_prices.excluded_countries.json @@ -0,0 +1 @@ +["EA"] diff --git a/etl/steps/data/garden/eurostat/2024-11-05/gas_and_electricity_prices.meta.yml b/etl/steps/data/garden/eurostat/2024-11-05/gas_and_electricity_prices.meta.yml new file mode 100644 index 00000000000..5b27ffbfd55 --- /dev/null +++ b/etl/steps/data/garden/eurostat/2024-11-05/gas_and_electricity_prices.meta.yml @@ -0,0 +1,1334 @@ +definitions: + common: + presentation: + topic_tags: + - Energy + processing_level: major + + # Definitions of consumer type: + description_key_household_consumer: |- + A household consumer of gas or electricity is a residential user who consumes energy for domestic purposes like heating, cooking, lighting, and appliances. + description_key_non_household_consumer: |- + A non-household consumer of gas or electricity is a business or organization using energy for purposes like industry, services, offices, or agriculture. + # Definitions of consumer bands: + description_key_all_consumer_bands: |- + Prices are a weighted average across all consumption bands, from low to high energy consumers, based on the relative consumption shares reported by each country. + # Definitions of price components: + # Eurostat's definitions can be found: + # * For gas: https://ec.europa.eu/eurostat/cache/metadata/en/nrg_pc_202_sims.htm + # * For electricity: https://ec.europa.eu/eurostat/cache/metadata/en/nrg_pc_204_sims.htm + # * Gas - Energy and supply: commodity price for natural gas paid by the supplier or the price of natural gas at the point of entry into the transmission system, including, if applicable, the following end-user costs: storage costs plus costs relating to the sale of natural gas to final customers. + # * Electricity - Energy and supply: generation, aggregation, balancing energy, supplied energy costs, customer services, after-sales management and other supply costs. 
+ # * Gas/electricity - Network cost: transmission and distribution tariffs, transmission and distribution losses, network costs, after-sale service costs, system service costs, and meter rental and metering costs. + # * Gas/electricity - Value added taxes (VAT): as defined in Council Directive 2006/112/EC. + # * Gas/electricity - Renewable taxes: taxes, fees, levies or charges relating to the promotion of renewable energy sources, energy efficiency and CHP generation. + # * Gas - Capacity taxes: taxes, fees, levies or charges relating to strategic stockpiles, capacity payments and energy security; taxes on natural gas distribution; stranded costs and levies on financing energy regulatory authorities or market and system operators. + # * Electricity - Capacity taxes: Taxes, fees, levies or charges relating to capacity payments, energy security and generation adequacy; taxes on coal industry restructuring; taxes on electricity distribution; stranded costs and levies on financing energy regulatory authorities or market and system operators. + # * Gas/electricity - Environmental taxes: taxes, fees, levies or charges relating to air quality and for other environmental purposes; taxes on emissions of CO2 or other greenhouse gases. This component includes the excise duties. + # * Gas/electricity - All other taxes: taxes, fees, levies or charges not covered by any of the previous four categories: support for district heating; local or regional fiscal charges; island compensation; concession fees relating to licences and fees for the occupation of land and public or private property by networks or other devices. + # * Electricity - Nuclear taxes: taxes, fees, levies or charges relating to the nuclear sector, including nuclear decommissioning, inspections and fees for nuclear installations. + description_key_gas_component_energy_and_supply: |- + "Energy and supply" is the cost of natural gas, including the commodity price paid by the supplier or the price at the point of entry into the transmission system. It also includes, where applicable, end-user costs such as storage and expenses related to the sale of gas to final customers. + description_key_gas_component_network_cost: &description_key_gas_component_network_cost |- + "Network costs" include the tariffs for transmission and distribution, costs from transmission and distribution losses, and system-related expenses such as after-sale services, system services, meter rental, and metering. + description_key_gas_component_taxes_fees_levies_and_charges: &description_key_gas_component_taxes_fees_levies_and_charges |- + "Taxes, fees, levies, and charges" include value-added tax (VAT), renewable taxes, capacity taxes, environmental taxes, and all other taxes, fees, levies, or charges. + description_key_gas_component_vat: &description_key_gas_component_vat |- + "Value-added tax (VAT)" is a consumption tax applied to the final price of gas or electricity, as defined in Council Directive 2006/112/EC. + description_key_gas_component_renewable_taxes: &description_key_gas_component_renewable_taxes |- + "Renewable taxes" are taxes, fees, levies, or charges that promote renewable energy sources, energy efficiency, and combined heat and power (CHP) generation. + description_key_gas_component_capacity_taxes: |- + "Capacity taxes" are charges related to maintaining energy security and the gas distribution system. 
They include taxes or fees for strategic gas stockpiles, payments to ensure sufficient supply capacity, costs for supporting energy regulatory authorities or system operators, and levies to cover past infrastructure investments (stranded costs). + description_key_gas_component_environmental_taxes: &description_key_gas_component_environmental_taxes |- + "Environmental taxes" are charges aimed at addressing environmental concerns, such as improving air quality and reducing greenhouse gas emissions (e.g., CO₂ taxes). This category also includes excise duties—specific taxes applied to goods or activities like fuels and energy use, which contribute to environmental impacts. + description_key_gas_component_other_taxes: &description_key_gas_component_other_taxes |- + "All other taxes" include fees, levies, or charges not covered by other tax components. They may relate to district heating support, local or regional fiscal charges, island compensation, concession fees for licenses, or fees for occupying land or property with networks or other devices. + description_key_gas_component_total_price_including_taxes: &description_key_gas_component_total_price_including_taxes |- + "Total price including taxes" is the sum of all price components, including energy and supply, network costs, taxes, fees, levies and charges. + description_key_electricity_component_energy_and_supply: |- + "Energy and supply" is the cost of electricity generation, aggregation, balancing, and other supply-related activities, including end-user costs such as customer services and after-sales management. + description_key_electricity_component_network_cost: *description_key_gas_component_network_cost + description_key_electricity_component_vat: *description_key_gas_component_vat + description_key_electricity_component_renewable_taxes: *description_key_gas_component_renewable_taxes + description_key_electricity_component_capacity_taxes: |- + "Capacity taxes" are charges aimed at ensuring energy security and sufficient electricity supply. They include fees for maintaining generation capacity, supporting energy regulatory authorities or system operators, restructuring the coal industry, and improving electricity distribution. They may also cover costs from past infrastructure investments (stranded costs). + description_key_electricity_component_environmental_taxes: *description_key_gas_component_environmental_taxes + description_key_electricity_component_other_taxes: *description_key_gas_component_other_taxes + description_key_electricity_component_nuclear_taxes: |- + "Nuclear taxes" are charges related to the nuclear sector, including decommissioning, inspections, and fees for nuclear installations. + description_key_electricity_component_total_price_including_taxes: *description_key_gas_component_total_price_including_taxes + description_key_electricity_component_taxes_fees_levies_and_charges: *description_key_gas_component_taxes_fees_levies_and_charges + # Definitions of price levels: + # Original Eurostat definitions of price levels: + # Level 1 prices ("X_TAX"): Prices excluding taxes and levies. + # Level 2 prices ("X_VAT"): Prices excluding VAT and other recoverable taxes and levies. + # Level 3 prices ("I_TAX"): Prices including all taxes and levies. + description_key_price_excluding_taxes: |- + Prices represent the base cost of energy, excluding all taxes, levies, and VAT. These prices reflect the pure costs of energy and supply, network, and other market-related services. 
+ description_key_price_excluding_vat: |- + Prices include the base cost of energy and non-recoverable taxes and levies. They exclude VAT and other recoverable taxes. + description_key_price_including_all_taxes: |- + Prices represent the total price paid by end consumers, including all taxes, levies, and VAT. + + +dataset: + update_period_days: 365 + + +tables: + gas_and_electricity_prices: + variables: + price_euro: + title: Price + unit: euro + short_unit: "€" + description_short: Energy price in euro. + price_pps: + title: Price (purchasing power) + unit: purchasing power standard + short_unit: PPS + description_short: Energy price in purchasing power standard. + gas_and_electricity_price_components_euro_flat: + title: Gas and electricity price components in Europe + common: + description_short: Price components are given in euros per [megawatt-hour](#dod:watt-hours). They are not adjusted for inflation or differences in living costs between countries. + unit: 'current euros per megawatt-hour' + short_unit: "€/MWh" + variables: + electricity_household_capacity_taxes_euro: + title: Electricity price component (euros) for household consumers - Capacity taxes + display: + name: Capacity taxes + presentation: + title_public: Electricity price component for household consumers - Capacity taxes + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_capacity_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_capacity_taxes_euro: + title: Electricity price component (euros) for non-household consumers - Capacity taxes + display: + name: Capacity taxes + presentation: + title_public: Electricity price component for non-household consumers - Capacity taxes + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_capacity_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_capacity_taxes_allowances_euro: + title: Electricity price component (euros) for household consumers - Capacity taxes allowances + display: + name: Capacity taxes allowances + presentation: + title_public: Electricity price component for household consumers - Capacity taxes allowances + description_key: + - "{definitions.description_key_household_consumer}" + # - "{definitions.description_key_electricity_component_capacity_taxes_allowances}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_energy_and_supply_euro: + title: Electricity price component (euros) for household consumers - Energy and supply + display: + name: Energy and supply + presentation: + title_public: Electricity price component for household consumers - Energy and supply + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_energy_and_supply}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_energy_and_supply_euro: + title: Electricity price component (euros) for non-household consumers - Energy and supply + display: + name: Energy and supply + presentation: + title_public: Electricity price component for non-household consumers - Energy and supply + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_energy_and_supply}" + - "{definitions.description_key_all_consumer_bands}" + + 
electricity_household_environmental_taxes_euro: + title: Electricity price component (euros) for household consumers - Environmental taxes + display: + name: Environmental taxes + presentation: + title_public: Electricity price component for household consumers - Environmental taxes + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_environmental_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_environmental_taxes_euro: + title: Electricity price component (euros) for non-household consumers - Environmental taxes + display: + name: Environmental taxes + presentation: + title_public: Electricity price component for non-household consumers - Environmental taxes + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_environmental_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_environmental_taxes_allowance_euro: + title: Electricity price component (euros) for household consumers - Environmental taxes allowance + display: + name: Environmental taxes allowance + presentation: + title_public: Electricity price component for household consumers - Environmental taxes allowance + description_key: + - "{definitions.description_key_household_consumer}" + # - "{definitions.description_key_electricity_component_environmental_taxes_allowance}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_network_costs_euro: + title: Electricity price component (euros) for household consumers - Network costs + display: + name: Network costs + presentation: + title_public: Electricity price component for household consumers - Network costs + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_network_cost}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_network_costs_euro: + title: Electricity price component (euros) for non-household consumers - Network costs + display: + name: Network costs + presentation: + title_public: Electricity price component for non-household consumers - Network costs + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_network_cost}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_nuclear_taxes_euro: + title: Electricity price component (euros) for household consumers - Nuclear taxes + display: + name: Nuclear taxes + presentation: + title_public: Electricity price component for household consumers - Nuclear taxes + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_nuclear_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_nuclear_taxes_euro: + title: Electricity price component (euros) for non-household consumers - Nuclear taxes + display: + name: Nuclear taxes + presentation: + title_public: Electricity price component for non-household consumers - Nuclear taxes + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_nuclear_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_nuclear_taxes_allowance_euro: + title: Electricity price component (euros) for household 
consumers - Nuclear taxes allowance + display: + name: Nuclear taxes allowance + presentation: + title_public: Electricity price component for household consumers - Nuclear taxes allowance + description_key: + - "{definitions.description_key_household_consumer}" + # - "{definitions.description_key_electricity_component_nuclear_taxes_allowance}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_other_euro: + title: Electricity price component (euros) for household consumers - Other + display: + name: Other + presentation: + title_public: Electricity price component for household consumers - Other + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_other_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_other_euro: + title: Electricity price component (euros) for non-household consumers - Other + display: + name: Other + presentation: + title_public: Electricity price component for non-household consumers - Other + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_other_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_other_allowance_euro: + title: Electricity price component (euros) for household consumers - Other allowance + display: + name: Other allowance + presentation: + title_public: Electricity price component for household consumers - Other allowance + description_key: + - "{definitions.description_key_household_consumer}" + # - "{definitions.description_key_electricity_component_other_allowance}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_renewable_taxes_euro: + title: Electricity price component (euros) for household consumers - Renewable taxes + display: + name: Renewable taxes + presentation: + title_public: Electricity price component for household consumers - Renewable taxes + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_renewable_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_renewable_taxes_euro: + title: Electricity price component (euros) for non-household consumers - Renewable taxes + display: + name: Renewable taxes + presentation: + title_public: Electricity price component for non-household consumers - Renewable taxes + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_renewable_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_renewable_taxes_allowance_euro: + title: Electricity price component (euros) for household consumers - Renewable taxes allowance + display: + name: Renewable taxes allowance + presentation: + title_public: Electricity price component for household consumers - Renewable taxes allowance + description_key: + - "{definitions.description_key_household_consumer}" + # - "{definitions.description_key_electricity_component_renewable_taxes_allowance}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_taxes_fees_levies_and_charges_euro: + title: Electricity price component (euros) for household consumers - Taxes, fees, levies, and charges + display: + name: Taxes, fees, levies, and charges + presentation: + title_public: Electricity price component for household consumers - 
Taxes, fees, levies, and charges + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_taxes_fees_levies_and_charges}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_taxes_fees_levies_and_charges_euro: + title: Electricity price component (euros) for non-household consumers - Taxes, fees, levies, and charges + display: + name: Taxes, fees, levies, and charges + presentation: + title_public: Electricity price component for non-household consumers - Taxes, fees, levies, and charges + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_taxes_fees_levies_and_charges}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_value_added_tax_vat_euro: + title: Electricity price component (euros) for household consumers - Value added tax (VAT) + display: + name: Value added tax (VAT) + presentation: + title_public: Electricity price component for household consumers - Value added tax (VAT) + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_vat}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_value_added_tax_vat_euro: + title: Electricity price component (euros) for non-household consumers - Value added tax (VAT) + display: + name: Value added tax (VAT) + presentation: + title_public: Electricity price component for non-household consumers - Value added tax (VAT) + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_vat}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_capacity_taxes_euro: + title: Gas price component (euros) for household consumers - Capacity taxes + display: + name: Capacity taxes + presentation: + title_public: Gas price component for household consumers - Capacity taxes + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_capacity_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_energy_and_supply_euro: + title: Gas price component (euros) for household consumers - Energy and supply + display: + name: Energy and supply + presentation: + title_public: Gas price component for household consumers - Energy and supply + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_energy_and_supply}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_environmental_taxes_euro: + title: Gas price component (euros) for household consumers - Environmental taxes + display: + name: Environmental taxes + presentation: + title_public: Gas price component for household consumers - Environmental taxes + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_environmental_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_network_costs_euro: + title: Gas price component (euros) for household consumers - Network costs + display: + name: Network costs + presentation: + title_public: Gas price component for household consumers - Network costs + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_network_cost}" + - 
"{definitions.description_key_all_consumer_bands}" + + gas_household_other_euro: + title: Gas price component (euros) for household consumers - Other + display: + name: Other + presentation: + title_public: Gas price component for household consumers - Other + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_other_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_renewable_taxes_euro: + title: Gas price component (euros) for household consumers - Renewable taxes + display: + name: Renewable taxes + presentation: + title_public: Gas price component for household consumers - Renewable taxes + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_renewable_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_taxes_fees_levies_and_charges_euro: + title: Gas price component (euros) for household consumers - Taxes, fees, levies, and charges + display: + name: Taxes, fees, levies, and charges + presentation: + title_public: Gas price component for household consumers - Taxes, fees, levies, and charges + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_taxes_fees_levies_and_charges}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_value_added_tax_vat_euro: + title: Gas price component (euros) for household consumers - Value added tax (VAT) + display: + name: Value added tax (VAT) + presentation: + title_public: Gas price component for household consumers - Value added tax (VAT) + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_vat}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_capacity_taxes_euro: + title: Gas price component (euros) for non-household consumers - Capacity taxes + display: + name: Capacity taxes + presentation: + title_public: Gas price component for non-household consumers - Capacity taxes + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_capacity_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_energy_and_supply_euro: + title: Gas price component (euros) for non-household consumers - Energy and supply + display: + name: Energy and supply + presentation: + title_public: Gas price component for non-household consumers - Energy and supply + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_energy_and_supply}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_environmental_taxes_euro: + title: Gas price component (euros) for non-household consumers - Environmental taxes + display: + name: Environmental taxes + presentation: + title_public: Gas price component for non-household consumers - Environmental taxes + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_environmental_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_network_costs_euro: + title: Gas price component (euros) for non-household consumers - Network costs + display: + name: Network costs + presentation: + title_public: Gas price component for non-household consumers - Network costs + description_key: + - 
"{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_network_cost}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_other_euro: + title: Gas price component (euros) for non-household consumers - Other + display: + name: Other + presentation: + title_public: Gas price component for non-household consumers - Other + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_other_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_renewable_taxes_euro: + title: Gas price component (euros) for non-household consumers - Renewable taxes + display: + name: Renewable taxes + presentation: + title_public: Gas price component for non-household consumers - Renewable taxes + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_renewable_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_taxes_fees_levies_and_charges_euro: + title: Gas price component (euros) for non-household consumers - Taxes, fees, levies, and charges + display: + name: Taxes, fees, levies, and charges + presentation: + title_public: Gas price component for non-household consumers - Taxes, fees, levies, and charges + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_taxes_fees_levies_and_charges}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_value_added_tax_vat_euro: + title: Gas price component (euros) for non-household consumers - Value added tax (VAT) + display: + name: Value added tax (VAT) + presentation: + title_public: Gas price component for non-household consumers - Value added tax (VAT) + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_vat}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_taxes_fees_levies_and_charges_allowance_euro: + title: Electricity price component (euros) for household consumers - Taxes, fees, levies, and charges allowance + display: + name: Taxes, fees, levies, and charges allowance + presentation: + title_public: Electricity price component for household consumers - Taxes, fees, levies, and charges allowance + description_key: + - "{definitions.description_key_household_consumer}" + # - "{definitions.description_key_electricity_component_taxes_fees_levies_and_charges_allowance}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_total_price_including_taxes_euro: + title: Electricity price component (euros) for household consumers - Total price including taxes + display: + name: Total price including taxes + presentation: + title_public: Electricity price component for household consumers - Total price including taxes + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_total_price_including_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_total_price_including_taxes_euro: + title: Electricity price component (euros) for non-household consumers - Total price including taxes + display: + name: Total price including taxes + presentation: + title_public: Electricity price component for non-household consumers - Total price including taxes + description_key: + - 
"{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_total_price_including_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_total_price_including_taxes_euro: + title: Gas price component (euros) for household consumers - Total price including taxes + display: + name: Total price including taxes + presentation: + title_public: Gas price component for household consumers - Total price including taxes + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_total_price_including_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_total_price_including_taxes_euro: + title: Gas price component (euros) for non-household consumers - Total price including taxes + display: + name: Total price including taxes + presentation: + title_public: Gas price component for non-household consumers - Total price including taxes + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_total_price_including_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_and_electricity_price_components_pps_flat: + title: Gas and electricity price components in Europe (PPS) + common: + description_short: Price components are given in purchasing power standard (PPS) per [megawatt-hour](#dod:watt-hours). + unit: "purchasing power standard per megawatt-hour" + short_unit: "PPS/MWh" + variables: + electricity_household_capacity_taxes_pps: + title: Electricity price component (PPS) for household consumers - Capacity taxes + display: + name: Capacity taxes + presentation: + title_public: Electricity price component (PPS) for household consumers - Capacity taxes + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_capacity_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_capacity_taxes_pps: + title: Electricity price component (PPS) for non-household consumers - Capacity taxes + display: + name: Capacity taxes + presentation: + title_public: Electricity price component (PPS) for non-household consumers - Capacity taxes + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_capacity_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_capacity_taxes_allowances_pps: + title: Electricity price component (PPS) for household consumers - Capacity taxes allowances + display: + name: Capacity taxes allowances + presentation: + title_public: Electricity price component (PPS) for household consumers - Capacity taxes allowances + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + # - "{definitions.description_key_electricity_component_capacity_taxes_allowances}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_energy_and_supply_pps: + title: Electricity price component (PPS) for household consumers - Energy and supply + display: + name: Energy and supply + presentation: + title_public: Electricity price component (PPS) for household consumers - Energy and supply + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - 
"{definitions.description_key_electricity_component_energy_and_supply}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_energy_and_supply_pps: + title: Electricity price component (PPS) for non-household consumers - Energy and supply + display: + name: Energy and supply + presentation: + title_public: Electricity price component (PPS) for non-household consumers - Energy and supply + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_energy_and_supply}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_environmental_taxes_pps: + title: Electricity price component (PPS) for household consumers - Environmental taxes + display: + name: Environmental taxes + presentation: + title_public: Electricity price component (PPS) for household consumers - Environmental taxes + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_environmental_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_environmental_taxes_pps: + title: Electricity price component (PPS) for non-household consumers - Environmental taxes + display: + name: Environmental taxes + presentation: + title_public: Electricity price component (PPS) for non-household consumers - Environmental taxes + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_environmental_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_environmental_taxes_allowance_pps: + title: Electricity price component (PPS) for household consumers - Environmental taxes allowance + display: + name: Environmental taxes allowance + presentation: + title_public: Electricity price component (PPS) for household consumers - Environmental taxes allowance + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + # - "{definitions.description_key_electricity_component_environmental_taxes_allowance}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_network_costs_pps: + title: Electricity price component (PPS) for household consumers - Network costs + display: + name: Network costs + presentation: + title_public: Electricity price component (PPS) for household consumers - Network costs + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_network_cost}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_network_costs_pps: + title: Electricity price component (PPS) for non-household consumers - Network costs + display: + name: Network costs + presentation: + title_public: Electricity price component (PPS) for non-household consumers - Network costs + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_network_cost}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_nuclear_taxes_pps: + title: Electricity price component (PPS) for household consumers - Nuclear taxes + display: + name: Nuclear taxes + presentation: + title_public: Electricity price component (PPS) for household consumers - Nuclear taxes + title_variant: 
PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_nuclear_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_nuclear_taxes_pps: + title: Electricity price component (PPS) for non-household consumers - Nuclear taxes + display: + name: Nuclear taxes + presentation: + title_public: Electricity price component (PPS) for non-household consumers - Nuclear taxes + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_nuclear_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_nuclear_taxes_allowance_pps: + title: Electricity price component (PPS) for household consumers - Nuclear taxes allowance + display: + name: Nuclear taxes allowance + presentation: + title_public: Electricity price component (PPS) for household consumers - Nuclear taxes allowance + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + # - "{definitions.description_key_electricity_component_nuclear_taxes_allowance}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_other_pps: + title: Electricity price component (PPS) for household consumers - Other + display: + name: Other + presentation: + title_public: Electricity price component (PPS) for household consumers - Other + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_other_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_other_pps: + title: Electricity price component (PPS) for non-household consumers - Other + display: + name: Other + presentation: + title_public: Electricity price component (PPS) for non-household consumers - Other + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_other_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_other_allowance_pps: + title: Electricity price component (PPS) for household consumers - Other allowance + display: + name: Other allowance + presentation: + title_public: Electricity price component (PPS) for household consumers - Other allowance + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + # - "{definitions.description_key_electricity_component_other_allowance}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_renewable_taxes_pps: + title: Electricity price component (PPS) for household consumers - Renewable taxes + display: + name: Renewable taxes + presentation: + title_public: Electricity price component (PPS) for household consumers - Renewable taxes + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_renewable_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_renewable_taxes_pps: + title: Electricity price component (PPS) for non-household consumers - Renewable taxes + display: + name: Renewable taxes + presentation: + title_public: Electricity price component (PPS) for non-household consumers - Renewable taxes + title_variant: PPS + description_key: + - 
"{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_renewable_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_renewable_taxes_allowance_pps: + title: Electricity price component (PPS) for household consumers - Renewable taxes allowance + display: + name: Renewable taxes allowance + presentation: + title_public: Electricity price component (PPS) for household consumers - Renewable taxes allowance + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + # - "{definitions.description_key_electricity_component_renewable_taxes_allowance}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_taxes_fees_levies_and_charges_pps: + title: Electricity price component (PPS) for household consumers - Taxes, fees, levies, and charges + display: + name: Taxes, fees, levies, and charges + presentation: + title_public: Electricity price component (PPS) for household consumers - Taxes, fees, levies, and charges + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_taxes_fees_levies_and_charges}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_taxes_fees_levies_and_charges_pps: + title: Electricity price component (PPS) for non-household consumers - Taxes, fees, levies, and charges + display: + name: Taxes, fees, levies, and charges + presentation: + title_public: Electricity price component (PPS) for non-household consumers - Taxes, fees, levies, and charges + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_taxes_fees_levies_and_charges}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_value_added_tax_vat_pps: + title: Electricity price component (PPS) for household consumers - Value added tax (VAT) + display: + name: Value added tax (VAT) + presentation: + title_public: Electricity price component (PPS) for household consumers - Value added tax (VAT) + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_vat}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_value_added_tax_vat_pps: + title: Electricity price component (PPS) for non-household consumers - Value added tax (VAT) + display: + name: Value added tax (VAT) + presentation: + title_public: Electricity price component (PPS) for non-household consumers - Value added tax (VAT) + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_vat}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_capacity_taxes_pps: + title: Gas price component (PPS) for household consumers - Capacity taxes + display: + name: Capacity taxes + presentation: + title_public: Gas price component (PPS) for household consumers - Capacity taxes + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_capacity_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_energy_and_supply_pps: + title: Gas price component (PPS) for household consumers - Energy and supply + display: + name: Energy and 
supply + presentation: + title_public: Gas price component (PPS) for household consumers - Energy and supply + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_energy_and_supply}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_environmental_taxes_pps: + title: Gas price component (PPS) for household consumers - Environmental taxes + display: + name: Environmental taxes + presentation: + title_public: Gas price component (PPS) for household consumers - Environmental taxes + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_environmental_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_network_costs_pps: + title: Gas price component (PPS) for household consumers - Network costs + display: + name: Network costs + presentation: + title_public: Gas price component (PPS) for household consumers - Network costs + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_network_cost}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_other_pps: + title: Gas price component (PPS) for household consumers - Other + display: + name: Other + presentation: + title_public: Gas price component (PPS) for household consumers - Other + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_other_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_renewable_taxes_pps: + title: Gas price component (PPS) for household consumers - Renewable taxes + display: + name: Renewable taxes + presentation: + title_public: Gas price component (PPS) for household consumers - Renewable taxes + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_renewable_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_taxes_fees_levies_and_charges_pps: + title: Gas price component (PPS) for household consumers - Taxes, fees, levies, and charges + display: + name: Taxes, fees, levies, and charges + presentation: + title_public: Gas price component (PPS) for household consumers - Taxes, fees, levies, and charges + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_taxes_fees_levies_and_charges}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_value_added_tax_vat_pps: + title: Gas price component (PPS) for household consumers - Value added tax (VAT) + display: + name: Value added tax (VAT) + presentation: + title_public: Gas price component (PPS) for household consumers - Value added tax (VAT) + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_vat}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_capacity_taxes_pps: + title: Gas price component (PPS) for non-household consumers - Capacity taxes + display: + name: Capacity taxes + presentation: + title_public: Gas price component (PPS) for non-household consumers - Capacity taxes + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - 
"{definitions.description_key_gas_component_capacity_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_energy_and_supply_pps: + title: Gas price component (PPS) for non-household consumers - Energy and supply + display: + name: Energy and supply + presentation: + title_public: Gas price component (PPS) for non-household consumers - Energy and supply + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_energy_and_supply}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_environmental_taxes_pps: + title: Gas price component (PPS) for non-household consumers - Environmental taxes + display: + name: Environmental taxes + presentation: + title_public: Gas price component (PPS) for non-household consumers - Environmental taxes + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_environmental_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_network_costs_pps: + title: Gas price component (PPS) for non-household consumers - Network costs + display: + name: Network costs + presentation: + title_public: Gas price component (PPS) for non-household consumers - Network costs + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_network_cost}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_other_pps: + title: Gas price component (PPS) for non-household consumers - Other + display: + name: Other + presentation: + title_public: Gas price component (PPS) for non-household consumers - Other + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_other_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_renewable_taxes_pps: + title: Gas price component (PPS) for non-household consumers - Renewable taxes + display: + name: Renewable taxes + presentation: + title_public: Gas price component (PPS) for non-household consumers - Renewable taxes + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_renewable_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_taxes_fees_levies_and_charges_pps: + title: Gas price component (PPS) for non-household consumers - Taxes, fees, levies, and charges + display: + name: Taxes, fees, levies, and charges + presentation: + title_public: Gas price component (PPS) for non-household consumers - Taxes, fees, levies, and charges + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_taxes_fees_levies_and_charges}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_value_added_tax_vat_pps: + title: Gas price component (PPS) for non-household consumers - Value added tax (VAT) + display: + name: Value added tax (VAT) + presentation: + title_public: Gas price component (PPS) for non-household consumers - Value added tax (VAT) + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_vat}" + - 
"{definitions.description_key_all_consumer_bands}" + + electricity_household_taxes_fees_levies_and_charges_allowance_pps: + title: Electricity price component (PPS) for household consumers - Taxes, fees, levies, and charges allowance + display: + name: Taxes, fees, levies, and charges allowance + presentation: + title_public: Electricity price component (PPS) for household consumers - Taxes, fees, levies, and charges allowance + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + # - "{definitions.description_key_electricity_component_taxes_fees_levies_and_charges_allowance}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_household_total_price_including_taxes_pps: + title: Electricity price component (PPS) for household consumers - Total price including taxes + display: + name: Total price including taxes + presentation: + title_public: Electricity price component (PPS) for household consumers - Total price including taxes + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_electricity_component_total_price_including_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + electricity_non_household_total_price_including_taxes_pps: + title: Electricity price component (PPS) for non-household consumers - Total price including taxes + display: + name: Total price including taxes + presentation: + title_public: Electricity price component (PPS) for non-household consumers - Total price including taxes + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_electricity_component_total_price_including_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_household_total_price_including_taxes_pps: + title: Gas price component (PPS) for household consumers - Total price including taxes + display: + name: Total price including taxes + presentation: + title_public: Gas price component (PPS) for household consumers - Total price including taxes + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_gas_component_total_price_including_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_non_household_total_price_including_taxes_pps: + title: Gas price component (PPS) for non-household consumers - Total price including taxes + display: + name: Total price including taxes + presentation: + title_public: Gas price component (PPS) for non-household consumers - Total price including taxes + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_gas_component_total_price_including_taxes}" + - "{definitions.description_key_all_consumer_bands}" + + gas_and_electricity_prices_euro_flat: + title: Gas and electricity prices in Europe + common: + description_short: Prices are given in euros per [megawatt-hour](#dod:watt-hours). They are not adjusted for inflation or differences in living costs between countries. 
+ unit: "current euros per megawatt-hour" + short_unit: "€/MWh" + variables: + electricity_household_all_taxes_and_levies_included_euro: + title: Electricity price (euros) for household consumers - All taxes and levies included + presentation: + title_public: Electricity price for household consumers - All taxes and levies included + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_including_all_taxes}" + electricity_household_excluding_taxes_and_levies_euro: + title: Electricity price (euros) for household consumers - Excluding taxes and levies + presentation: + title_public: Electricity price for household consumers - Excluding taxes and levies + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_excluding_taxes}" + electricity_household_excluding_vat_and_other_recoverable_taxes_and_levies_euro: + title: Electricity price (euros) for household consumers - Excluding VAT and other recoverable taxes and levies + presentation: + title_public: Electricity price for household consumers - Excluding VAT and other recoverable taxes and levies + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_excluding_vat}" + electricity_non_household_all_taxes_and_levies_included_euro: + title: Electricity price (euros) for non-household consumers - All taxes and levies included + presentation: + title_public: Electricity price for non-household consumers - All taxes and levies included + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_including_all_taxes}" + electricity_non_household_excluding_taxes_and_levies_euro: + title: Electricity price (euros) for non-household consumers - Excluding taxes and levies + presentation: + title_public: Electricity price for non-household consumers - Excluding taxes and levies + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_excluding_taxes}" + electricity_non_household_excluding_vat_and_other_recoverable_taxes_and_levies_euro: + title: Electricity price (euros) for non-household consumers - Excluding VAT and other recoverable taxes and levies + presentation: + title_public: Electricity price for non-household consumers - Excluding VAT and other recoverable taxes and levies + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_excluding_vat}" + gas_household_all_taxes_and_levies_included_euro: + title: Gas price (euros) for household consumers - All taxes and levies included + presentation: + title_public: Gas price for household consumers - All taxes and levies included + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_including_all_taxes}" + gas_household_excluding_taxes_and_levies_euro: + title: Gas price (euros) for household consumers - Excluding taxes and levies + presentation: + title_public: Gas price for household consumers - Excluding taxes and 
levies + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_excluding_taxes}" + gas_household_excluding_vat_and_other_recoverable_taxes_and_levies_euro: + title: Gas price (euros) for household consumers - Excluding VAT and other recoverable taxes and levies + presentation: + title_public: Gas price for household consumers - Excluding VAT and other recoverable taxes and levies + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_excluding_vat}" + gas_non_household_all_taxes_and_levies_included_euro: + title: Gas price (euros) for non-household consumers - All taxes and levies included + presentation: + title_public: Gas price for non-household consumers - All taxes and levies included + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_including_all_taxes}" + gas_non_household_excluding_taxes_and_levies_euro: + title: Gas price (euros) for non-household consumers - Excluding taxes and levies + presentation: + title_public: Gas price for non-household consumers - Excluding taxes and levies + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_excluding_taxes}" + gas_non_household_excluding_vat_and_other_recoverable_taxes_and_levies_euro: + title: Gas price (euros) for non-household consumers - Excluding VAT and other recoverable taxes and levies + presentation: + title_public: Gas price for non-household consumers - Excluding VAT and other recoverable taxes and levies + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_excluding_vat}" + gas_and_electricity_prices_pps_flat: + title: Gas and electricity prices in Europe (PPS) + common: + description_short: Prices are given in purchasing power standard (PPS) per [megawatt-hour](#dod:watt-hours). 
+ unit: "purchasing power standard per megawatt-hour" + short_unit: "PPS/MWh" + variables: + electricity_household_all_taxes_and_levies_included_pps: + title: Electricity price (PPS) for household consumers - All taxes and levies included + presentation: + title_public: Electricity price for household consumers - All taxes and levies included + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_including_all_taxes}" + electricity_household_excluding_taxes_and_levies_pps: + title: Electricity price (PPS) for household consumers - Excluding taxes and levies + presentation: + title_public: Electricity price for household consumers - Excluding taxes and levies + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_excluding_taxes}" + electricity_household_excluding_vat_and_other_recoverable_taxes_and_levies_pps: + title: Electricity price (PPS) for household consumers - Excluding VAT and other recoverable taxes and levies + presentation: + title_public: Electricity price for household consumers - Excluding VAT and other recoverable taxes and levies + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_excluding_vat}" + electricity_non_household_all_taxes_and_levies_included_pps: + title: Electricity price (PPS) for non-household consumers - All taxes and levies included + presentation: + title_public: Electricity price for non-household consumers - All taxes and levies included + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_including_all_taxes}" + electricity_non_household_excluding_taxes_and_levies_pps: + title: Electricity price (PPS) for non-household consumers - Excluding taxes and levies + presentation: + title_public: Electricity price for non-household consumers - Excluding taxes and levies + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_excluding_taxes}" + electricity_non_household_excluding_vat_and_other_recoverable_taxes_and_levies_pps: + title: Electricity price (PPS) for non-household consumers - Excluding VAT and other recoverable taxes and levies + presentation: + title_public: Electricity price for non-household consumers - Excluding VAT and other recoverable taxes and levies + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_excluding_vat}" + gas_household_all_taxes_and_levies_included_pps: + title: Gas price (PPS) for household consumers - All taxes and levies included + presentation: + title_public: Gas price for household consumers - All taxes and levies included + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_including_all_taxes}" + gas_household_excluding_taxes_and_levies_pps: + title: Gas price (PPS) for 
household consumers - Excluding taxes and levies + presentation: + title_public: Gas price for household consumers - Excluding taxes and levies + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_excluding_taxes}" + gas_household_excluding_vat_and_other_recoverable_taxes_and_levies_pps: + title: Gas price (PPS) for household consumers - Excluding VAT and other recoverable taxes and levies + presentation: + title_public: Gas price for household consumers - Excluding VAT and other recoverable taxes and levies + title_variant: PPS + description_key: + - "{definitions.description_key_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_excluding_vat}" + gas_non_household_all_taxes_and_levies_included_pps: + title: Gas price (PPS) for non-household consumers - All taxes and levies included + presentation: + title_public: Gas price for non-household consumers - All taxes and levies included + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_including_all_taxes}" + gas_non_household_excluding_taxes_and_levies_pps: + title: Gas price (PPS) for non-household consumers - Excluding taxes and levies + presentation: + title_public: Gas price for non-household consumers - Excluding taxes and levies + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_excluding_taxes}" + gas_non_household_excluding_vat_and_other_recoverable_taxes_and_levies_pps: + title: Gas price (PPS) for non-household consumers - Excluding VAT and other recoverable taxes and levies + presentation: + title_public: Gas price for non-household consumers - Excluding VAT and other recoverable taxes and levies + title_variant: PPS + description_key: + - "{definitions.description_key_non_household_consumer}" + - "{definitions.description_key_all_consumer_bands}" + - "{definitions.description_key_price_excluding_vat}" diff --git a/etl/steps/data/garden/eurostat/2024-11-05/gas_and_electricity_prices.py b/etl/steps/data/garden/eurostat/2024-11-05/gas_and_electricity_prices.py new file mode 100644 index 00000000000..afc793eaabb --- /dev/null +++ b/etl/steps/data/garden/eurostat/2024-11-05/gas_and_electricity_prices.py @@ -0,0 +1,895 @@ +"""Load a meadow dataset and create a garden dataset.""" +from typing import Tuple + +import owid.catalog.processing as pr +import pandas as pd +import plotly.express as px +from owid.catalog import Table +from owid.datautils.dataframes import map_series + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Dataset codes to select, and their corresponding names. +DATASET_CODES_AND_NAMES = { + #################################################################################################################### + # Gas and electricity prices. + # NOTE: Prices are given per semester.
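+ # For example, bi-annual data is reported as separate "2023-S1" and "2023-S2" observations; prepare_inputs (defined below) maps these to April 1st and October 1st dates, respectively.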
+ "nrg_pc_202": "Gas prices for household consumers", # bi-annual data (from 2007) + "nrg_pc_203": "Gas prices for non-household consumers", # bi-annual data (from 2007) + "nrg_pc_204": "Electricity prices for household consumers", # bi-annual data (from 2007) + "nrg_pc_205": "Electricity prices for non-household consumers", # bi-annual data (from 2007) + #################################################################################################################### + # Gas and electricity prices components. + "nrg_pc_202_c": "Gas prices components for household consumers", # annual data (from 2007) + "nrg_pc_203_c": "Gas prices components for non-household consumers", # annual data (from 2007) + "nrg_pc_204_c": "Electricity prices components for household consumers", # annual data (from 2007) + "nrg_pc_205_c": "Electricity prices components for non-household consumers", # annual data (from 2007) + #################################################################################################################### + # Historical data. + # NOTE: For now I think we will have to ignore historical data. + # I doesn't have a band for total price. Instead, it has different consumption bands (defined by "consom"). + # This field is a bit problematic. + # The same value, e.g. "4141050" has different definitions for electricity ("Households - Da (annual consumption: 600 kWh)") and for gas ("Households - D1 (annual consumption: 8.37 GJ)"). + # The fact that the same value is used for different things is inconvenient, but not the main problem. + # The main problem is that we would need to figure out how to properly aggregate these values to get totals (meanwhile current data comes with totals). + # Additionally, historical data is disaggregated in "domestic" and "industrial", whereas current data is split in "households" and "non-households". + # "consom": {} + # "nrg_pc_202_h": "Gas prices for domestic consumers", # bi-annual data (until 2007) + # "nrg_pc_203_h": "Gas prices for industrial consumers", # bi-annual data (until 2007) + # "nrg_pc_204_h": "Electricity prices for domestic consumers", # bi-annual data (until 2007) + # "nrg_pc_205_h": "Electricity prices for industrial consumers", # bi-annual data (until 2007) + # "nrg_pc_206_h": "Electricity marker prices", # bi-annual data (until 2007) + #################################################################################################################### + # Share for transmission and distribution in the network cost for gas and electricity. + # NOTE: Decide if we could use the following. + # "nrg_pc_206": "Share for transmission and distribution in the network cost for gas and electricity", # annual data (from 2007) + #################################################################################################################### + # The following are consumption volumes of electricity by consumption bands. + # It doesn't give the relative consumption of each semester. If I understand correctly, it gives the percentage consumption of each band in the total consumption of the year. + # "nrg_pc_202_v": "Gas consumption volumes for households", # annual data (from 2007) + # "nrg_pc_203_v": "Gas consumption volumes for non-households", # annual data (from 2007) + # "nrg_pc_204_v": "Electricity consumption volumes for households", # annual data (from 2007) + # "nrg_pc_205_v": "Electricity consumption volumes for non-households", # annual data (from 2007) +} + +# Columns to keep and how to rename them. 
+COLUMNS = { + "nrg_cons": "consumption_band", + "unit": "energy_unit", + "tax": "price_level", + "currency": "currency", + "geo": "country", + "time": "year", + "dataset_code": "dataset_code", + "nrg_prc": "price_component", + "value": "value", +} + +# Mappings of indexes. +# The definitions are copied from (replace [DATASET_CODE] with the dataset code): +# https://ec.europa.eu/eurostat/databrowser/view/[DATASET_CODE]/default/table?lang=en&category=nrg.nrg_price.nrg_pc +INDEXES_MAPPING = { + # Currencies. + "currency": { + "EUR": "Euro", + # Purchasing Power Standard + "PPS": "PPS", + "NAC": "National currency", + "NAT": "National (former) currency", + }, + # Flags (found right next to the value, as a string). + # NOTE: Flag definitions are right below the data table on that page. + "flag": { + "e": "estimated", + "c": "confidential", + "d": "definition differs", + "b": "break in time series", + "p": "provisional", + "u": "low reliability", + "cd": "confidential, definition differs", + # NOTE: I couldn't find the meaning of the following flag. + # It happens for "Electricity prices for non-household consumers" for Cyprus in 2024 (for MWH_GE150000), and all values are zero. + "n": "unknown flag", + }, + # Price levels. + "price_level": { + # All taxes and levies included + "I_TAX": "All taxes and levies included", + # Excluding taxes and levies + # NOTE: This value gives a baseline price for electricity before any additional costs imposed by taxes or fees are added. It represents the net price of electricity. + "X_TAX": "Excluding taxes and levies", + # Excluding value-added tax (VAT) and other recoverable taxes and levies + "X_VAT": "Excluding VAT and other recoverable taxes and levies", + }, + # Consumption bands. + # NOTE: This is only relevant for non-historical data. + "consumption_band": { + # Consumption bands for "Gas prices for household consumers" and "Gas prices components for household consumers": + # Consumption of GJ - all bands + "TOT_GJ": "All bands", + # Consumption less than 20 GJ - band D1 + "GJ_LT20": "<20GJ", + # Consumption from 20 GJ to 199 GJ - band D2 + "GJ20-199": "20-199GJ", + # Consumption 200 GJ or over - band D3 + "GJ_GE200": ">=200GJ", + ################################################################################################################ + # Consumption bands for "Gas prices for non-household consumers" and "Gas prices components for non-household consumers": + # 'TOT_GJ': "All bands", # Already defined above.
+ # Consumption less than 1 000 GJ - band I1 + "GJ_LT1000": "<1000GJ", + # Consumption from 1 000 GJ to 9 999 GJ - band I2 + "GJ1000-9999": "1000-9999GJ", + # Consumption from 10 000 GJ to 99 999 GJ - band I3 + "GJ10000-99999": "10000-99999GJ", + # Consumption from 100 000 GJ to 999 999 GJ - band I4 + "GJ100000-999999": "100000-999999GJ", + # Consumption from 1 000 000 GJ to 3 999 999 GJ - band I5 + "GJ1000000-3999999": "1000000-3999999GJ", + # Consumption 4 000 000 GJ or over - band I6 + "GJ_GE4000000": ">=4000000GJ", + ################################################################################################################ + # Consumption bands for "Electricity prices for household consumers" and "Electricity prices components for household consumers": + # Consumption of kWh - all bands + "TOT_KWH": "All bands", + # Consumption less than 1 000 kWh - band DA + "KWH_LT1000": "<1000kWh", + # Consumption from 1 000 kWh to 2 499 kWh - band DB + "KWH1000-2499": "1000-2499kWh", + # Consumption from 2 500 kWh to 4 999 kWh - band DC + "KWH2500-4999": "2500-4999kWh", + # Consumption from 5 000 kWh to 14 999 kWh - band DD + "KWH5000-14999": "5000-14999kWh", + # Consumption 15 000 kWh or over - band DE + "KWH_GE15000": ">=15000kWh", + # NOTE: In the electricity components dataset, there is an additional band, which contains *LE* but in the metadata it seems to correspond to greater or equal, band DE, so it must be a typo in the band name. + # Consumption 15 000 kWh or over - band DE + "KWH_LE15000": ">=15000kWh", + ################################################################################################################ + # Consumption bands for "Electricity prices for non-household consumers" and "Electricity prices components for non-household consumers": + # Consumption of kWh - all bands + # "TOT_KWH": "All bands", # Already defined above. + # Consumption less than 20 MWh - band IA + "MWH_LT20": "<20MWh", + # Consumption from 20 MWh to 499 MWh - band IB + "MWH20-499": "20-499MWh", + # Consumption from 500 MWh to 1 999 MWh - band IC + "MWH500-1999": "500-1999MWh", + # Consumption from 2 000 MWh to 19 999 MWh - band ID + "MWH2000-19999": "2000-19999MWh", + # Consumption from 20 000 MWh to 69 999 MWh - band IE + "MWH20000-69999": "20000-69999MWh", + # Consumption from 70 000 MWh to 149 999 MWh - band IF + "MWH70000-149999": "70000-149999MWh", + # Consumption 150 000 MWh or over - band IG + "MWH_GE150000": ">=150000MWh", + # NOTE: In the electricity components dataset, there is an additional band: + # Consumption 149 999 MWh or less - bands IA-IF + "MWH_LE149999": "<=149999MWh", + #################################################################################################################### + }, + # Energy price components.
+ "price_component": { + # Gas prices components for household and non-household consumers + # Energy and supply + "NRG_SUP": "Energy and supply", + # Network costs + "NETC": "Network costs", + # Taxes, fees, levies and charges + "TAX_FEE_LEV_CHRG": "Taxes, fees, levies, and charges", + # Value added tax (VAT) + "VAT": "Value added tax (VAT)", + # Renewable taxes + "TAX_RNW": "Renewable taxes", + # Capacity taxes + "TAX_CAP": "Capacity taxes", + # Environmental taxes + "TAX_ENV": "Environmental taxes", + # Renewable taxes allowance + "TAX_RNW_ALLOW": "Renewable taxes allowance", + # Capacity taxes allowances + "TAX_CAP_ALLOW": "Capacity taxes allowances", + # Environmental taxes allowance + "TAX_ENV_ALLOW": "Environmental taxes allowance", + # Other allowance + "ALLOW_OTH": "Other allowance", + # Other + "OTH": "Other", + # Electricity prices components for household and non-household consumers + # All the above, plus the additional: + # Nuclear taxes + "TAX_NUC": "Nuclear taxes", + # Nuclear taxes allowance + "TAX_NUC_ALLOW": "Nuclear taxes allowance", + # Taxes, fees, levies and charges allowance + "TAX_FEE_LEV_CHRG_ALLOW": "Taxes, fees, levies, and charges allowance", + # From the metadata page (https://ec.europa.eu/eurostat/cache/metadata/en/nrg_pc_204_sims.htm), these are the components: + # * Energy and supply: generation, aggregation, balancing energy, supplied energy costs, customer services, after-sales management and other supply costs. + # * Network cost: transmission and distribution tariffs, transmission and distribution losses, network costs, after-sale service costs, system service costs, and meter rental and metering costs. + # * Value added taxes (VAT): as defined in Council Directive 2006/112/EC. + # * Renewable taxes: taxes, fees, levies or charges relating to the promotion of renewable energy sources, energy efficiency and CHP generation. + # * Capacity taxes: Taxes, fees, levies or charges relating to capacity payments, energy security and generation adequacy; taxes on coal industry restructuring; taxes on electricity distribution; stranded costs and levies on financing energy regulatory authorities or market and system operators. + # * Environmental taxes: taxes, fees, levies or charges relating to air quality and for other environmental purposes; taxes on emissions of CO2 or other greenhouse gases. This component includes the excise duties. + # * Nuclear taxes: taxes, fees, levies or charges relating to the nuclear sector, including nuclear decommissioning, inspections and fees for nuclear installations. + # * All other taxes: taxes, fees, levies or charges not covered by any of the previous five categories: support for district heating; local or regional fiscal charges; island compensation; concession fees relating to licences and fees for the occupation of land and public or private property by networks or other devices. + }, + # Energy units. + "energy_unit": { + # Gigajoule (gross calorific value - GCV) + "GJ_GCV": "GJ", + # Kilowatt-hour + "KWH": "kWh", + # The following is used in consumption volumes datasets. + # "PC": "Percentage", + }, +} + +# Dataset codes for prices and components. 
+DATASET_CODES_PRICES = ["nrg_pc_202", "nrg_pc_203", "nrg_pc_204", "nrg_pc_205"] +DATASET_CODES_COMPONENTS = ["nrg_pc_202_c", "nrg_pc_203_c", "nrg_pc_204_c", "nrg_pc_205_c"] +DATASET_CODE_TO_ENERGY_SOURCE = { + "nrg_pc_202": "Gas", + "nrg_pc_203": "Gas", + "nrg_pc_204": "Electricity", + "nrg_pc_205": "Electricity", + "nrg_pc_202_c": "Gas", + "nrg_pc_203_c": "Gas", + "nrg_pc_204_c": "Electricity", + "nrg_pc_205_c": "Electricity", +} +DATASET_CODE_TO_CONSUMER_TYPE_MAPPING = { + "nrg_pc_202": "Household", + "nrg_pc_203": "Non-household", + "nrg_pc_204": "Household", + "nrg_pc_205": "Non-household", + "nrg_pc_202_c": "Household", + "nrg_pc_203_c": "Non-household", + "nrg_pc_204_c": "Household", + "nrg_pc_205_c": "Non-household", +} + + +# The following components need to be present in the prices components datasets of a country-year-dataset-currency, otherwise its data will not be included. +MANDATORY_PRICE_COMPONENTS = [ + "Energy and supply", + "Network costs", + "Taxes, fees, levies, and charges", +] + +# List of components that add up to the total price. +# NOTE: See find_best_combination_of_components to understand how this choice was made. +COMPONENTS_THAT_ADD_UP_TO_TOTAL = ["Energy and supply", "Network costs", "Taxes, fees, levies, and charges"] + +# Label to use for the calculated total price based on the sum of the main components. +COMPONENTS_TOTAL_PRICE_LABEL = "Total price, including taxes" + + +def sanity_check_inputs(tb: Table) -> None: + # Ensure all relevant dataset codes are present. + error = "Some dataset codes are missing." + assert set(DATASET_CODES_AND_NAMES) <= set(tb["dataset_code"]), error + # Check that each dataset has only one value in fields "freq", "product", and "nrg_cons". + # error = "Some datasets have more than one value in field 'freq'." + # assert (tb.groupby("dataset_code")["freq"].nunique() == 1).all(), error + # error = "Expected 'freq' column to be either A (annual) or S (bi-annual)." + # assert set(tb["freq"].dropna()) == set(["A", "S"]), error + # error = "Some datasets have more than one value in field 'product'." + # assert (tb.dropna(subset="product").groupby("dataset_code")["product"].nunique() == 1).all(), error + # error = "Expected 'product' column to be either 4100 (gas) or 6000 (electricity)." + # assert set(tb["product"].dropna()) == set([4100, 6000]), error + error = "Expected electricity prices to be measured in kWh." + assert set( + tb[tb["dataset_code"].isin(["nrg_pc_204", "nrg_pc_205", "nrg_pc_204_h", "nrg_pc_205_h", "nrg_pc_206_h"])][ + "energy_unit" + ] + ) == set(["KWH"]), error + # error = "Expected 'customer' column to be empty, for the selected datasets." + # assert set(tb["customer"].dropna()) == set(), error + # error = "Expected 'consom' column to be empty, for the selected datasets (that column is only relevant for historical data)." + # assert set(tb["consom"].dropna()) == set(), error + for field, mapping in INDEXES_MAPPING.items(): + if field == "flag": + # Flags need to first be extracted from the value (so they will be sanity checked later). + continue + error = f"Unexpected values in field '{field}'." + assert set(tb[field].dropna()) == set(mapping), error + + +def prepare_inputs(tb: Table) -> Table: + # Values sometimes include a letter, which is a flag. Extract those letters and create a separate column with them. + # Note that sometimes there can be multiple letters (which means multiple flags). 
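+ # For example, a raw entry like "23.4567 e" becomes the value 23.4567 with flag "e" (estimated), and "12.3 cd" becomes 12.3 with the combined flag "cd" (confidential, definition differs); the numbers here are made up for illustration.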
+ tb["flag"] = tb["value"].astype("string").str.extract(r"([a-z]+)", expand=False) + tb["value"] = tb["value"].str.replace(r"[a-z]", "", regex=True) + + # Some values are start with ':' (namely ':', ': ', ': c', ': u', ': cd'). Replace them with nan. + tb.loc[tb["value"].str.startswith(":"), "value"] = None + + # Assign a proper type to the column of values. + tb["value"] = tb["value"].astype(float) + + # Create a clean column of years, and another of dates. + tb["year-semester"] = tb["year"].str.strip().copy() + tb["year"] = tb["year-semester"].str[0:4].astype(int) + # For the date column: + # * For the first semester, use April 1st. + # * For the second semester, use October 1st. + # * For annual data, use July 1st. + semester_1_mask = tb["year-semester"].str.contains("S1") + semester_2_mask = tb["year-semester"].str.contains("S2") + annual_mask = tb["year-semester"].str.isdigit() + error = "Unexpected values in field 'year-semester'." + assert (semester_1_mask | semester_2_mask | annual_mask).all(), error + tb["date"] = pd.to_datetime(tb["year"].astype(str) + "-07-01") + tb.loc[semester_1_mask, "date"] = pd.to_datetime(tb[semester_1_mask]["year"].astype(str) + "-04-01") + tb.loc[semester_2_mask, "date"] = pd.to_datetime(tb[semester_2_mask]["year"].astype(str) + "-10-01") + + return tb + + +def harmonize_indexes_and_countries(tb: Table) -> Table: + # Add a column with the dataset name. + tb["dataset_name"] = map_series( + tb["dataset_code"], + mapping=DATASET_CODES_AND_NAMES, + warn_on_missing_mappings=True, + warn_on_unused_mappings=True, + show_full_warning=True, + ) + + # Harmonize all other index names. + for field, mapping in INDEXES_MAPPING.items(): + # Avoid categorical dtypes. + tb[field] = tb[field].astype("string") + not_null_mask = tb[field].notnull() + tb.loc[not_null_mask, field] = map_series( + tb[not_null_mask][field], + mapping=mapping, + warn_on_missing_mappings=True, + warn_on_unused_mappings=True, + show_full_warning=True, + ) + + # Harmonize country names. + # Countries are given in NUTS (Nomenclature of Territorial Units for Statistics) codes. + # Region codes are defined in: https://ec.europa.eu/eurostat/web/nuts/correspondence-tables + # There are additional codes not included there, namely: + # EA: Countries in the Euro Area, that use the Euro as their official currency. + # In the historical datasets, there are some additional regions: + # EU15: The 15 countries that made up the EU prior to its 2004 expansion. + # EU25: The 25 member states after the 2004 enlargement, which added ten countries. + # EU27_2007: The 27 EU member states in 2007. + # EU27_2020: The 27 EU members after the United Kingdom left in 2020. + # UA: Ukraine (not a member of the EU, but often included in some European data). + # UK: United Kingdom (not a member since 2020, but included in some European data). + tb = geo.harmonize_countries( + df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path + ) + + return tb + + +######################################################################################################################## + +# The code in this block is not used in the data pipeline, but it was useful to understand the data and justify some of the choices. + + +def compare_components_and_prices_data(tb: Table) -> None: + # Compare biannual prices data (without averaging overs semesters) with annual components data. 
+ price_level = "All taxes and levies included" + tb_biannual = tb[(tb["year-semester"].str.contains("S")) & (tb["currency"] == "Euro")].reset_index(drop=True) + tb_biannual = tb_biannual[(tb_biannual["price_component_or_level"] == price_level)][ + ["dataset_code", "country", "date", "value"] + ] + # Similarly, for annual data, assign July 1st. + tb_annual = tb[(~tb["year-semester"].str.contains("S")) & (tb["currency"] == "Euro")].reset_index(drop=True) + tb_annual["dataset_code"] = tb_annual["dataset_code"].str.replace("_c", "") + + combination = COMPONENTS_THAT_ADD_UP_TO_TOTAL + annual_components = ( + tb_annual[(tb_annual["price_component_or_level"].isin(combination))] + .groupby( + ["dataset_code", "country", "date"], + observed=True, + as_index=False, + ) + .agg({"value": lambda x: x.sum(min_count=1)}) + .dropna() + .reset_index(drop=True) + ) + + # Combine both datasets for plotting. + compared = pd.concat( + [annual_components.assign(**{"source": "components"}), tb_biannual.assign(**{"source": "prices"})], + ignore_index=True, + ) + # Only a few country-years could be compared this way. Most of the points in the prices datasets were missing. + for dataset_code in compared["dataset_code"].unique(): + for country in sorted(set(compared["country"])): + _compared = compared[(compared["dataset_code"] == dataset_code) & (compared["country"] == country)] + if len(set(_compared["source"])) < 2: + continue + px.line( + _compared, + x="date", + y="value", + color="source", + markers=True, + title=f"{dataset_code} - {country}", + ).update_yaxes(range=[0, None]).show() + + +def find_best_combination_of_components(tb: Table) -> None: + # Check that the resulting total price for the components dataset (summing up components) is similar to the biannual electricity prices data. + + # Ideally, the prices obtained by adding up components (in the components dataset) should be similar to those obtained in the prices dataset. + # However, both are very sparse (especially the prices dataset), and the prices dataset is also given in semesters, which makes it difficult to compare (without having the actual consumption of each semester to be able to compute a weighted average). + # Transforming biannual data into annual data is not straightforward. + # I tried simply taking the average, but what I found is that the annual components prices (summed over all components) tends to be systematically higher than the biannual prices (averaged over the two semester of the year). I suppose this was caused by doing a simple average instead of weighting by consumption. In semesters with higher consumption (e.g. winter), the increased demand tends to drive prices up. Annual prices, as far as I understand, are consumption-weighted averages, and therefore assign a larger weight to those semesters with higher prices. So, intuitively, it makes sense that the true annual prices tend to be higher than the averaged biannual prices. + # We could create a weighted average, but we would need the actual consumption of each semester (which I haven't found straightaway). + + # Compute an annual average only if there is data for the two semesters. 
+ price_level = "All taxes and levies included" + tb_biannual = tb[(tb["year-semester"].str.contains("S")) & (tb["currency"] == "Euro")].reset_index(drop=True) + tb_biannual_filtered = ( + tb_biannual.dropna(subset="value") + .groupby( + ["country", "year", "dataset_code", "price_component_or_level"], + observed=True, + as_index=False, + ) + .filter(lambda x: len(x) == 2) + ) + tb_biannual = tb_biannual_filtered.groupby( + ["country", "year", "dataset_code", "price_component_or_level"], + observed=True, + as_index=False, + ).agg({"value": "mean"}) + tb_biannual = tb_biannual[(tb_biannual["price_component_or_level"] == price_level)][ + ["dataset_code", "country", "year", "value"] + ] + # Similarly, for annual data, assign July 1st. + tb_annual = tb[(~tb["year-semester"].str.contains("S")) & (tb["currency"] == "Euro")].reset_index(drop=True) + tb_annual["dataset_code"] = tb_annual["dataset_code"].str.replace("_c", "") + + def _get_annual_sum(tb_annual, combination): + annual_components = ( + tb_annual[(tb_annual["price_component_or_level"].isin(combination))] + .groupby( + ["dataset_code", "country", "year"], + observed=True, + as_index=False, + ) + .agg({"value": lambda x: x.sum(min_count=1)}) + .dropna() + .reset_index(drop=True) + ) + + return annual_components + + import itertools + + from tqdm.auto import tqdm + + # Get all possible combinations of components. + elements = INDEXES_MAPPING["price_component"].values() + combinations = [] + for r in range(1, len(elements) + 1): + combinations.extend(itertools.combinations(elements, r)) + # Keep only combinations that include "Energy and supply" and "Network costs". + combinations = [c for c in combinations if "Energy and supply" in c and "Network costs" in c] + + # Brute-force analysis: Check which combination of components minimizes the error between the sum of components and the prices data. + # NOTE: This takes about 4 minutes. + error_median = [] + error_mean = [] + error_max = [] + for combination in tqdm(combinations): + annual_components = _get_annual_sum(tb_annual, combination) + compared = ( + tb_biannual.merge( + annual_components, + on=["dataset_code", "country", "year"], + how="inner", + suffixes=("_prices", "_components"), + ) + .dropna() + .reset_index(drop=True) + ) + compared["pct"] = 100 * abs(compared["value_prices"] - compared["value_components"]) / compared["value_prices"] + error_mean.append(compared["pct"].mean()) + error_median.append(compared["pct"].median()) + error_max.append(compared["pct"].max()) + # Find the combination that minimizes the error. + results_df = pd.DataFrame( + { + "combination": combinations, + "error_mean": error_mean, + "error_median": error_median, + "error_max": error_max, + } + ) + # There is no single combination that minimizes all error. + set(results_df[results_df["error_mean"] == results_df["error_mean"].min()]["combination"]) + # After inspection, there are different combinations that minimize error (since some components are actually always zero). 
In terms of the minimum mean error, the combinations are: + # ('Energy and supply', 'Network costs', 'Taxes, fees, levies, and charges'), + # ('Energy and supply', 'Network costs', 'Taxes, fees, levies, and charges', 'Capacity taxes allowances'), + # ('Energy and supply', 'Network costs', 'Taxes, fees, levies, and charges', 'Capacity taxes allowances', 'Nuclear taxes allowance'), + # ('Energy and supply', 'Network costs', 'Taxes, fees, levies, and charges', 'Nuclear taxes allowance') + # Given that some of those allowance components are actually (almost) always zero, it seems clear that + # the best combination is, as expected: + components_optimal = ["Energy and supply", "Network costs", "Taxes, fees, levies, and charges"] + annual_components = _get_annual_sum(tb_annual, combination=components_optimal) + compared = ( + tb_biannual.merge( + annual_components, on=["dataset_code", "country", "year"], how="inner", suffixes=("_prices", "_components") + ) + .dropna() + .reset_index(drop=True) + ) + compared["pct"] = 100 * abs(compared["value_prices"] - compared["value_components"]) / compared["value_prices"] + compared.sort_values("pct", ascending=False).head(60) + # For most countries, the agreement is good, but for some country-years, the discrepancy is significant, e.g. Denmark non-household electricity in 2022 and 2023, with an error of 26%. + # There are only a few other discrepancies above 10%. + + # Visually inspect these discrepancies. + compared = pd.concat( + [annual_components.assign(**{"source": "components"}), tb_biannual.assign(**{"source": "prices"})], + ignore_index=True, + ) + # Only a few country-years could be compared this way. Most of the points in the prices datasets were missing. + for dataset_code in compared["dataset_code"].unique(): + for country in sorted(set(compared["country"])): + if ( + len( + set( + compared[(compared["dataset_code"] == dataset_code) & (compared["country"] == country)][ + "source" + ] + ) + ) + < 2 + ): + continue + px.line( + compared[(compared["dataset_code"] == dataset_code) & (compared["country"] == country)], + x="year", + y="value", + color="source", + markers=True, + title=f"{dataset_code} - {country}", + ).update_yaxes(range=[0, None]).show() + + # Conclusions: + # * The prices and components datasets coincide reasonably well. To recover prices, it seems that the components to be added up are just "Energy and supply", "Network costs", and "Taxes, fees, levies, and charges". For most countries, this combination gives a good agreement with the prices dataset. However, for some countries, there is a significant discrepancy. + # * Numerically, I have checked that for all price components datasets, "Taxes, fees, levies and charges" coincides with the sum of 'Capacity taxes', 'Environmental taxes', 'Nuclear taxes', 'Renewable taxes', 'Value added tax (VAT)', 'Other'. For some country-years, there is a small discrepancy. + # * What's not so clear is what happens with the "allowances". Is "Taxes, fees, levies, and charges allowance" the sum of all other "* allowance"? It's hard to know, since it's non-zero only once (nrg_pc_204_c Netherlands 2023). At that point, it does coincide with the sum of all other "* allowance". But there are other instances of non-zero "* allowance" where "Taxes...allowance" is not defined. It may be possible that allowances are not included in the prices dataset.
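+ # As a concrete illustration of the chosen reconstruction (with made-up numbers): if a country-year reports "Energy and supply" = 80 EUR/MWh, "Network costs" = 40 EUR/MWh, and "Taxes, fees, levies, and charges" = 60 EUR/MWh, the reconstructed total price including taxes is 80 + 40 + 60 = 180 EUR/MWh.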
+ + +def plot_final_comparison_between_prices_and_components_data(tb: Table) -> None: + for country in tb["country"].unique(): + for consumer_type in tb["consumer_type"].unique(): + for source in tb["source"].unique(): + _tb = tb[ + (tb["country"] == country) + & (tb["source"] == source) + & (tb["consumer_type"] == consumer_type) + & ( + tb["price_component_or_level"].isin( + [COMPONENTS_TOTAL_PRICE_LABEL, "All taxes and levies included"] + ) + ) + ] + if len(_tb["price_component_or_level"].unique()) < 2: + continue + px.line( + _tb, + x="date", + y="price_euro", + color="price_component_or_level", + markers=True, + title=f"{consumer_type} {source} - {country}", + ).show() + + +######################################################################################################################## + + +def select_and_prepare_relevant_data(tb: Table) -> Table: + # All datasets have an energy unit except electricity components (both for households and non-households). + # I assume the energy unit is kWh. + error = "Expected electricity components (both for households and non-households) to have no energy unit. Remove this code." + assert tb[tb["dataset_code"].isin(["nrg_pc_204_c", "nrg_pc_205_c"])]["energy_unit"].isnull().all(), error + tb.loc[tb["dataset_code"].isin(["nrg_pc_204_c", "nrg_pc_205_c"]), "energy_unit"] = "kWh" + + error = "Expected all datasets to have the same energy unit (kWh)." + assert ( + tb.groupby(["dataset_code"], observed=True, as_index=False) + .agg({"energy_unit": lambda x: "kWh" in x.unique()})["energy_unit"] + .all() + ), error + # Select the same energy unit for all datasets (kWh). + tb = tb[tb["energy_unit"] == "kWh"].drop(columns=["energy_unit"], errors="raise").reset_index(drop=True) + + # Convert prices from price per kWh to price per MWh. + tb["value"] *= 1000 + + # For convenience, instead of having a column for price component (for components datasets) and price level (for prices datasets), create a single column with the price component or level. + assert tb[(tb["price_level"].isnull()) & (tb["price_component"].isnull())].empty + assert tb[(tb["price_level"].notnull()) & (tb["price_component"].notnull())].empty + tb["price_component_or_level"] = tb["price_level"].fillna(tb["price_component"]) + tb = tb.drop(columns=["price_level", "price_component"], errors="raise") + + # After inspection, it looks like the "All bands" consumption is very sparse in the prices datasets. + # One option (if we decided to use the prices dataset) would be to use the more common consumption bands only, which are better informed. + # In the components dataset, "All bands" seems to be less sparse (at least from 2019 onwards). + # To get the total price from the components dataset, we would need to add up components. + # But we would need to figure out which subset of components ensures no double-counting. + tb = ( + tb[tb["consumption_band"] == "All bands"] + .drop(columns=["consumption_band"], errors="raise") + .reset_index(drop=True) + ) + + # Find the combination of price components that needs to be summed up to recover the full prices. + # NOTE: Uncomment to perform the analysis again, and see the conclusions in that function to understand the choices. + # find_best_combination_of_components(tb=tb) + + # Visually compare the resulting prices obtained by adding up certain components, with the original prices data. + # NOTE: Uncomment to perform some visual checks.
+ # compare_components_and_prices_data(tb=tb) + + # Remove groups (of country-year-dataset-currency) from the components dataset for which certain components (e.g. "Energy and supply") are not included. + # For example, Albania doesn't have "Energy and supply" costs for household electricity, but it does have other components (e.g. "Network costs"). + tb.loc[ + (tb["dataset_code"].isin(DATASET_CODES_COMPONENTS)) + & ( + ~tb.groupby(["country", "year", "currency"])["price_component_or_level"].transform( + lambda x: all(comp in x.tolist() for comp in MANDATORY_PRICE_COMPONENTS) + ) + ), + "value", + ] = None + + # Remove empty rows. + tb = tb.dropna(subset=["value"]).reset_index(drop=True) + + # Remove data with certain flags. + tb = tb[ + ~tb["flag"].isin( + [ + "confidential", + "definition differs", + "low reliability", + "confidential, definition differs", + "unknown flag", + ] + ) + ].reset_index(drop=True) + error = "Unexpected flag values." + assert set(tb["flag"].dropna()) <= set(["estimated", "break in time series", "provisional"]), error + + # Add total price to the components dataset, by adding up the contribution of the main components. + tb_components_total = ( + tb[ + tb["dataset_code"].isin(DATASET_CODES_COMPONENTS) + & (tb["price_component_or_level"].isin(COMPONENTS_THAT_ADD_UP_TO_TOTAL)) + ] + .groupby( + ["currency", "country", "year", "dataset_code", "year-semester", "date", "dataset_name"], + observed=True, + as_index=False, + ) + .agg({"value": "sum"}) + .assign(**{"price_component_or_level": COMPONENTS_TOTAL_PRICE_LABEL}) + ) + tb = pr.concat([tb, tb_components_total], ignore_index=True) + + # Create a column for the energy source. + tb["source"] = map_series( + tb["dataset_code"], + mapping=DATASET_CODE_TO_ENERGY_SOURCE, + warn_on_missing_mappings=True, + warn_on_unused_mappings=True, + show_full_warning=True, + ) + error = "Unexpected energy source." + assert set(tb["source"]) == set(["Gas", "Electricity"]), error + + # Create a column for the consumer type. + tb["consumer_type"] = map_series( + tb["dataset_code"], + mapping=DATASET_CODE_TO_CONSUMER_TYPE_MAPPING, + warn_on_missing_mappings=True, + warn_on_unused_mappings=True, + show_full_warning=True, + ) + error = "Unexpected consumer type." + assert set(tb["consumer_type"]) == set(["Household", "Non-household"]), error + + # Drop unnecessary columns. + tb = tb.drop(columns=["flag", "year-semester", "dataset_name"], errors="raise") + + # It would be confusing to keep different national currencies, so keep only Euro and PPS. + tb = tb[tb["currency"].isin(["Euro", "PPS"])].reset_index(drop=True) + + # Separate euros and PPS into two different columns. + tb = ( + tb[tb["currency"] == "Euro"] + .drop(columns=["currency"]) + .merge( + tb[tb["currency"] == "PPS"].drop(columns=["currency"]), + how="outer", + on=["country", "year", "date", "dataset_code", "source", "price_component_or_level", "consumer_type"], + suffixes=("_euro", "_pps"), + ) + .rename(columns={"value_euro": "price_euro", "value_pps": "price_pps"}, errors="raise") + ) + + return tb + + +def prepare_wide_tables(tb: Table) -> Tuple[Table, Table, Table, Table]: + # Table for average prices (in euros) of gas and electricity prices of household and non-household consumers.
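+ # After pivoting and renaming, the flat columns are named like, e.g., "electricity_household_all_taxes_and_levies_included_euro", matching the variable names defined in the accompanying .meta.yml file.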
+ tb_prices_euro = tb[tb["dataset_code"].isin(DATASET_CODES_PRICES)].pivot( + index=["country", "date"], + columns=["source", "consumer_type", "price_component_or_level"], + values="price_euro", + join_column_levels_with="-", + ) + # Table for average prices (in PPS) of gas and electricity prices of household and non-household consumers. + tb_prices_pps = tb[tb["dataset_code"].isin(DATASET_CODES_PRICES)].pivot( + index=["country", "date"], + columns=["source", "consumer_type", "price_component_or_level"], + values="price_pps", + join_column_levels_with="-", + ) + # Improve tables format. + tb_prices_euro = tb_prices_euro.format(["country", "date"], short_name="gas_and_electricity_prices_euro_flat") + tb_prices_pps = tb_prices_pps.format(["country", "date"], short_name="gas_and_electricity_prices_pps_flat") + + # Improve column names. + tb_prices_euro = tb_prices_euro.rename( + columns={column: column.replace("__", "_") + "_euro" for column in tb_prices_euro.columns}, errors="raise" + ) + tb_prices_pps = tb_prices_pps.rename( + columns={column: column.replace("__", "_") + "_pps" for column in tb_prices_pps.columns}, errors="raise" + ) + + # Table for price components (in euros) of gas and electricity prices of household and non-household consumers. + tb_price_components_euro = tb[tb["dataset_code"].isin(DATASET_CODES_COMPONENTS)].pivot( + index=["country", "year"], + columns=["source", "consumer_type", "price_component_or_level"], + values="price_euro", + join_column_levels_with="-", + ) + # Table for price components (in PPS) of gas and electricity prices of household and non-household consumers. + tb_price_components_pps = tb[tb["dataset_code"].isin(DATASET_CODES_COMPONENTS)].pivot( + index=["country", "year"], + columns=["source", "consumer_type", "price_component_or_level"], + values="price_pps", + join_column_levels_with="-", + ) + # Improve tables format. + tb_price_components_euro = tb_price_components_euro.format( + ["country", "year"], short_name="gas_and_electricity_price_components_euro_flat" + ) + tb_price_components_pps = tb_price_components_pps.format( + ["country", "year"], short_name="gas_and_electricity_price_components_pps_flat" + ) + + # Improve column names. + tb_price_components_euro = tb_price_components_euro.rename( + columns={column: column.replace("__", "_") + "_euro" for column in tb_price_components_euro.columns}, + errors="raise", + ) + tb_price_components_pps = tb_price_components_pps.rename( + columns={column: column.replace("__", "_") + "_pps" for column in tb_price_components_pps.columns}, + errors="raise", + ) + + return tb_prices_euro, tb_prices_pps, tb_price_components_euro, tb_price_components_pps + + +def sanity_check_outputs(tb: Table) -> None: + error = "Expected 'Energy and supply' and 'Network costs' to be non-negative." + assert tb[ + tb["dataset_code"].isin(DATASET_CODES_COMPONENTS) + & tb["price_component_or_level"].isin(["Energy and supply", "Network costs"]) + & (tb["price_euro"] < 0) + ].empty, error + + # Further sanity checks on component prices. 
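+ # That is, for each dataset-country-year, the individual taxes should add up to the aggregate; e.g. (with made-up numbers) capacity 10 + environmental 5 + nuclear 0 + renewable 15 + VAT 25 + other 5 = 60 euros/MWh should match "Taxes, fees, levies, and charges" = 60 euros/MWh, within the tolerance asserted below.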
+    tb_components = tb[tb["dataset_code"].isin(DATASET_CODES_COMPONENTS)].reset_index(drop=True)
+    tb_taxes_sum = (
+        tb_components[
+            tb_components["price_component_or_level"].isin(
+                [
+                    "Capacity taxes",
+                    "Environmental taxes",
+                    "Nuclear taxes",
+                    "Renewable taxes",
+                    "Value added tax (VAT)",
+                    "Other",
+                ]
+            )
+        ]
+        .groupby(["dataset_code", "country", "year"], observed=True, as_index=False)
+        .agg({"price_euro": "sum"})
+    )
+    tb_taxes_original = tb_components[
+        tb_components["price_component_or_level"] == "Taxes, fees, levies, and charges"
+    ].reset_index(drop=True)[["dataset_code", "country", "year", "price_euro"]]
+    # NOTE: The median value of the sum is 0.0191 euros/kWh. When comparing the percentage difference, ignore values
+    # that are too small.
+    compared = tb_taxes_sum.merge(
+        tb_taxes_original, how="outer", on=["dataset_code", "country", "year"], suffixes=("_sum", "_original")
+    )
+    compared["dev"] = 100 * (
+        abs(compared["price_euro_sum"] - compared["price_euro_original"]) / compared["price_euro_original"]
+    )
+    error = "Expected the sum of 'Capacity taxes', 'Environmental taxes', 'Nuclear taxes', 'Renewable taxes', 'Value added tax (VAT)', 'Other' to coincide with 'Taxes, fees, levies, and charges', within 2% (ignoring any prices below 0.007 euros/kWh, which is 17% of rows)."
+    # NOTE: The threshold of 7 below is presumably the 0.007 euros/kWh from the error message, expressed in euros/MWh.
+    # NOTE: Some dataset-country-year combinations have a significant discrepancy, e.g. nrg_pc_202_c-Greece-2022, with
+    # a price of 6.7€/MWh.
+    assert compared[(compared["price_euro_original"] > 7) & (compared["dev"] > 2)].empty, error
+    # compared.sort_values("dev", ascending=False).head(60)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("gas_and_electricity_prices")
+
+    # Read table from meadow dataset.
+    tb = ds_meadow.read("gas_and_electricity_prices")
+
+    #
+    # Process data.
+    #
+    # Select relevant dataset codes, and add a column with the dataset name.
+    tb = tb[tb["dataset_code"].isin(DATASET_CODES_AND_NAMES.keys())].reset_index(drop=True)
+
+    # Select and rename columns.
+    tb = tb[list(COLUMNS)].rename(columns=COLUMNS, errors="raise")
+
+    # Sanity checks on inputs.
+    sanity_check_inputs(tb=tb)
+
+    # Clean inputs.
+    tb = prepare_inputs(tb=tb)
+
+    # Harmonize indexes and country names.
+    tb = harmonize_indexes_and_countries(tb=tb)
+
+    # Select and prepare relevant data.
+    tb = select_and_prepare_relevant_data(tb=tb)
+
+    # Sanity check outputs.
+    sanity_check_outputs(tb=tb)
+
+    # Uncomment to plot a comparison (for each country, source, and consumer type) between the prices and the
+    # components data.
+    # NOTE: Some of the biggest discrepancies happen where prices data is given only for one of the semesters. This is
+    # the case of Georgia household electricity in 2021 and 2022, where we can't see the value of the missing semester
+    # (which could explain why the components data is significantly higher).
+    # plot_final_comparison_between_prices_and_components_data(tb=tb)
+
+    # Create convenient wide tables.
+    tb_prices_euro, tb_prices_pps, tb_price_components_euro, tb_price_components_pps = prepare_wide_tables(tb=tb)
+
+    # Improve main table format.
+    tb = tb.drop(columns=["dataset_code"]).format(
+        ["country", "date", "source", "consumer_type", "price_component_or_level"]
+    )
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset.
+    ds_garden = create_dataset(
+        dest_dir,
+        tables=[tb, tb_prices_euro, tb_prices_pps, tb_price_components_euro, tb_price_components_pps],
+        check_variables_metadata=True,
+        default_metadata=ds_meadow.metadata,
+    )
+    ds_garden.save()
diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py
index 326485e94b8..3c77d0cb76f 100644
--- a/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py
+++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py
@@ -498,7 +498,7 @@ def process_combined_data(tb: Table, ds_population: Dataset) -> Table:
     # Fill gaps in OWID population with FAO population (for "* (FAO)" countries, i.e. countries that were not
     # harmonized and for which there is no OWID population).
     # Then drop "fao_population", since it is no longer needed.
-    tb_wide["population"] = tb_wide["population"].fillna(tb_wide["fao_population"])
+    tb_wide["population"] = tb_wide["population"].astype("Float64").fillna(tb_wide["fao_population"])
     tb_wide = tb_wide.drop(columns="fao_population")
 
     assert len(tb_wide.columns[tb_wide.isnull().all(axis=0)]) == 0, "Unexpected columns with only nan values."
diff --git a/etl/steps/data/garden/gapminder/2019-05-25/fertility_rate.py b/etl/steps/data/garden/gapminder/2019-05-25/fertility_rate.py
index a4968d65180..02d9f00c5ef 100644
--- a/etl/steps/data/garden/gapminder/2019-05-25/fertility_rate.py
+++ b/etl/steps/data/garden/gapminder/2019-05-25/fertility_rate.py
@@ -11,7 +11,7 @@ def run(dest_dir: str) -> None:
     # Load data from snapshot.
     #
     snap = paths.load_snapshot()
-    tb = snap.read().set_index(["country", "year"])
+    tb = snap.read(safe_types=False).set_index(["country", "year"])
 
     #
     # Save outputs.
diff --git a/etl/steps/archive/garden/gcp/2023-12-05/global_carbon_budget.countries.json b/etl/steps/data/garden/gcp/2024-11-13/global_carbon_budget.countries.json
similarity index 100%
rename from etl/steps/archive/garden/gcp/2023-12-05/global_carbon_budget.countries.json
rename to etl/steps/data/garden/gcp/2024-11-13/global_carbon_budget.countries.json
diff --git a/etl/steps/archive/garden/gcp/2023-12-05/global_carbon_budget.excluded_countries.json b/etl/steps/data/garden/gcp/2024-11-13/global_carbon_budget.excluded_countries.json
similarity index 100%
rename from etl/steps/archive/garden/gcp/2023-12-05/global_carbon_budget.excluded_countries.json
rename to etl/steps/data/garden/gcp/2024-11-13/global_carbon_budget.excluded_countries.json
diff --git a/etl/steps/data/garden/gcp/2024-11-13/global_carbon_budget.meta.yml b/etl/steps/data/garden/gcp/2024-11-13/global_carbon_budget.meta.yml
new file mode 100644
index 00000000000..9d9470af087
--- /dev/null
+++ b/etl/steps/data/garden/gcp/2024-11-13/global_carbon_budget.meta.yml
@@ -0,0 +1,514 @@
+definitions:
+  production_emissions_description_key: &production_emissions_description_key
+    - This data is based on territorial emissions, which do not account for emissions embedded in traded goods.
+  traded_emissions_description_key: &traded_emissions_description_key
+    - Net CO₂ emissions embedded in trade is the net amount of CO₂ imported or exported by an economy via traded goods. A positive value denotes that a country or region is a net importer of CO₂ emissions; a negative value indicates that it is a net exporter.
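+  # (The anchors defined in this section are re-used below via YAML aliases, e.g. `- *traded_emissions_description_key`
+  # inside an indicator's `description_key`, so each key point is written once and shared across indicators.)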
+  international_aviation_description_key: &international_aviation_description_key
+    - Emissions from international aviation and shipping are not included in any country or region's emissions. They are only included in the global total emissions.
+  consumption_emissions_description_key: &consumption_emissions_description_key
+    - Consumption-based emissions attribute the emissions generated in the production of goods and services according to where they were _consumed_, rather than where they were _produced_.
+    - "The data is calculated by adjusting 'production-based' emissions (emissions produced domestically) for trade: Consumption-based emissions equals production-based emissions, _minus_ emissions embedded in exports, _plus_ emissions embedded in imports."
+    - If a country's consumption-based emissions are higher than its production emissions, it is a net importer of carbon dioxide. If its consumption-based emissions are lower, then it is a net exporter.
+    - Consumption-based emissions are not available for all countries because not all countries have sufficient, high-quality trade data. But those without complete data are a small fraction (3%) of the global total.
+    - This data measures carbon dioxide (CO₂) emissions from fossil fuels and industry and does not include emissions from land use change, deforestation, soils, or vegetation.
+  per_capita_description_key: &per_capita_description_key
+    - Per capita emissions represent the emissions of an average person in a country or region - they are calculated as the total emissions divided by population.
+  # Common fields to be used in all indicators (unless overridden for specific indicators below).
+  common:
+    description_processing: &description_processing |
+      - Data on global emissions has been converted from tonnes of carbon to tonnes of carbon dioxide (CO₂) using a conversion factor of 3.664.
+      - Emissions from the Kuwaiti oil fires in 1991 have been included as part of Kuwait's emissions for that year.
+      - Each country's share of the global population is calculated using our population dataset, based on [different sources](https://ourworldindata.org/population-sources).
+      - Each country's share of global CO₂ emissions from flaring has been calculated using global CO₂ emissions from flaring provided in the Global Carbon Budget dataset.
+    description_key:
+      # NOTE: The description key points are re-defined for each indicator on consumption-based emissions and traded emissions, as well as on per-capita indicators.
+      - *production_emissions_description_key
+      - *international_aviation_description_key
+    presentation:
+      topic_tags:
+        - CO2 & Greenhouse Gas Emissions
+      attribution_short: GCB
+    processing_level: major
+
+dataset:
+  title: Global Carbon Budget
+  update_period_days: 365
+
+tables:
+  global_carbon_budget:
+    variables:
+      consumption_emissions:
+        title: "Annual consumption-based CO₂ emissions"
+        unit: "tonnes"
+        short_unit: "t"
+        description_short: Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes.
+        description_key:
+          - *consumption_emissions_description_key
+          - *international_aviation_description_key
+      consumption_emissions_as_share_of_global:
+        title: "Share of global annual CO₂ consumption-based emissions"
+        unit: "%"
+        short_unit: "%"
+        description_short: "Annual consumption-based emissions of carbon dioxide (CO₂), measured as a percentage of global consumption-based emissions of CO₂ in the same year."
+ description_key: + - *consumption_emissions_description_key + - *international_aviation_description_key + ################################################################################################################## + # Curated indicator for data page. + consumption_emissions_per_capita: + title: Per capita consumption-based CO₂ emissions + description_short: | + Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes per person. + description_key: + - *consumption_emissions_description_key + - *per_capita_description_key + - *international_aviation_description_key + description_processing: *description_processing + unit: tonnes per person + short_unit: t/person + display: + shortUnit: t + numDecimalPlaces: 0 + presentation: + attribution_short: Global Carbon Project + topic_tags: + - CO2 & Greenhouse Gas Emissions + - Climate Change + - Energy + faqs: + - fragment_id: emissions-from-aviation-and-shipping + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: missing-consumption-based-emissions + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + subtitle: >- + [Consumption-based emissions](#dod:consumptionbasedemissions) are national + emissions that have been adjusted for trade. It's production-based emissions + minus emissions embedded in exports, plus emissions embedded in imports. + hideAnnotationFieldsInTitle: + time: true + entity: true + changeInPrefix: true + hideRelativeToggle: false + hasMapTab: true + tab: map + originUrl: https://ourworldindata.org/co2-and-greenhouse-gas-emissions + colorScale: + binningStrategy: equalInterval + map: + colorScale: + baseColorScheme: Reds + binningStrategy: manual + customNumericValues: + - 1 + - 2 + - 5 + - 10 + - 20 + - 50 + customNumericColors: + - null + - null + selectedEntityNames: + - United States + - United Kingdom + - European Union (27) + - China + - India + - Australia + - Brazil + - South Africa + relatedQuestions: + - url: https://ourworldindata.org/grapher/consumption-co2-per-capita#faqs + text: FAQs on this data + consumption_emissions_per_gdp: + title: "Annual consumption-based CO₂ emissions per GDP (kg per international-$)" + unit: "kilograms per international-$" + short_unit: "kg/$" + description_short: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in kilograms per dollar of GDP (2011 international-$)." + description_key: + - *consumption_emissions_description_key + - *international_aviation_description_key + cumulative_consumption_emissions: + title: "Cumulative CO₂ consumption-based emissions" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of available data, measured in tonnes." + description_key: + - *consumption_emissions_description_key + - *international_aviation_description_key + cumulative_consumption_emissions_as_share_of_global: + title: "Share of global cumulative CO₂ consumption-based emissions" + unit: "%" + short_unit: "%" + description_short: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of available data, measured as a percentage of global cumulative consumption-based emissions." 
+ description_key: + - *consumption_emissions_description_key + - *international_aviation_description_key + cumulative_emissions_from_cement: + title: "Cumulative CO₂ emissions from cement" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from cement since the first year of available data, measured in tonnes." + cumulative_emissions_from_cement_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from cement" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from cement since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from cement." + cumulative_emissions_from_coal: + title: "Cumulative CO₂ emissions from coal" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from coal since the first year of available data, measured in tonnes." + cumulative_emissions_from_coal_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from coal" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from coal since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from coal." + cumulative_emissions_from_flaring: + title: "Cumulative CO₂ emissions from flaring" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from flaring since the first year of available data, measured in tonnes." + cumulative_emissions_from_flaring_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from flaring" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from flaring since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from flaring." + cumulative_emissions_from_gas: + title: "Cumulative CO₂ emissions from gas" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from gas since the first year of available data, measured in tonnes." + cumulative_emissions_from_gas_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from gas" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from gas since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from gas." + cumulative_emissions_from_land_use_change: + title: "Cumulative CO₂ emissions from land-use change" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from land-use change since the first year of available data, measured in tonnes." + cumulative_emissions_from_land_use_change_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from land-use change" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from land-use change since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from land-use change." + cumulative_emissions_from_oil: + title: "Cumulative CO₂ emissions from oil" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from oil since the first year of available data, measured in tonnes." 
+ cumulative_emissions_from_oil_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from oil" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from oil since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from oil." + cumulative_emissions_from_other_industry: + title: "Cumulative CO₂ emissions from other industry" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from other industry sources since the first year of available data, measured in tonnes." + cumulative_emissions_from_other_industry_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from other industry" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from other industry sources since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from other industry sources." + cumulative_emissions_total: + title: "Cumulative CO₂ emissions" + unit: "tonnes" + short_unit: "t" + description_short: "Total cumulative emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of available data, measured in tonnes." + cumulative_emissions_total_as_share_of_global: + title: "Share of global cumulative CO₂ emissions" + unit: "%" + short_unit: "%" + description_short: "Total cumulative emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of available data, measured as a percentage of global total cumulative emissions of CO₂." + cumulative_emissions_total_including_land_use_change: + title: "Cumulative CO₂ emissions including land-use change" + unit: "tonnes" + short_unit: "t" + description_short: "Total cumulative emissions of carbon dioxide (CO₂), including land-use change, since the first year of available data, measured in tonnes." + cumulative_emissions_total_including_land_use_change_as_share_of_global: + title: "Share of global cumulative CO₂ emissions including land-use change" + unit: "%" + short_unit: "%" + description_short: "Total cumulative emissions of carbon dioxide (CO₂), including land-use change, since the first year of available data, measured as a percentage of global total cumulative emissions of CO₂ (including land-use change)." + emissions_from_cement: + title: "Annual CO₂ emissions from cement" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from cement, measured in tonnes." + emissions_from_cement_as_share_of_global: + title: "Share of global annual CO₂ emissions from cement" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from cement, measured as a percentage of global emissions of CO₂ from cement in the same year." + emissions_from_cement_per_capita: + title: "Annual CO₂ emissions from cement (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from cement, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_coal: + title: "Annual CO₂ emissions from coal" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from coal, measured in tonnes." 
+ emissions_from_coal_as_share_of_global: + title: "Share of global annual CO₂ emissions from coal" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from coal, measured as a percentage of global emissions of CO₂ from coal in the same year." + emissions_from_coal_per_capita: + title: "Annual CO₂ emissions from coal (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from coal, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_flaring: + title: "Annual CO₂ emissions from flaring" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from flaring, measured in tonnes." + emissions_from_flaring_as_share_of_global: + title: "Share of global annual CO₂ emissions from flaring" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from flaring, measured as a percentage of global emissions of CO₂ from flaring in the same year." + emissions_from_flaring_per_capita: + title: "Annual CO₂ emissions from flaring (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from flaring, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_gas: + title: "Annual CO₂ emissions from gas" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from gas, measured in tonnes." + emissions_from_gas_as_share_of_global: + title: "Share of global annual CO₂ emissions from gas" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from gas, measured as a percentage of global emissions of CO₂ from gas in the same year." + emissions_from_gas_per_capita: + title: "Annual CO₂ emissions from gas (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from gas, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_land_use_change: + title: "Annual CO₂ emissions from land-use change" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes." + emissions_from_land_use_change_as_share_of_global: + title: "Share of global annual CO₂ emissions from land-use change" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from land-use change, measured as a percentage of global emissions of CO₂ from land-use change in the same year." + emissions_from_land_use_change_per_capita: + title: "Annual CO₂ emissions from land-use change per capita" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes per person." 
+ description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_oil: + title: "Annual CO₂ emissions from oil" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from oil, measured in tonnes." + emissions_from_oil_as_share_of_global: + title: "Share of global annual CO₂ emissions from oil" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from oil, measured as a percentage of global emissions of CO₂ from oil in the same year." + emissions_from_oil_per_capita: + title: "Annual CO₂ emissions from oil (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from oil, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_other_industry: + title: "Annual CO₂ emissions from other industry" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes." + emissions_from_other_industry_as_share_of_global: + title: "Share of global annual CO₂ emissions from other industry" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from other industry sources, measured as a percentage of global emissions of CO₂ from other industry sources in the same year." + emissions_from_other_industry_per_capita: + title: "Annual CO₂ emissions from other industry (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_total: + title: "Annual CO₂ emissions" + unit: "tonnes" + short_unit: "t" + description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes." + emissions_total_as_share_of_global: + title: "Share of global annual CO₂ emissions" + unit: "%" + short_unit: "%" + description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured as a percentage of global emissions of CO₂ in the same year." + emissions_total_including_land_use_change: + title: "Annual CO₂ emissions including land-use change" + unit: "tonnes" + short_unit: "t" + description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes." + emissions_total_including_land_use_change_as_share_of_global: + title: "Share of global annual CO₂ emissions including land-use change" + unit: "%" + short_unit: "%" + description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured as a percentage of global total emissions of CO₂ in the same year." + emissions_total_including_land_use_change_per_capita: + title: "Annual CO₂ emissions including land-use change per capita" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes per person." 
+ description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_total_including_land_use_change_per_gdp: + title: "Annual CO₂ emissions including land-use change per GDP" + unit: "kilograms per international-$" + short_unit: "kg/$" + description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per dollar of GDP (2011 international-$)." + emissions_total_including_land_use_change_per_unit_energy: + title: "Annual CO₂ emissions including land-use change per unit energy" + unit: "kilograms per kilowatt-hour" + short_unit: "kg/kWh" + description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per kilowatt-hour of primary energy consumption." + emissions_total_per_capita: + title: "Annual CO₂ emissions (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_total_per_gdp: + title: "Annual CO₂ emissions per GDP (kg per international-$)" + unit: "kilograms per international-$" + short_unit: "kg/$" + description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per dollar of GDP (2011 international-$)." + emissions_total_per_unit_energy: + title: "Annual CO₂ emissions per unit energy (kg per kilowatt-hour)" + unit: "kilograms per kilowatt-hour" + short_unit: "kg/kWh" + description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per kilowatt-hour of primary energy consumption." + gdp: + title: "GDP" + unit: "2011 international-$" + short_unit: "$" + description_short: >- + Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over time (inflation) + and price differences between countries. + growth_emissions_total: + title: "Annual CO₂ emissions growth (abs)" + unit: "tonnes" + short_unit: "t" + description_short: "Annual growth in total emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes." + growth_emissions_total_including_land_use_change: + title: "Growth rate of emissions including land-use change" + unit: "tonnes" + short_unit: "t" + description_short: "Annual growth in total emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes." + pct_growth_emissions_total: + title: "Annual CO₂ emissions growth (%)" + unit: "%" + short_unit: "%" + description_short: "Annual percentage growth in total emissions of carbon dioxide (CO₂), excluding land-use change." + pct_growth_emissions_total_including_land_use_change: + title: "Growth rate of emissions including land-use change (%)" + unit: "%" + short_unit: "%" + description_short: "Annual percentage growth in total emissions of carbon dioxide (CO₂), including land-use change." + pct_traded_emissions: + title: "Share of annual CO₂ emissions embedded in trade" + unit: "%" + short_unit: "%" + description_short: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured as a percentage of emissions of CO₂." 
+        description_key:
+          - *traded_emissions_description_key
+          - *international_aviation_description_key
+      population:
+        title: "Population"
+        unit: "persons"
+        short_unit: "persons"
+      population_as_share_of_global:
+        title: "Share of population"
+        unit: "%"
+        short_unit: "%"
+        description_short: "Population, measured as a percentage of global total population in the same year."
+      primary_energy_consumption:
+        title: "Primary energy consumption"
+        unit: "terawatt-hours"
+        short_unit: "TWh"
+        description_short: "Primary energy consumption, measured in terawatt-hours per year."
+      traded_emissions:
+        title: "Annual CO₂ emissions embedded in trade"
+        unit: "tonnes"
+        short_unit: "t"
+        description_short: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes."
+        description_key:
+          - *traded_emissions_description_key
+          - *international_aviation_description_key
+      traded_emissions_per_capita:
+        title: "Annual CO₂ emissions embedded in trade (per capita)"
+        unit: "tonnes per person"
+        short_unit: "t/person"
+        display:
+          shortUnit: t
+        description_short: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes per person."
+        description_key:
+          - *per_capita_description_key
+          - *traded_emissions_description_key
+          - *international_aviation_description_key
diff --git a/etl/steps/data/garden/gcp/2024-11-13/global_carbon_budget.py b/etl/steps/data/garden/gcp/2024-11-13/global_carbon_budget.py
new file mode 100644
index 00000000000..449b89cf2ab
--- /dev/null
+++ b/etl/steps/data/garden/gcp/2024-11-13/global_carbon_budget.py
@@ -0,0 +1,1141 @@
+"""This step creates the Global Carbon Budget (GCB) dataset, by the Global Carbon Project (GCP).
+
+It harmonizes and further processes meadow data, and uses the following auxiliary datasets:
+- GGDC's Maddison dataset on GDP, used to calculate emissions per GDP.
+- Primary Energy Consumption (mix of sources from the 'energy' namespace) to calculate emissions per unit energy.
+- Population (mix of sources), to calculate emissions per capita.
+- Regions (mix of sources), to generate aggregates for different continents.
+- WorldBank's Income groups, to generate aggregates for different income groups.
+
+"""
+import numpy as np
+import owid.catalog.processing as pr
+from owid.catalog import Dataset, Table
+from owid.datautils import dataframes
+from structlog import get_logger
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+log = get_logger()
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+# Expected outliers in consumption-based emissions (with negative emissions in the original data, which will be removed).
+# NOTE: This issue has been reported to the data providers, and will hopefully be fixed in a coming version.
+OUTLIERS_IN_CONSUMPTION_DF = [
+    ("Panama", 2003),
+    ("Panama", 2004),
+    ("Panama", 2005),
+    ("Panama", 2006),
+    ("Panama", 2012),
+    ("Panama", 2013),
+    ("Venezuela", 2018),
+]
+
+# Regions and income groups to create by aggregating contributions from member countries.
+# In the following dictionary, if nothing is stated, the region is supposed to be a default continent/income group.
+# Otherwise, the dictionary can have "additional_regions", "excluded_regions", "additional_members", and
+# "excluded_members". The aggregates will be calculated on the resulting countries.
+REGIONS = {
+    # Default continents.
+ "Africa": {}, + "Asia": {}, + "Europe": {}, + # We exclude GCB's EU27 data, because it appears only in a few metrics, and, when it exists, it is identical to our + # aggregated European Union (27). + "European Union (27)": {}, + "North America": {}, + "Oceania": {}, + "South America": {}, + # Income groups. + "Low-income countries": {}, + "Upper-middle-income countries": {}, + "Lower-middle-income countries": {}, + "High-income countries": {}, + # Additional composite regions. + "Asia (excl. China and India)": { + "additional_regions": ["Asia"], + "excluded_members": ["China", "India"], + }, + "Europe (excl. EU-27)": {"additional_regions": ["Europe"], "excluded_regions": ["European Union (27)"]}, + "Europe (excl. EU-28)": { + "additional_regions": ["Europe"], + "excluded_regions": ["European Union (27)"], + "excluded_members": ["United Kingdom"], + }, + "European Union (28)": { + "additional_regions": ["European Union (27)"], + "additional_members": ["United Kingdom"], + }, + "North America (excl. USA)": { + "additional_regions": ["North America"], + "excluded_members": ["United States"], + }, +} + +# Columns to use from GCB fossil CO2 emissions data and how to rename them. +CO2_COLUMNS = { + "country": "country", + "year": "year", + "cement": "emissions_from_cement", + "coal": "emissions_from_coal", + "flaring": "emissions_from_flaring", + "gas": "emissions_from_gas", + "oil": "emissions_from_oil", + "other": "emissions_from_other_industry", + "total": "emissions_total", +} + +# List all sources of emissions considered. +EMISSION_SOURCES = [column for column in CO2_COLUMNS.values() if column not in ["country", "year"]] + +# Columns to use from primary energy consumption data and how to rename them. +PRIMARY_ENERGY_COLUMNS = { + "country": "country", + "year": "year", + "primary_energy_consumption__twh": "primary_energy_consumption", +} + +# Columns to use from historical emissions data and how to rename them. +HISTORICAL_EMISSIONS_COLUMNS = { + "country": "country", + "year": "year", + # Global fossil emissions are used only for sanity checks. + "global_fossil_emissions": "global_fossil_emissions", + "global_land_use_change_emissions": "global_emissions_from_land_use_change", +} + +# Columns to use from consumption-based emissions data and how to rename them. +CONSUMPTION_EMISSIONS_COLUMNS = { + "country": "country", + "year": "year", + "consumption_emissions": "consumption_emissions", +} + +# Conversion from terawatt-hours to kilowatt-hours. +TWH_TO_KWH = 1e9 + +# Conversion factor to change from billion tonnes of carbon to tonnes of CO2. +BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e9 + +# Conversion factor to change from million tonnes of carbon to tonnes of CO2. +MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e6 + +# Conversion from million tonnes of CO2 to tonnes of CO2. +MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 = 1e6 + +# Conversion from tonnes of CO2 to kg of CO2 (used for emissions per GDP and per unit energy). +TONNES_OF_CO2_TO_KG_OF_CO2 = 1000 + +# In order to remove uninformative columns, keep only rows where at least one of the following columns has data. +# All other columns are either derived variables, or global variables, or auxiliary variables from other datasets. 
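+# (Presumably applied downstream along the lines of
+#   tb = tb.dropna(subset=COLUMNS_THAT_MUST_HAVE_DATA, how="all")
+# i.e. a row is kept only if at least one of these columns is informed.)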
+COLUMNS_THAT_MUST_HAVE_DATA = [ + "emissions_from_cement", + "emissions_from_coal", + "emissions_from_flaring", + "emissions_from_gas", + "emissions_from_oil", + "emissions_from_other_industry", + "emissions_total", + "consumption_emissions", + "emissions_from_land_use_change", +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read all its tables. + ds_meadow = paths.load_dataset("global_carbon_budget") + tb_co2 = ds_meadow.read("global_carbon_budget_fossil_co2_emissions", safe_types=False) + tb_historical = ds_meadow.read("global_carbon_budget_historical_budget", safe_types=False) + tb_consumption = ds_meadow.read("global_carbon_budget_consumption_emissions", safe_types=False) + tb_production = ds_meadow.read("global_carbon_budget_production_emissions", safe_types=False) + tb_land_use = ds_meadow.read("global_carbon_budget_land_use_change", safe_types=False) + + # Load primary energy consumption dataset and read its main table. + ds_energy = paths.load_dataset("primary_energy_consumption") + tb_energy = ds_energy["primary_energy_consumption"].reset_index() + + # Load GDP dataset. + ds_gdp = paths.load_dataset("maddison_project_database") + + # Load population dataset. + ds_population = paths.load_dataset("population") + + # Load regions dataset. + ds_regions = paths.load_dataset("regions") + + # Load income groups dataset. + ds_income_groups = paths.load_dataset("income_groups") + + # + # Process data. + # + # Prepare fossil CO2 emissions data. + tb_co2 = prepare_fossil_co2_emissions(tb_co2=tb_co2) + + # Prepare consumption-based emission data. + tb_consumption = prepare_consumption_emissions(tb_consumption=tb_consumption) + + # Prepare production-based emission data. + tb_production = prepare_production_emissions(tb_production=tb_production) + + # Prepare land-use emission data. + tb_land_use = prepare_land_use_emissions(tb_land_use=tb_land_use) + + # Select and rename columns from primary energy data. + tb_energy = tb_energy[list(PRIMARY_ENERGY_COLUMNS)].rename(columns=PRIMARY_ENERGY_COLUMNS, errors="raise") + + # Prepare historical emissions data. + tb_historical = prepare_historical_emissions(tb_historical=tb_historical) + + # Run sanity checks on input data. + sanity_checks_on_input_data( + tb_production=tb_production, tb_consumption=tb_consumption, tb_historical=tb_historical, tb_co2=tb_co2 + ) + + # Extract global emissions, including bunker and land-use change emissions. + tb_global_emissions = extract_global_emissions( + tb_co2=tb_co2, tb_historical=tb_historical, ds_population=ds_population + ) + + # Harmonize country names. + tb_co2 = harmonize_country_names(tb=tb_co2) + tb_consumption = harmonize_country_names(tb=tb_consumption) + tb_production = harmonize_country_names(tb=tb_production) + tb_land_use = harmonize_country_names(tb=tb_land_use) + + # Fix duplicated rows for Palau. + tb_co2 = fix_duplicated_palau_data(tb_co2=tb_co2) + + # Add new variables to main table (consumption-based emissions, emission intensity, per-capita emissions, etc.). 
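+    # (For instance, per-capita emissions are, schematically, emissions_total / population, and emissions per GDP
+    # are emissions_total * TONNES_OF_CO2_TO_KG_OF_CO2 / gdp.)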
+    tb_combined = combine_data_and_add_variables(
+        tb_co2=tb_co2,
+        tb_production=tb_production,
+        tb_consumption=tb_consumption,
+        tb_global_emissions=tb_global_emissions,
+        tb_land_use=tb_land_use,
+        tb_energy=tb_energy,
+        ds_gdp=ds_gdp,
+        ds_population=ds_population,
+        ds_regions=ds_regions,
+        ds_income_groups=ds_income_groups,
+    )
+
+    ####################################################################################################################
+    # The data for emissions from other industry is quite sparse.
+    # This causes the share of emissions to have spurious jumps (because in some years only a few countries are
+    # informed); these jumps are easy to see for China and the US. From 1990 onwards, more countries are informed, and
+    # the data is therefore more reliable. So we set the share of emissions from other industry to None for years
+    # before 1990.
+    tb_combined.loc[(tb_combined["year"] < 1990), "emissions_from_other_industry_as_share_of_global"] = None
+    tb_combined.loc[(tb_combined["year"] < 1990), "cumulative_emissions_from_other_industry_as_share_of_global"] = None
+    ####################################################################################################################
+
+    # Set an appropriate index, ensure there are no rows that only have nan, and sort conveniently.
+    tb_combined = tb_combined.format(sort_columns=True, short_name=paths.short_name)
+
+    # Run sanity checks on output data.
+    sanity_checks_on_output_data(tb_combined)
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset and use metadata from meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir=dest_dir, tables=[tb_combined], default_metadata=ds_meadow.metadata, check_variables_metadata=True
+    )
+    ds_garden.save()
+
+
+def sanity_checks_on_input_data(
+    tb_production: Table, tb_consumption: Table, tb_historical: Table, tb_co2: Table
+) -> None:
+    """Run sanity checks on input data files.
+
+    These checks should be used prior to country harmonization, but after basic processing of the tables.
+
+    Parameters
+    ----------
+    tb_production : Table
+        Production-based emissions from GCP's official national emissions dataset (excel file).
+    tb_consumption : Table
+        Consumption-based emissions from GCP's official national emissions dataset (excel file).
+    tb_historical : Table
+        Historical emissions from GCP's official global emissions dataset (excel file).
+    tb_co2 : Table
+        Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file).
+
+    """
+    tb_production = tb_production.copy()
+    tb_consumption = tb_consumption.copy()
+    tb_historical = tb_historical.copy()
+    tb_co2 = tb_co2.copy()
+
+    # In the original data, Bunkers was included in the national data file as if it were another country.
+    # However, it should arguably be treated as another kind of global emission.
+    # In fact, bunker emissions should coincide for production and consumption emissions.
+    global_bunkers_emissions = (
+        tb_production[tb_production["country"] == "Bunkers"][["year", "production_emissions"]]
+        .reset_index(drop=True)
+        .rename(columns={"production_emissions": "global_bunker_emissions"}, errors="raise")
+    )
+
+    # Check that we get exactly the same array of bunker emissions from the consumption emissions table
+    # (on years where there is data for bunker emissions in both datasets).
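+    # (Schematically: for every common year, the "Bunkers" series from the production table must equal, exactly,
+    # the "Bunkers" series from the consumption table.)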
+    comparison = pr.merge(
+        global_bunkers_emissions,
+        tb_consumption[tb_consumption["country"] == "Bunkers"][["year", "consumption_emissions"]]
+        .reset_index(drop=True)
+        .rename(columns={"consumption_emissions": "global_bunker_emissions"}, errors="raise"),
+        how="inner",
+        on="year",
+        suffixes=("", "_check"),
+    )
+
+    error = "Bunker emissions were expected to coincide in production and consumption emissions tables."
+    assert (comparison["global_bunker_emissions"] == comparison["global_bunker_emissions_check"]).all(), error
+
+    # Check that all production-based emissions are positive.
+    error = "There are negative emissions in tb_production (from the additional variables dataset)."
+    assert (tb_production.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error
+
+    # Check that all production-based emissions from the fossil CO2 dataset are positive.
+    error = "There are negative emissions in tb_co2 (from the fossil CO2 dataset)."
+    assert (tb_co2.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error
+
+    # Check that all consumption-based emissions are positive.
+    error = "There are negative emissions in tb_consumption (from the national emissions dataset)."
+    assert (tb_consumption.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error
+
+    # Check that, for the World, production emissions coincide with consumption emissions (on common years).
+    error = "Production and consumption emissions for the world were expected to be identical."
+    comparison = pr.merge(
+        tb_production[tb_production["country"] == "World"].reset_index(drop=True),
+        tb_consumption[tb_consumption["country"] == "World"].reset_index(drop=True),
+        how="inner",
+        on="year",
+    )
+    assert (comparison["production_emissions"] == comparison["consumption_emissions"]).all(), error
+
+    # Check that production emissions for the World coincide with global (historical) emissions (on common years).
+    comparison = pr.merge(
+        tb_production[tb_production["country"] == "World"][["year", "production_emissions"]].reset_index(drop=True),
+        tb_historical[["year", "global_fossil_emissions"]],
+        how="inner",
+        on="year",
+    )
+    # NOTE: This tolerance used to be 0.1%, but it had to be increased to 5% for the check to pass with the latest
+    # data release.
+    error = "Production emissions for the world were expected to coincide with global fossil emissions (within 5%)."
+    assert (
+        100
+        * abs(comparison["production_emissions"] - comparison["global_fossil_emissions"])
+        / (comparison["global_fossil_emissions"])
+        < 5
+    ).all(), error
+
+    # In the Fossil CO2 file, international transport emissions have been separated into aviation and shipping.
+    # Emissions are also separated by fuel.
+    # We'll add them to the global emissions.
+    global_aviation_and_shipping = (
+        tb_co2[tb_co2["country"].isin(["International Aviation", "International Shipping"])]
+        .dropna()
+        .pivot(index="year", columns="country", values="emissions_total")
+        .reset_index()
+    )
+    global_aviation_and_shipping["global_aviation_and_shipping"] = (
+        global_aviation_and_shipping["International Aviation"] + global_aviation_and_shipping["International Shipping"]
+    )
+    comparison = (
+        tb_production[tb_production["country"] == "Bunkers"]
+        .reset_index(drop=True)
+        .rename(columns={"production_emissions": "global_bunker_emissions"})
+        .merge(
+            global_aviation_and_shipping[["year", "global_aviation_and_shipping"]],
+            how="outer",
+            on="year",
+        )
+        .sort_values("year")
+        .reset_index(drop=True)
+    )
+    # Keep only rows where both time series are informed.
+    comparison = comparison.dropna(
+        subset=["global_bunker_emissions", "global_aviation_and_shipping"], how="any"
+    ).reset_index(drop=True)
+    # NOTE: This tolerance used to be 0.0001%, but it had to be increased to 2% for the check to pass.
+    error = (
+        "Bunker emissions from national emissions file should coincide (within 2%) with the sum of aviation"
+        " and shipping emissions from the Fossil CO2 file."
+    )
+    assert (
+        100
+        * abs(comparison["global_bunker_emissions"] - comparison["global_aviation_and_shipping"])
+        / (comparison["global_bunker_emissions"])
+        < 2
+    ).all(), error
+
+    # Now check that all other emissions (that are not from bunker fuels) in tb_production (emissions from the national
+    # excel file) coincide with emissions in tb_co2 (from the Fossil CO2 emissions csv file).
+    # Since country names have not yet been harmonized, rename the only countries that are present in both datasets.
+    comparison = pr.merge(
+        tb_co2[["country", "year", "emissions_total"]],
+        tb_production[tb_production["country"] != "Bunkers"].astype({"country": str}).replace({"World": "Global"}),
+        on=["country", "year"],
+        how="inner",
+    ).dropna(subset=["emissions_total", "production_emissions"], how="any")
+    # Since we included the emissions from the Kuwaiti oil fires in Kuwait (and they are not included in tb_production),
+    # omit that row in the comparison.
+    comparison = comparison.drop(
+        comparison[(comparison["country"] == "Kuwait") & (comparison["year"] == 1991)].index
+    ).reset_index(drop=True)
+    # Check that production emissions from national file coincide with the Fossil CO2 emissions dataset.
+    # Assert that the difference is smaller than 1%.
+    # TODO: There are big discrepancies between the national file and the Fossil CO2 file. For example, in 1942 and
+    #  1943, various countries have exactly zero territorial emissions (in the national file) but non-zero emissions
+    #  in the Fossil CO2 file. Figure out what is going on, and bring this check back.
+    # error = "Production emissions from national file were expected to coincide with the Fossil CO2 emissions dataset."
+    # assert (
+    #     (
+    #         100
+    #         * abs(comparison["production_emissions"] - comparison["emissions_total"])
+    #         / (comparison["emissions_total"])
+    #     ).fillna(0)
+    #     < 1
+    # ).all(), error
+
+
+def sanity_checks_on_output_data(tb_combined: Table) -> None:
+    """Run sanity checks on output data.
+
+    These checks should be run on the very final output table (with an index) prior to storing it as a table.
+
+    Parameters
+    ----------
+    tb_combined : Table
+        Combination of all input tables, after processing, harmonization, and addition of variables.
+
+    """
+    tb_combined = tb_combined.reset_index()
+    error = "All variables (except traded emissions, growth, and land-use change) should be >= 0 or nan."
+    positive_variables = [
+        col
+        for col in tb_combined.columns
+        if col != "country"
+        if "traded" not in col
+        if "growth" not in col
+        if "land_use" not in col
+    ]
+    assert (tb_combined[positive_variables].fillna(0) >= 0).all().all(), error
+
+    error = "Production emissions as a share of global emissions should be 100% for 'World' (within 2% error)."
+    assert tb_combined[
+        (tb_combined["country"] == "World") & (abs(tb_combined["emissions_total_as_share_of_global"] - 100) > 2)
+    ].empty, error
+
+    error = "Consumption emissions as a share of global emissions should be 100% for 'World' (within 2% error)."
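+    # (That is, abs(share - 100) must not exceed 2 percentage points in any year.)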
+    assert tb_combined[
+        (tb_combined["country"] == "World") & (abs(tb_combined["consumption_emissions_as_share_of_global"] - 100) > 2)
+    ].empty, error
+
+    error = "Population as a share of global population should be 100% for 'World'."
+    assert tb_combined[
+        (tb_combined["country"] == "World") & (tb_combined["population_as_share_of_global"].fillna(100) != 100)
+    ].empty, error
+
+    error = "All share of global emissions should be smaller than 100% (within 1% error)."
+    share_variables = [
+        col
+        for col in tb_combined.columns
+        if "share" in col
+        if col != "emissions_from_land_use_change_as_share_of_global"
+    ]
+    assert (tb_combined[share_variables].fillna(0) <= 101).all().all(), error
+    # NOTE: "emissions_from_land_use_change_as_share_of_global" from Upper-middle-income countries in 1982, 1984 and 1986 is >101%.
+    # This is, in principle, possible (since land use change emissions can be negative), but it would be good to look into it.
+    # For now, check this variable separately (and ensure it's smaller than, say, 109%).
+    error = "Share of global emissions from land-use change should be smaller than 109%."
+    assert (tb_combined["emissions_from_land_use_change_as_share_of_global"].fillna(0) <= 109).all(), error
+
+    # Check that cumulative variables are monotonically increasing.
+    # First, list columns of cumulative variables, but ignore cumulative columns given as a share of global
+    # (since they are not necessarily monotonic) and land-use change (which can be negative).
+    cumulative_cols = [
+        col for col in tb_combined.columns if "cumulative" in col if "share" not in col if "land_use" not in col
+    ]
+    # Using ".is_monotonic_increasing" can fail when differences between consecutive numbers are very small.
+    # Instead, sort data backwards in time, and check that consecutive values of cumulative variables always have
+    # a percentage change that is smaller than, say, 0.1%.
+    error = (
+        "Cumulative variables (not given as a share of global) should be monotonically increasing (except when "
+        "including land-use change emissions, which can be negative)."
+    )
+    assert (
+        tb_combined.sort_values("year", ascending=False)
+        .groupby("country")
+        .agg({col: lambda x: ((x.pct_change(fill_method=None).dropna() * 100) <= 0.1).all() for col in cumulative_cols})
+        .all()
+        .all()
+    ), error
+
+    error = (
+        "Production emissions as a share of global production emissions for the World should always be 100% "
+        "(or larger than 98%, given small discrepancies)."
+    )
+    # Consumption emissions as a share of global production emissions is allowed to be smaller than 100%.
+    share_variables = [col for col in tb_combined.columns if "share" in col if "consumption" not in col]
+    assert (tb_combined[tb_combined["country"] == "World"][share_variables].fillna(100) > 98).all().all(), error
+
+    error = "Traded emissions for the World should be close to zero (within 2% error)."
+    world_mask = tb_combined["country"] == "World"
+    assert (
+        abs(
+            100
+            * tb_combined[world_mask]["traded_emissions"].fillna(0)
+            / tb_combined[world_mask]["emissions_total"].fillna(1)
+        )
+        < 2
+    ).all(), error
+
+
+def prepare_fossil_co2_emissions(tb_co2: Table) -> Table:
+    """Prepare Fossil CO2 emissions data (basic processing)."""
+    # Select and rename columns from fossil CO2 data.
+    tb_co2 = tb_co2[list(CO2_COLUMNS)].rename(columns=CO2_COLUMNS, errors="raise")
+
+    # Ensure all emissions are given in tonnes of CO2.
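+    # (The source values are in million tonnes of CO2; e.g. a value of 10 becomes 10 * 1e6 = 1e7 tonnes.)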
+ tb_co2[EMISSION_SOURCES] *= MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 + + #################################################################################################################### + # For certain years, column "emissions_from_other_industry" is not informed for "World" but it is informed + # for some countries (namely China and US). + # Note that this is not necessarily an issue in the original data: The data provider may have decided that it is + # better to leave the world uninformed where not enough countries are informed. + # However, "emissions_total" for the World seems to include those contributions from China and the US. + # This can be easily checked in the original data by selecting the year 1989 (last year for which there is data for + # China and US, but not for the World). The sum of emissions from all sources (namely coal, oil, gas, cement, and + # flaring, given that "other" is empty) does not add up to "emissions_total". But, if one includes the other + # emissions from China and US, then it does add up. + # This inconsistency causes the cumulative emissions from other industry for China and US to be larger than the + # global cumulative emissions. And the share of global emissions for those countries becomes hence larger than 100%. + # To fix this issue, we aggregate the data for China and US on those years when the world's data is missing (without + # touching other years or other columns), and add that data to the global emissions from other industry. + # NOTE: This issue has been reported to the data providers, and will hopefully be fixed in a coming version. + + # Firstly, list of years for which the world has no data for emissions_from_other_industry. + world_missing_years = ( + tb_co2[(tb_co2["country"] == "Global") & (tb_co2["emissions_from_other_industry"].isnull())]["year"] + .unique() + .tolist() # type: ignore + ) + # Data that needs to be aggregated. + data_missing_in_world = tb_co2[ + tb_co2["year"].isin(world_missing_years) & (tb_co2["emissions_from_other_industry"].notnull()) + ] + # Check that there is indeed data to be aggregated (that is missing for the World). + error = ( + "Expected emissions_from_other_industry to be null for the world but not null for certain countries " + "(which was an issue in the original fossil CO2 data). The issue may be fixed and the code can be simplified." + ) + assert len(data_missing_in_world) > 0, error + # Create a table of aggregate data for the World, on those years when it's missing. + aggregated_missing_data = ( + data_missing_in_world.groupby("year") + .agg({"emissions_from_other_industry": "sum"}) + .reset_index() + .assign(**{"country": "Global"}) + ) + # Combine the new table of aggregate data with the main table. + tb_co2 = dataframes.combine_two_overlapping_dataframes( + df1=tb_co2, df2=aggregated_missing_data, index_columns=["country", "year"], keep_column_order=True + ) + # NOTE: The previous function currently does not properly propagate metadata, but keeps only the sources of the + # first table. But given that both tables combined have the same source, we don't need to manually change it. + #################################################################################################################### + + # We add the emissions from "Kuwaiti Oil Fires" (which is also included as a separate country) as part of the + # emissions of Kuwait. This ensures that they will be included in region aggregates. + error = "'Kuwaiti Oil Fires' was expected to only have not-null data for 1991." 
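+    # (The assert below checks that 1991 is the only year with non-null, nonzero total emissions for the
+    # "Kuwaiti Oil Fires" entity.)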
+    assert tb_co2[
+        (tb_co2["country"] == "Kuwaiti Oil Fires")
+        & (tb_co2["emissions_total"].notnull())
+        & (tb_co2["emissions_total"] != 0)
+    ]["year"].tolist() == [1991], error
+
+    tb_co2.loc[(tb_co2["country"] == "Kuwait") & (tb_co2["year"] == 1991), EMISSION_SOURCES] = (
+        tb_co2[(tb_co2["country"] == "Kuwaiti Oil Fires") & (tb_co2["year"] == 1991)][EMISSION_SOURCES].values
+        + tb_co2[(tb_co2["country"] == "Kuwait") & (tb_co2["year"] == 1991)][EMISSION_SOURCES].values
+    )
+
+    # Check that "emissions_total" agrees with the sum of emissions from individual sources.
+    error = "The sum of all emissions should add up to total emissions (within 1%)."
+    assert (
+        abs(
+            tb_co2.drop(columns=["country", "year", "emissions_total"]).sum(axis=1)
+            - tb_co2["emissions_total"].fillna(0)
+        )
+        / (tb_co2["emissions_total"].fillna(0) + 1e-7)
+        < 1e-2
+    ).all(), error
+
+    # Many rows have zero total emissions even though all their individual sources are nan.
+    # Total emissions in those cases should be nan instead of zero.
+    no_individual_emissions = tb_co2.drop(columns=["country", "year", "emissions_total"]).isnull().all(axis=1)
+    tb_co2.loc[no_individual_emissions, "emissions_total"] = np.nan
+
+    return tb_co2
+
+
+def prepare_consumption_emissions(tb_consumption: Table) -> Table:
+    """Prepare consumption-based emissions data (basic processing)."""
+    # Select and rename columns.
+    tb_consumption = tb_consumption[list(CONSUMPTION_EMISSIONS_COLUMNS)].rename(
+        columns=CONSUMPTION_EMISSIONS_COLUMNS, errors="raise"
+    )
+
+    # Convert units from megatonnes of carbon per year emissions to tonnes of CO2 per year.
+    for column in tb_consumption.drop(columns=["country", "year"]).columns:
+        tb_consumption[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2
+
+    # List the indexes of rows in tb_consumption corresponding to outliers (defined above in OUTLIERS_IN_CONSUMPTION_DF).
+    outlier_indexes = [
+        tb_consumption[(tb_consumption["country"] == outlier[0]) & (tb_consumption["year"] == outlier[1])].index.item()
+        for outlier in OUTLIERS_IN_CONSUMPTION_DF
+    ]
+
+    error = (
+        "Outliers were expected to have negative consumption emissions. "
+        "Maybe outliers have been fixed (and should be removed from the code)."
+    )
+    assert (tb_consumption.loc[outlier_indexes]["consumption_emissions"] < 0).all(), error
+
+    # Remove outliers.
+    tb_consumption = tb_consumption.drop(outlier_indexes).reset_index(drop=True)
+
+    return tb_consumption
+
+
+def prepare_production_emissions(tb_production: Table) -> Table:
+    """Prepare production-based emissions data (basic processing)."""
+    # Convert units from megatonnes of carbon per year emissions to tonnes of CO2 per year.
+    for column in tb_production.drop(columns=["country", "year"]).columns:
+        tb_production[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2
+
+    return tb_production
+
+
+def prepare_land_use_emissions(tb_land_use: Table) -> Table:
+    """Prepare land-use change emissions data (basic processing)."""
+    # Convert units from megatonnes of carbon per year emissions to tonnes of CO2 per year.
+    tb_land_use["emissions"] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2
+
+    # There are two additional regions in the land-use change file, namely Global and EU27.
+    # It makes sense to extract national land-use change contributions from one of the sheets of that file (we currently
+    # do so from the "BLUE" sheet), since there are no other national land-use change emissions in other files.
+ # But for global emissions, it makes more sense to take the ones estimated by GCP, which are given in the + # "Historical Budget" sheet of the global emissions file. + # So, remove the data for "Global". + # We also remove EU27 data, as explained above, since we aggregate that data ourselves. + tb_land_use = tb_land_use[~tb_land_use["country"].isin(["Global", "EU27"])].reset_index(drop=True) + + return tb_land_use + + +def prepare_historical_emissions(tb_historical: Table) -> Table: + """Prepare historical emissions data.""" + # Select and rename columns from historical emissions data. + tb_historical = tb_historical[list(HISTORICAL_EMISSIONS_COLUMNS)].rename( + columns=HISTORICAL_EMISSIONS_COLUMNS, errors="raise" + ) + + # Convert units from gigatonnes of carbon per year emissions to tonnes of CO2 per year. + for column in tb_historical.drop(columns=["country", "year"]).columns: + tb_historical[column] *= BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 + + return tb_historical + + +def extract_global_emissions(tb_co2: Table, tb_historical: Table, ds_population: Dataset) -> Table: + """Extract World emissions by combining data from the Fossil CO2 emissions and the global emissions dataset. + + The resulting global emissions data includes bunker and land-use change emissions. + + NOTE: This function has to be used after selecting and renaming columns in tb_co2, but before harmonizing country + names in tb_co2 (so that "International Aviation" and "International Shipping" are still listed as countries). + + Parameters + ---------- + tb_co2 : Table + Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). + tb_historical : Table + Historical emissions from GCP's official global emissions dataset (excel file). + ds_population : Dataset + Population dataset. + + Returns + ------- + global_emissions : Table + World emissions. + + """ + # "International Aviation" and "International Shipping" are now included as separate countries. + # Combine their emissions into one variable. + global_aviation = ( + tb_co2[tb_co2["country"] == "International Aviation"].set_index(["year"]).drop(columns=["country"]) + ) + global_shipping = ( + tb_co2[tb_co2["country"] == "International Shipping"].set_index(["year"]).drop(columns=["country"]) + ) + global_transport = global_aviation + global_shipping + + # Check that total emissions for international aviation coincide with oil emissions. + # NOTE: International shipping does include emissions from gas, coal and oil. + error = "Total emissions from international aviation do not coincide with oil emissions." + assert all((global_aviation["emissions_from_oil"] - global_aviation["emissions_total"]).dropna() == 0), error + + # Keep only total emissions from international transport. + global_transport = ( + global_transport[["emissions_total"]] + .rename(columns={"emissions_total": "global_emissions_from_international_transport"}, errors="raise") + .dropna() + .reset_index() + ) + + # Create a new table of global emissions. + global_emissions = ( + tb_co2[tb_co2["country"].isin(["Global", "World"])][["year"] + EMISSION_SOURCES] + .rename(columns={column: f"global_{column}" for column in EMISSION_SOURCES}, errors="raise") + .sort_values("year") + .reset_index(drop=True) + ) + + # Add bunker fuels to global emissions. + global_emissions = pr.merge(global_emissions, global_transport, on=["year"], how="outer") + + # Add historical land-use change emissions to table of global emissions. 
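+    # (This is a left join on "year": every year already present in the global emissions table is kept, and
+    # land-use change values are added only for the years covered by the historical series.)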
+    global_emissions = pr.merge(
+        global_emissions, tb_historical[["year", "global_emissions_from_land_use_change"]], how="left", on="year"
+    )
+
+    # Add variable of total emissions including fossil fuels and land use change.
+    global_emissions["global_emissions_total_including_land_use_change"] = (
+        global_emissions["global_emissions_total"] + global_emissions["global_emissions_from_land_use_change"]
+    )
+
+    # Calculate global cumulative emissions.
+    for column in EMISSION_SOURCES + ["emissions_from_land_use_change", "emissions_total_including_land_use_change"]:
+        global_emissions[f"global_cumulative_{column}"] = global_emissions[f"global_{column}"].cumsum()
+
+    # Add a country column.
+    global_emissions["country"] = "World"
+
+    # Add global population.
+    global_emissions = geo.add_population_to_table(
+        tb=global_emissions, ds_population=ds_population, population_col="global_population"
+    )
+
+    return global_emissions
+
+
+def harmonize_country_names(tb: Table) -> Table:
+    """Harmonize country names, and fix known issues with certain regions.
+
+    Parameters
+    ----------
+    tb : Table
+        Emissions data (either from the fossil CO2, the production-based, consumption-based, or land-use emissions
+        datasets).
+
+    Returns
+    -------
+    tb : Table
+        Emissions data after harmonizing country names.
+
+    """
+    # Harmonize country names.
+    tb = geo.harmonize_countries(
+        df=tb,
+        countries_file=paths.country_mapping_path,
+        excluded_countries_file=paths.excluded_countries_path,
+        warn_on_missing_countries=True,
+        warn_on_unused_countries=False,
+        make_missing_countries_nan=False,
+        warn_on_unknown_excluded_countries=False,
+    )
+
+    return tb
+
+
+def fix_duplicated_palau_data(tb_co2: Table) -> Table:
+    tb = tb_co2.copy()
+    # Check that there is only one data point for each country-year.
+    # In the fossil CO2 emissions data, after harmonization, "Pacific Islands (Palau)" is mapped to "Palau", and
+    # therefore there are rows with different data for the same country-year.
+    # However, "Pacific Islands (Palau)" has data until 1991, and "Palau" has data from 1992 onwards.
+    # NOTE: this is not an issue with the original data, and it's simply caused by our harmonization of names.
+
+    # Check that duplicate rows are still there.
+    error = "Expected 'Palau' data to be duplicated. Remove temporary fix."
+    assert tb[tb.duplicated(subset=["country", "year"])]["country"].unique().tolist() == ["Palau"], error
+
+    # Select rows corresponding to "Palau" prior to 1992, and to "Pacific Islands (Palau)" from 1992 onwards.
+    indexes_to_drop = (
+        tb[
+            (tb["country"] == "Palau") & (tb["year"] < 1992) & (tb.duplicated(subset=["country", "year"], keep="first"))
+        ].index.tolist()
+        + tb[
+            (tb["country"] == "Palau") & (tb["year"] >= 1992) & (tb.duplicated(subset=["country", "year"], keep="last"))
+        ].index.tolist()
+    )
+    # Check that the selected rows do not overlap.
+    assert len(indexes_to_drop) == len(set(indexes_to_drop))
+    # Remove those rows.
+    tb = tb.drop(indexes_to_drop).reset_index(drop=True)
+    # NOTE: Do not drop empty rows yet, as they will be needed to have a complete population series.
+
+    return tb
+
+
+def fix_consumption_emissions_for_africa(tb_co2_with_regions: Table) -> Table:
+    # The calculated consumption emissions for Africa differ significantly from those in the GCP dataset.
+    # GCP's estimate is significantly larger. The reason may be that many African countries do not have data on
+    # consumption emissions, so the aggregate may be underestimated. Maybe GCP has a different way to estimate Africa's
+    # consumption emissions.
+    # We therefore replace our values for Africa (calculated by summing consumption emissions from African countries)
+    # with those from GCP.
+    # We keep both our aggregates and GCP's versions of continents because our region definitions may differ.
+    # However, it is unlikely that GCP's definition of the African continent differs from ours.
+    # NOTE: This issue has been reported to the data providers, and will hopefully be fixed in a coming version.
+
+    # First, check that the discrepancy exists in the current data.
+    tb = tb_co2_with_regions.copy()
+    consumption_emissions_africa = tb[(tb["country"] == "Africa") & (tb["year"] == 2020)][
+        "consumption_emissions"
+    ].item()
+    consumption_emissions_africa_gcp = tb[(tb["country"] == "Africa (GCP)") & (tb["year"] == 2020)][
+        "consumption_emissions"
+    ].item()
+    error = (
+        "Discrepancy in consumption emissions between aggregated Africa and Africa (GCP) no longer exists. "
+        "Remove temporary fix."
+    )
+    assert (
+        consumption_emissions_africa_gcp - consumption_emissions_africa
+    ) / consumption_emissions_africa_gcp > 0.23, error
+
+    # Replace consumption emissions for "Africa" with those of "Africa (GCP)".
+    consumption_emissions = tb[tb["country"] != "Africa"][["country", "year", "consumption_emissions"]].reset_index(
+        drop=True
+    )
+    consumption_emissions_for_africa = (
+        consumption_emissions[consumption_emissions["country"] == "Africa (GCP)"]
+        .reset_index(drop=True)
+        .replace({"Africa (GCP)": "Africa"})
+    )
+    consumption_emissions = pr.concat([consumption_emissions, consumption_emissions_for_africa], ignore_index=True)
+    # Replace consumption emissions in the main table with the fixed series.
+    tb = tb.drop(columns="consumption_emissions").merge(consumption_emissions, on=["country", "year"], how="outer")
+
+    # Sanity checks.
+    # All columns except consumption_emissions should be identical to the original.
+    error = "Mismatch before and after fixing consumption emissions for Africa."
+    for col in tb.drop(columns=["consumption_emissions"]).columns:
+        assert (
+            tb[col].dropna().reset_index(drop=True) == tb_co2_with_regions[col].dropna().reset_index(drop=True)
+        ).all()
+    # Consumption emissions should be identical to the original except for Africa.
+    assert (
+        tb[tb["country"] != "Africa"]["consumption_emissions"].dropna().reset_index(drop=True)
+        == tb_co2_with_regions[tb_co2_with_regions["country"] != "Africa"]["consumption_emissions"]
+        .dropna()
+        .reset_index(drop=True)
+    ).all()
+
+    return tb
+
+
+def combine_data_and_add_variables(
+    tb_co2: Table,
+    tb_production: Table,
+    tb_consumption: Table,
+    tb_global_emissions: Table,
+    tb_land_use: Table,
+    tb_energy: Table,
+    ds_gdp: Dataset,
+    ds_population: Dataset,
+    ds_regions: Dataset,
+    ds_income_groups: Dataset,
+) -> Table:
+    """Combine all relevant data into one table, add region aggregates, and add custom variables (e.g. emissions per
+    capita).
+
+    Parameters
+    ----------
+    tb_co2 : Table
+        Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file), after harmonization.
+    tb_production : Table
+        Production-based emissions from GCP's official national emissions dataset (excel file), after harmonization.
+    tb_consumption : Table
+        Consumption-based emissions from GCP's official national emissions dataset (excel file), after harmonization.
+    tb_global_emissions : Table
+        World emissions (including bunker and land-use change emissions).
+    tb_land_use : Table
+        National land-use change emissions from GCP's official dataset (excel file), after harmonization.
+    tb_energy : Table
+        Primary energy data.
+    ds_gdp : Dataset
+        GDP dataset.
+    ds_population : Dataset
+        Population dataset.
+    ds_regions : Dataset
+        Regions dataset.
+    ds_income_groups : Dataset
+        Income groups dataset.
+
+    Returns
+    -------
+    tb_co2_with_regions : Table
+        Combined data, with all additional variables and with region aggregates.
+
+    """
+    tb_co2_with_regions = tb_co2.copy()
+
+    # Add region aggregates that were included in the national emissions file, but not in the Fossil CO2 emissions file.
+    gcp_aggregates = sorted(set(tb_production["country"]) - set(tb_co2_with_regions["country"]))
+    # NOTE: Here, "International transport" is included. This causes total emissions to contain data for both
+    # international aviation and shipping and for international transport (which is the sum of the former two).
+    # The redundant international transport values are emptied later on, in the columns where this duplication occurs.
+    tb_co2_with_regions = pr.concat(
+        [
+            tb_co2_with_regions,
+            tb_production[tb_production["country"].isin(gcp_aggregates)]
+            .rename(columns={"production_emissions": "emissions_total"})
+            .astype({"year": int}),
+        ],
+        ignore_index=True,
+        short_name=paths.short_name,
+    ).reset_index(drop=True)
+
+    # Add consumption emissions to main table (keep only the countries of the main table).
+    # Given that additional GCP regions (e.g. "Africa (GCP)") have already been added to tb_co2
+    # (when merging with tb_production), all countries from tb_consumption should be included in tb_co2.
+    error = "Some countries in tb_consumption are not included in tb_co2."
+    assert set(tb_consumption["country"]) < set(tb_co2_with_regions["country"]), error
+    tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_consumption, on=["country", "year"], how="outer")
+
+    # Add population to main table.
+    tb_co2_with_regions = geo.add_population_to_table(
+        tb=tb_co2_with_regions, ds_population=ds_population, warn_on_missing_countries=False
+    )
+
+    # Add GDP to main table.
+    tb_co2_with_regions = geo.add_gdp_to_table(tb=tb_co2_with_regions, ds_gdp=ds_gdp)
+
+    # Add primary energy to main table.
+    tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_energy, on=["country", "year"], how="left")
+
+    # For convenience, rename columns in land-use change emissions data.
+    tb_land_use = tb_land_use.rename(columns={"emissions": "emissions_from_land_use_change"})
+
+    # Land-use change data does not include data for the World. Include it by merging with the global dataset.
+    tb_land_use = pr.concat(
+        [
+            tb_land_use,
+            tb_global_emissions.rename(
+                columns={"global_emissions_from_land_use_change": "emissions_from_land_use_change"}
+            )[["year", "emissions_from_land_use_change"]]
+            .dropna()
+            .assign(**{"country": "World"}),
+        ],
+        ignore_index=True,
+    ).astype({"year": int})
+
+    # Add land-use change emissions to main table.
+    tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_land_use, on=["country", "year"], how="outer")
+
+    # Add total emissions (including land-use change) for each country.
+    tb_co2_with_regions["emissions_total_including_land_use_change"] = (
+        tb_co2_with_regions["emissions_total"] + tb_co2_with_regions["emissions_from_land_use_change"]
+    )
+
+    # Add region aggregates.
+    # Aggregate not only emissions data, but also population, gdp and primary energy.
+    # This way we ensure that custom regions (e.g. "North America (excl. USA)") will have all required data.
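+    # (Every column other than "country" and "year" is summed, e.g. {"emissions_total": "sum",
+    # "population": "sum", ...}, so that per-capita and per-GDP indicators can later be derived
+    # consistently for regions.)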
+ aggregations = {column: "sum" for column in tb_co2_with_regions.columns if column not in ["country", "year"]} + for region in REGIONS: + countries_in_region = geo.list_members_of_region( + region=region, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + additional_regions=REGIONS[region].get("additional_regions", None), + excluded_regions=REGIONS[region].get("excluded_regions", None), + additional_members=REGIONS[region].get("additional_members", None), + excluded_members=REGIONS[region].get("excluded_members", None), + include_historical_regions_in_income_groups=True, + ) + tb_co2_with_regions = geo.add_region_aggregates( + df=tb_co2_with_regions, + region=region, + countries_in_region=countries_in_region, + countries_that_must_have_data=[], + frac_allowed_nans_per_year=0.999, + aggregations=aggregations, + ) + + # Fix consumption emissions for Africa. + tb_co2_with_regions = fix_consumption_emissions_for_africa(tb_co2_with_regions=tb_co2_with_regions) + + # Temporarily add global emissions and global cumulative emissions columns to main table, to be able to calculate + # indicators in terms of global emissions. + tb_co2_with_regions = pr.merge( + tb_co2_with_regions, tb_global_emissions.drop(columns="country"), on=["year"], how="left" + ) + + # Temporarily add certain global emissions variables. + # This is done simply to be able to consider "consumption_emissions" as just another type of emission + # when creating additional variables. + tb_co2_with_regions["global_consumption_emissions"] = tb_co2_with_regions["global_emissions_total"].copy() + tb_co2_with_regions["global_cumulative_consumption_emissions"] = tb_co2_with_regions[ + "global_cumulative_emissions_total" + ].copy() + + # Ensure main table is sorted (so that cumulative emissions are properly calculated). + tb_co2_with_regions = tb_co2_with_regions.sort_values(["country", "year"]).reset_index(drop=True) + + # Add new variables for each source of emissions. + for column in EMISSION_SOURCES + [ + "consumption_emissions", + "emissions_from_land_use_change", + "emissions_total_including_land_use_change", + ]: + # Add per-capita variables. + tb_co2_with_regions[f"{column}_per_capita"] = tb_co2_with_regions[column] / tb_co2_with_regions["population"] + + # Add columns for cumulative emissions. + # Rows that had nan emissions will have nan cumulative emissions. + # But nans will not be propagated in the sum. + # This means that countries with some (not all) nans will have the cumulative sum of the informed emissions + # (treating nans as zeros), but will have nan on those rows that were not informed. + tb_co2_with_regions[f"cumulative_{column}"] = tb_co2_with_regions.groupby(["country"])[column].cumsum() + + # Add share of global emissions. + tb_co2_with_regions[f"{column}_as_share_of_global"] = ( + 100 * tb_co2_with_regions[column] / tb_co2_with_regions[f"global_{column}"] + ) + + # Add share of global cumulative emissions. + tb_co2_with_regions[f"cumulative_{column}_as_share_of_global"] = ( + 100 * tb_co2_with_regions[f"cumulative_{column}"] / tb_co2_with_regions[f"global_cumulative_{column}"] + ) + + # Add total emissions per unit energy (in kg of emissions per kWh). + tb_co2_with_regions["emissions_total_per_unit_energy"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 + * tb_co2_with_regions["emissions_total"] + / (tb_co2_with_regions["primary_energy_consumption"] * TWH_TO_KWH) + ) + + # Add total emissions (including land-use change) per unit energy (in kg of emissions per kWh). 
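+    # (As in the previous indicator, tonnes are converted to kilograms (factor 1000) and TWh to kWh
+    # (factor 1e9), as the constant names indicate, so the resulting ratio is expressed in kg of CO2 per kWh.)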
+ tb_co2_with_regions["emissions_total_including_land_use_change_per_unit_energy"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 + * tb_co2_with_regions["emissions_total_including_land_use_change"] + / (tb_co2_with_regions["primary_energy_consumption"] * TWH_TO_KWH) + ) + + # Add total emissions per unit GDP. + tb_co2_with_regions["emissions_total_per_gdp"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 * tb_co2_with_regions["emissions_total"] / tb_co2_with_regions["gdp"] + ) + + # Add total emissions (including land-use change) per unit GDP. + tb_co2_with_regions["emissions_total_including_land_use_change_per_gdp"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 + * tb_co2_with_regions["emissions_total_including_land_use_change"] + / tb_co2_with_regions["gdp"] + ) + + # Add total consumption emissions per unit GDP. + tb_co2_with_regions["consumption_emissions_per_gdp"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 * tb_co2_with_regions["consumption_emissions"] / tb_co2_with_regions["gdp"] + ) + + # Add variable of emissions embedded in trade. + tb_co2_with_regions["traded_emissions"] = ( + tb_co2_with_regions["consumption_emissions"] - tb_co2_with_regions["emissions_total"] + ) + tb_co2_with_regions["pct_traded_emissions"] = ( + 100 * tb_co2_with_regions["traded_emissions"] / tb_co2_with_regions["emissions_total"] + ) + tb_co2_with_regions["traded_emissions_per_capita"] = ( + tb_co2_with_regions["traded_emissions"] / tb_co2_with_regions["population"] + ) + + # Add annual percentage growth of total emissions. + tb_co2_with_regions["pct_growth_emissions_total"] = ( + tb_co2_with_regions.groupby("country", observed=True)["emissions_total"].pct_change(fill_method=None) * 100 + ) + + # Add annual percentage growth of total emissions (including land-use change). + tb_co2_with_regions["pct_growth_emissions_total_including_land_use_change"] = ( + tb_co2_with_regions.groupby("country")["emissions_total_including_land_use_change"].pct_change(fill_method=None) + * 100 + ) + + # Add annual absolute growth of total emissions. + tb_co2_with_regions["growth_emissions_total"] = tb_co2_with_regions.groupby("country")["emissions_total"].diff() + + # Add annual absolute growth of total emissions (including land-use change). + tb_co2_with_regions["growth_emissions_total_including_land_use_change"] = tb_co2_with_regions.groupby("country")[ + "emissions_total_including_land_use_change" + ].diff() + + # Create variable of population as a share of global population. + tb_co2_with_regions["population_as_share_of_global"] = ( + tb_co2_with_regions["population"] / tb_co2_with_regions["global_population"] * 100 + ) + + # Remove temporary columns of global emissions. + tb_co2_with_regions = tb_co2_with_regions.drop( + columns=[column for column in tb_co2_with_regions.columns if column.startswith("global_")] + ) + + # Empty rows of international transport if international aviation and shipping are already informed. + # First find the list of columns where this happens. + international_entities = [entity for entity in set(tb_co2_with_regions["country"]) if "International" in entity] + check = tb_co2_with_regions[tb_co2_with_regions["country"].isin(international_entities)].reset_index(drop=True) + # Check that the only columns where international transport, aviation and shipping are all informed are columns + # derived from total emissions. 
+ columns_with_redundant_international_emissions = [ + column + for column in check.drop(columns=["country", "year"]).columns + if set(check.dropna(subset=column)["country"]) == set(international_entities) + ] + error = ( + "Unexpected columns where international transport is informed as well as international aviation and shipping." + ) + assert all(["emissions_total" in column for column in columns_with_redundant_international_emissions]), error + # Now for those columns, make international transport nan. + for column in columns_with_redundant_international_emissions: + tb_co2_with_regions.loc[tb_co2_with_regions["country"] == "International transport", column] = np.nan + + # Replace infinity values (for example when calculating growth from zero to non-zero) in the data by nan. + for column in tb_co2_with_regions.drop(columns=["country", "year"]).columns: + tb_co2_with_regions.loc[np.isinf(tb_co2_with_regions[column]), column] = np.nan + + # For special GCP countries/regions (e.g. "Europe (GCP)") we should keep only the original data. + # Therefore, make nan all additional variables for those countries/regions, and keep only GCP's original data. + added_variables = tb_co2_with_regions.drop( + columns=["country", "year"] + COLUMNS_THAT_MUST_HAVE_DATA + ).columns.tolist() + tb_co2_with_regions.loc[ + (tb_co2_with_regions["country"].str.contains(" (GCP)", regex=False)), added_variables + ] = np.nan + + # Remove uninformative rows (those that have only data for, say, gdp, but not for variables related to emissions). + tb_co2_with_regions = tb_co2_with_regions.dropna(subset=COLUMNS_THAT_MUST_HAVE_DATA, how="all").reset_index( + drop=True + ) + + # Ensure that there are no rows that only have nan values. + tb_co2_with_regions = tb_co2_with_regions.dropna( + subset=tb_co2_with_regions.drop(columns=["country", "year"]).columns, how="all" + ) + + return tb_co2_with_regions diff --git a/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.countries.json b/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.countries.json new file mode 100644 index 00000000000..ac727db78bf --- /dev/null +++ b/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.countries.json @@ -0,0 +1,283 @@ +{ + "Afghanistan": "Afghanistan", + "Africa": "Africa (GCP)", + "Albania": "Albania", + "Algeria": "Algeria", + "American Samoa": "American Samoa", + "Andorra": "Andorra", + "Angola": "Angola", + "Anguilla": "Anguilla", + "Antarctica": "Antarctica", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Asia": "Asia (GCP)", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bermuda": "Bermuda", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bolivia (Plurinational State of)": "Bolivia", + "Bonaire, Saint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", + "Bonaire, Sint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "British Virgin Islands": "British Virgin Islands", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Bunkers": "International transport", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", 
+ "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Central America": "Central America (GCP)", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Christmas Island": "Christmas Island", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Congo, Democratic Republic of the": "Democratic Republic of Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cura\u00e7ao": "Curacao", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "Czechia": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "EU27": "European Union (27) (GCP)", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Europe": "Europe (GCP)", + "Faeroe Islands": "Faroe Islands", + "Falkland Islands (Malvinas)": "Falkland Islands", + "Faroe Islands": "Faroe Islands", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "French Equatorial Africa": "French Equatorial Africa (GCP)", + "French Guiana": "French Guiana", + "French Polynesia": "French Polynesia", + "French West Africa": "French West Africa (GCP)", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Global": "World", + "Greece": "Greece", + "Greenland": "Greenland", + "Grenada": "Grenada", + "Guadeloupe": "Guadeloupe", + "Guatemala": "Guatemala", + "Guernsey": "Guernsey", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hong Kong": "Hong Kong", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "International Aviation": "International aviation", + "International Shipping": "International shipping", + "Iran": "Iran", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Isle of Man": "Isle of Man", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jersey": "Jersey", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Korea (Democratic People's Republic of)": "North Korea", + "Korea, Republic of": "South Korea", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Kuwaiti Oil Fires": "Kuwaiti Oil Fires (GCP)", + "Kyrgyzstan": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Laos": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Leeward Islands": "Leeward Islands (GCP)", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Macao": "Macao", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Martinique": "Martinique", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mayotte": "Mayotte", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", + "Middle East": "Middle East (GCP)", + "Moldova": "Moldova", + "Moldova, Republic of": "Moldova", + "Monaco": 
"Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Montserrat": "Montserrat", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "Netherlands Antilles": "Netherlands Antilles", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "Non-OECD": "Non-OECD (GCP)", + "North America": "North America (GCP)", + "North Korea": "North Korea", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "OECD": "OECD (GCP)", + "Occupied Palestinian Territory": "Palestine", + "Oceania": "Oceania (GCP)", + "Oman": "Oman", + "Pacific Islands (Palau)": "Palau", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Palestine, State of": "Palestine", + "Panama": "Panama", + "Panama Canal Zone": "Panama Canal Zone (GCP)", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Romania": "Romania", + "Russia": "Russia", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Ryukyu Islands": "Ryukyu Islands (GCP)", + "R\u00e9union": "Reunion", + "Saint Helena": "Saint Helena", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Martin (French part)": "Saint Martin (French part)", + "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South America": "South America (GCP)", + "South Korea": "South Korea", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "State of Palestine": "Palestine", + "St. Kitts-Nevis-Anguilla": "St. 
Kitts-Nevis-Anguilla (GCP)", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Svalbard and Jan Mayen": "Svalbard and Jan Mayen", + "Swaziland": "Eswatini", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syria": "Syria", + "Syrian Arab Republic": "Syria", + "Taiwan": "Taiwan", + "Taiwan, Province of China": "Taiwan", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Tanzania, United Republic of": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Türkiye": "Turkey", + "Turkmenistan": "Turkmenistan", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "USA": "United States", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vatican City": "Vatican", + "Vanuatu": "Vanuatu", + "Venezuela": "Venezuela", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "Viet Nam": "Vietnam", + "Virgin Islands (U.S.)": "United States Virgin Islands", + "Wallis and Futuna Islands": "Wallis and Futuna", + "Western Sahara": "Western Sahara", + "World": "World", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "\u00c5land Islands": "Aland Islands" +} diff --git a/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.excluded_countries.json b/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.excluded_countries.json new file mode 100644 index 00000000000..6ad8ec106f5 --- /dev/null +++ b/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.excluded_countries.json @@ -0,0 +1,7 @@ +[ + "KP Annex B", + "Non KP Annex B", + "DISPUTED", + "OTHER", + "EU27" +] diff --git a/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.meta.yml b/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.meta.yml new file mode 100644 index 00000000000..9d9470af087 --- /dev/null +++ b/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.meta.yml @@ -0,0 +1,514 @@ +definitions: + production_emissions_description_key: &production_emissions_description_key + - This data is based on territorial emissions, which do not account for emissions embedded in traded goods. + traded_emissions_description_key: &traded_emissions_description_key + - Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter. + international_aviation_description_key: &international_aviation_description_key + - Emissions from international aviation and shipping are not included in any country or region's emissions. They are only included in the global total emissions. + consumption_emissions_description_key: &consumption_emissions_description_key + - Consumption-based emissions attribute the emissions generated in the production of goods and services according to where they were _consumed_, rather than where they were _produced_. + - "The data is calculated by adjusting 'production-based' emissions (emissions produced domestically) for trade: Consumption-based emissions equals production-based emissions, _minus_ emissions embedded in exports, _plus_ emissions embedded in imports." 
+    - If a country's consumption-based emissions are higher than its production emissions, it is a net importer of carbon dioxide. If its consumption-based emissions are lower, then it is a net exporter.
+    - Consumption-based emissions are not available for all countries because not all countries have sufficient, high-quality trade data. But those without complete data are a small fraction (3%) of the global total.
+    - This data measures carbon dioxide (CO₂) emissions from fossil fuels and industry and does not include emissions from land use change, deforestation, soils, or vegetation.
+  per_capita_description_key: &per_capita_description_key
+    - Per capita emissions represent the emissions of an average person in a country or region - they are calculated as the total emissions divided by population.
+  # Common fields to be used in all indicators (unless overridden for specific indicators below).
+  common:
+    description_processing: &description_processing |
+      - Data on global emissions has been converted from tonnes of carbon to tonnes of carbon dioxide (CO₂) using a conversion factor of 3.664.
+      - Emissions from the Kuwaiti oil fires in 1991 have been included as part of Kuwait's emissions for that year.
+      - A country's share of the global population is calculated using our population dataset, based on [different sources](https://ourworldindata.org/population-sources).
+      - Each country's share of global CO₂ emissions from flaring has been calculated using global CO₂ emissions from flaring provided in the Global Carbon Budget dataset.
+    description_key:
+      # NOTE: The description key points are re-defined for each indicator on consumption-based emissions and traded emissions, as well as on per-capita indicators.
+      - *production_emissions_description_key
+      - *international_aviation_description_key
+    presentation:
+      topic_tags:
+        - CO2 & Greenhouse Gas Emissions
+      attribution_short: GCB
+    processing_level: major
+
+dataset:
+  title: Global Carbon Budget
+  update_period_days: 365
+
+tables:
+  global_carbon_budget:
+    variables:
+      consumption_emissions:
+        title: "Annual consumption-based CO₂ emissions"
+        unit: "tonnes"
+        short_unit: "t"
+        description_short: Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes.
+        description_key:
+          - *consumption_emissions_description_key
+          - *international_aviation_description_key
+      consumption_emissions_as_share_of_global:
+        title: "Share of global annual CO₂ consumption-based emissions"
+        unit: "%"
+        short_unit: "%"
+        description_short: "Annual consumption-based emissions of carbon dioxide (CO₂), measured as a percentage of global consumption-based emissions of CO₂ in the same year."
+        description_key:
+          - *consumption_emissions_description_key
+          - *international_aviation_description_key
+      ##################################################################################################################
+      # Curated indicator for data page.
+      consumption_emissions_per_capita:
+        title: Per capita consumption-based CO₂ emissions
+        description_short: |
+          Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes per person.
+ description_key: + - *consumption_emissions_description_key + - *per_capita_description_key + - *international_aviation_description_key + description_processing: *description_processing + unit: tonnes per person + short_unit: t/person + display: + shortUnit: t + numDecimalPlaces: 0 + presentation: + attribution_short: Global Carbon Project + topic_tags: + - CO2 & Greenhouse Gas Emissions + - Climate Change + - Energy + faqs: + - fragment_id: emissions-from-aviation-and-shipping + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: missing-consumption-based-emissions + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + subtitle: >- + [Consumption-based emissions](#dod:consumptionbasedemissions) are national + emissions that have been adjusted for trade. It's production-based emissions + minus emissions embedded in exports, plus emissions embedded in imports. + hideAnnotationFieldsInTitle: + time: true + entity: true + changeInPrefix: true + hideRelativeToggle: false + hasMapTab: true + tab: map + originUrl: https://ourworldindata.org/co2-and-greenhouse-gas-emissions + colorScale: + binningStrategy: equalInterval + map: + colorScale: + baseColorScheme: Reds + binningStrategy: manual + customNumericValues: + - 1 + - 2 + - 5 + - 10 + - 20 + - 50 + customNumericColors: + - null + - null + selectedEntityNames: + - United States + - United Kingdom + - European Union (27) + - China + - India + - Australia + - Brazil + - South Africa + relatedQuestions: + - url: https://ourworldindata.org/grapher/consumption-co2-per-capita#faqs + text: FAQs on this data + consumption_emissions_per_gdp: + title: "Annual consumption-based CO₂ emissions per GDP (kg per international-$)" + unit: "kilograms per international-$" + short_unit: "kg/$" + description_short: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in kilograms per dollar of GDP (2011 international-$)." + description_key: + - *consumption_emissions_description_key + - *international_aviation_description_key + cumulative_consumption_emissions: + title: "Cumulative CO₂ consumption-based emissions" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of available data, measured in tonnes." + description_key: + - *consumption_emissions_description_key + - *international_aviation_description_key + cumulative_consumption_emissions_as_share_of_global: + title: "Share of global cumulative CO₂ consumption-based emissions" + unit: "%" + short_unit: "%" + description_short: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of available data, measured as a percentage of global cumulative consumption-based emissions." + description_key: + - *consumption_emissions_description_key + - *international_aviation_description_key + cumulative_emissions_from_cement: + title: "Cumulative CO₂ emissions from cement" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from cement since the first year of available data, measured in tonnes." + cumulative_emissions_from_cement_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from cement" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from cement since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from cement." 
+ cumulative_emissions_from_coal: + title: "Cumulative CO₂ emissions from coal" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from coal since the first year of available data, measured in tonnes." + cumulative_emissions_from_coal_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from coal" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from coal since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from coal." + cumulative_emissions_from_flaring: + title: "Cumulative CO₂ emissions from flaring" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from flaring since the first year of available data, measured in tonnes." + cumulative_emissions_from_flaring_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from flaring" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from flaring since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from flaring." + cumulative_emissions_from_gas: + title: "Cumulative CO₂ emissions from gas" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from gas since the first year of available data, measured in tonnes." + cumulative_emissions_from_gas_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from gas" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from gas since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from gas." + cumulative_emissions_from_land_use_change: + title: "Cumulative CO₂ emissions from land-use change" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from land-use change since the first year of available data, measured in tonnes." + cumulative_emissions_from_land_use_change_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from land-use change" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from land-use change since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from land-use change." + cumulative_emissions_from_oil: + title: "Cumulative CO₂ emissions from oil" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from oil since the first year of available data, measured in tonnes." + cumulative_emissions_from_oil_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from oil" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from oil since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from oil." + cumulative_emissions_from_other_industry: + title: "Cumulative CO₂ emissions from other industry" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from other industry sources since the first year of available data, measured in tonnes." 
+ cumulative_emissions_from_other_industry_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from other industry" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from other industry sources since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from other industry sources." + cumulative_emissions_total: + title: "Cumulative CO₂ emissions" + unit: "tonnes" + short_unit: "t" + description_short: "Total cumulative emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of available data, measured in tonnes." + cumulative_emissions_total_as_share_of_global: + title: "Share of global cumulative CO₂ emissions" + unit: "%" + short_unit: "%" + description_short: "Total cumulative emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of available data, measured as a percentage of global total cumulative emissions of CO₂." + cumulative_emissions_total_including_land_use_change: + title: "Cumulative CO₂ emissions including land-use change" + unit: "tonnes" + short_unit: "t" + description_short: "Total cumulative emissions of carbon dioxide (CO₂), including land-use change, since the first year of available data, measured in tonnes." + cumulative_emissions_total_including_land_use_change_as_share_of_global: + title: "Share of global cumulative CO₂ emissions including land-use change" + unit: "%" + short_unit: "%" + description_short: "Total cumulative emissions of carbon dioxide (CO₂), including land-use change, since the first year of available data, measured as a percentage of global total cumulative emissions of CO₂ (including land-use change)." + emissions_from_cement: + title: "Annual CO₂ emissions from cement" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from cement, measured in tonnes." + emissions_from_cement_as_share_of_global: + title: "Share of global annual CO₂ emissions from cement" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from cement, measured as a percentage of global emissions of CO₂ from cement in the same year." + emissions_from_cement_per_capita: + title: "Annual CO₂ emissions from cement (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from cement, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_coal: + title: "Annual CO₂ emissions from coal" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from coal, measured in tonnes." + emissions_from_coal_as_share_of_global: + title: "Share of global annual CO₂ emissions from coal" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from coal, measured as a percentage of global emissions of CO₂ from coal in the same year." + emissions_from_coal_per_capita: + title: "Annual CO₂ emissions from coal (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from coal, measured in tonnes per person." 
+ description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_flaring: + title: "Annual CO₂ emissions from flaring" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from flaring, measured in tonnes." + emissions_from_flaring_as_share_of_global: + title: "Share of global annual CO₂ emissions from flaring" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from flaring, measured as a percentage of global emissions of CO₂ from flaring in the same year." + emissions_from_flaring_per_capita: + title: "Annual CO₂ emissions from flaring (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from flaring, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_gas: + title: "Annual CO₂ emissions from gas" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from gas, measured in tonnes." + emissions_from_gas_as_share_of_global: + title: "Share of global annual CO₂ emissions from gas" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from gas, measured as a percentage of global emissions of CO₂ from gas in the same year." + emissions_from_gas_per_capita: + title: "Annual CO₂ emissions from gas (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from gas, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_land_use_change: + title: "Annual CO₂ emissions from land-use change" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes." + emissions_from_land_use_change_as_share_of_global: + title: "Share of global annual CO₂ emissions from land-use change" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from land-use change, measured as a percentage of global emissions of CO₂ from land-use change in the same year." + emissions_from_land_use_change_per_capita: + title: "Annual CO₂ emissions from land-use change per capita" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_oil: + title: "Annual CO₂ emissions from oil" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from oil, measured in tonnes." + emissions_from_oil_as_share_of_global: + title: "Share of global annual CO₂ emissions from oil" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from oil, measured as a percentage of global emissions of CO₂ from oil in the same year." 
+ emissions_from_oil_per_capita: + title: "Annual CO₂ emissions from oil (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from oil, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_other_industry: + title: "Annual CO₂ emissions from other industry" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes." + emissions_from_other_industry_as_share_of_global: + title: "Share of global annual CO₂ emissions from other industry" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from other industry sources, measured as a percentage of global emissions of CO₂ from other industry sources in the same year." + emissions_from_other_industry_per_capita: + title: "Annual CO₂ emissions from other industry (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_total: + title: "Annual CO₂ emissions" + unit: "tonnes" + short_unit: "t" + description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes." + emissions_total_as_share_of_global: + title: "Share of global annual CO₂ emissions" + unit: "%" + short_unit: "%" + description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured as a percentage of global emissions of CO₂ in the same year." + emissions_total_including_land_use_change: + title: "Annual CO₂ emissions including land-use change" + unit: "tonnes" + short_unit: "t" + description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes." + emissions_total_including_land_use_change_as_share_of_global: + title: "Share of global annual CO₂ emissions including land-use change" + unit: "%" + short_unit: "%" + description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured as a percentage of global total emissions of CO₂ in the same year." + emissions_total_including_land_use_change_per_capita: + title: "Annual CO₂ emissions including land-use change per capita" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_total_including_land_use_change_per_gdp: + title: "Annual CO₂ emissions including land-use change per GDP" + unit: "kilograms per international-$" + short_unit: "kg/$" + description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per dollar of GDP (2011 international-$)." 
+ emissions_total_including_land_use_change_per_unit_energy: + title: "Annual CO₂ emissions including land-use change per unit energy" + unit: "kilograms per kilowatt-hour" + short_unit: "kg/kWh" + description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per kilowatt-hour of primary energy consumption." + emissions_total_per_capita: + title: "Annual CO₂ emissions (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_total_per_gdp: + title: "Annual CO₂ emissions per GDP (kg per international-$)" + unit: "kilograms per international-$" + short_unit: "kg/$" + description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per dollar of GDP (2011 international-$)." + emissions_total_per_unit_energy: + title: "Annual CO₂ emissions per unit energy (kg per kilowatt-hour)" + unit: "kilograms per kilowatt-hour" + short_unit: "kg/kWh" + description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per kilowatt-hour of primary energy consumption." + gdp: + title: "GDP" + unit: "2011 international-$" + short_unit: "$" + description_short: >- + Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over time (inflation) + and price differences between countries. + growth_emissions_total: + title: "Annual CO₂ emissions growth (abs)" + unit: "tonnes" + short_unit: "t" + description_short: "Annual growth in total emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes." + growth_emissions_total_including_land_use_change: + title: "Growth rate of emissions including land-use change" + unit: "tonnes" + short_unit: "t" + description_short: "Annual growth in total emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes." + pct_growth_emissions_total: + title: "Annual CO₂ emissions growth (%)" + unit: "%" + short_unit: "%" + description_short: "Annual percentage growth in total emissions of carbon dioxide (CO₂), excluding land-use change." + pct_growth_emissions_total_including_land_use_change: + title: "Growth rate of emissions including land-use change (%)" + unit: "%" + short_unit: "%" + description_short: "Annual percentage growth in total emissions of carbon dioxide (CO₂), including land-use change." + pct_traded_emissions: + title: "Share of annual CO₂ emissions embedded in trade" + unit: "%" + short_unit: "%" + description_short: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured as a percentage of emissions of CO₂." + description_key: + - *traded_emissions_description_key + - *international_aviation_description_key + population: + title: "Population" + unit: "persons" + short_unit: "persons" + population_as_share_of_global: + title: "Share of population" + unit: "%" + short_unit: "%" + description_short: "Population, measured as a percentage of global total population in the same year." + primary_energy_consumption: + title: "Primary energy consumption" + unit: "terawatt-hours" + short_unit: "TWh" + description_short: "Primary energy consumption, measured in terawatt-hours per year." 
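The absolute and percentage growth indicators defined above are computed per country with `diff` and `pct_change`, exactly as in the processing step below. A toy sketch of that calculation:

```python
import pandas as pd

tb = pd.DataFrame(
    {
        "country": ["A", "A", "A"],
        "year": [2000, 2001, 2002],
        "emissions_total": [100.0, 110.0, 99.0],
    }
)
# Absolute annual growth (tonnes) and percentage growth, within each country.
tb["growth_emissions_total"] = tb.groupby("country")["emissions_total"].diff()
tb["pct_growth_emissions_total"] = tb.groupby("country")["emissions_total"].pct_change(fill_method=None) * 100
print(tb)  # growth: NaN, 10.0, -11.0; pct growth: NaN, 10.0, -10.0
```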
+ traded_emissions: + title: "Annual CO₂ emissions embedded in trade" + unit: "tonnes" + short_unit: "t" + description_short: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes." + description_key: + - *traded_emissions_description_key + - *international_aviation_description_key + traded_emissions_per_capita: + title: "Annual CO₂ emissions embedded in trade (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *traded_emissions_description_key + - *international_aviation_description_key diff --git a/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.py b/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.py new file mode 100644 index 00000000000..e453ba09dff --- /dev/null +++ b/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.py @@ -0,0 +1,1142 @@ +"""This step creates the Global Carbon Budget (GCB) dataset, by the Global Carbon Project (GCP). + +It harmonizes and further processes meadow data, and uses the following auxiliary datasets: +- GGDC's Maddison dataset on GDP, used to calculate emissions per GDP. +- Primary Energy Consumption (mix of sources from the 'energy' namespace) to calculate emissions per unit energy. +- Population (mix of sources), to calculate emissions per capita. +- Regions (mix of sources), to generate aggregates for different continents. +- World Bank's income groups, to generate aggregates for different income groups. + +""" +import numpy as np +import owid.catalog.processing as pr +from owid.catalog import Dataset, Table +from owid.datautils import dataframes +from structlog import get_logger + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +log = get_logger() + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Expected outliers in consumption-based emissions (with negative emissions in the original data, which will be removed). +# NOTE: This issue has been reported to the data providers, and will hopefully be fixed in a coming version. +OUTLIERS_IN_CONSUMPTION_DF = [ + ("Panama", 2003), + ("Panama", 2004), + ("Panama", 2005), + ("Panama", 2006), + ("Panama", 2012), + ("Panama", 2013), + ("Venezuela", 2018), +] + +# Regions and income groups to create by aggregating contributions from member countries. +# In the following dictionary, if nothing is stated, the region is assumed to be a default continent/income group. +# Otherwise, the dictionary can have "additional_regions", "excluded_regions", "additional_members", and +# "excluded_members". The aggregates will be calculated on the resulting list of countries. +REGIONS = { + # Default continents. + "Africa": {}, + "Asia": {}, + "Europe": {}, + # We exclude GCB's EU27 data, because it appears only in a few metrics, and, when it exists, it is identical to our + # aggregated European Union (27). + "European Union (27)": {}, + "North America": {}, + "Oceania": {}, + "South America": {}, + # Income groups. + "Low-income countries": {}, + "Upper-middle-income countries": {}, + "Lower-middle-income countries": {}, + "High-income countries": {}, + # Additional composite regions. + "Asia (excl. China and India)": { + "additional_regions": ["Asia"], + "excluded_members": ["China", "India"], + }, + "Europe (excl. 
EU-27)": {"additional_regions": ["Europe"], "excluded_regions": ["European Union (27)"]}, + "Europe (excl. EU-28)": { + "additional_regions": ["Europe"], + "excluded_regions": ["European Union (27)"], + "excluded_members": ["United Kingdom"], + }, + "European Union (28)": { + "additional_regions": ["European Union (27)"], + "additional_members": ["United Kingdom"], + }, + "North America (excl. USA)": { + "additional_regions": ["North America"], + "excluded_members": ["United States"], + }, +} + +# Columns to use from GCB fossil CO2 emissions data and how to rename them. +CO2_COLUMNS = { + "country": "country", + "year": "year", + "cement": "emissions_from_cement", + "coal": "emissions_from_coal", + "flaring": "emissions_from_flaring", + "gas": "emissions_from_gas", + "oil": "emissions_from_oil", + "other": "emissions_from_other_industry", + "total": "emissions_total", +} + +# List all sources of emissions considered. +EMISSION_SOURCES = [column for column in CO2_COLUMNS.values() if column not in ["country", "year"]] + +# Columns to use from primary energy consumption data and how to rename them. +PRIMARY_ENERGY_COLUMNS = { + "country": "country", + "year": "year", + "primary_energy_consumption__twh": "primary_energy_consumption", +} + +# Columns to use from historical emissions data and how to rename them. +HISTORICAL_EMISSIONS_COLUMNS = { + "country": "country", + "year": "year", + # Global fossil emissions are used only for sanity checks. + "global_fossil_emissions": "global_fossil_emissions", + "global_land_use_change_emissions": "global_emissions_from_land_use_change", +} + +# Columns to use from consumption-based emissions data and how to rename them. +CONSUMPTION_EMISSIONS_COLUMNS = { + "country": "country", + "year": "year", + "consumption_emissions": "consumption_emissions", +} + +# Conversion from terawatt-hours to kilowatt-hours. +TWH_TO_KWH = 1e9 + +# Conversion factor to change from billion tonnes of carbon to tonnes of CO2. +BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e9 + +# Conversion factor to change from million tonnes of carbon to tonnes of CO2. +MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e6 + +# Conversion from million tonnes of CO2 to tonnes of CO2. +MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 = 1e6 + +# Conversion from tonnes of CO2 to kg of CO2 (used for emissions per GDP and per unit energy). +TONNES_OF_CO2_TO_KG_OF_CO2 = 1000 + +# In order to remove uninformative columns, keep only rows where at least one of the following columns has data. +# All other columns are either derived variables, or global variables, or auxiliary variables from other datasets. +COLUMNS_THAT_MUST_HAVE_DATA = [ + "emissions_from_cement", + "emissions_from_coal", + "emissions_from_flaring", + "emissions_from_gas", + "emissions_from_oil", + "emissions_from_other_industry", + "emissions_total", + "consumption_emissions", + "emissions_from_land_use_change", +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read all its tables. 
+ ds_meadow = paths.load_dataset("global_carbon_budget") + tb_co2 = ds_meadow.read("global_carbon_budget_fossil_co2_emissions", safe_types=False) + tb_historical = ds_meadow.read("global_carbon_budget_historical_budget", safe_types=False) + tb_consumption = ds_meadow.read("global_carbon_budget_consumption_emissions", safe_types=False) + tb_production = ds_meadow.read("global_carbon_budget_production_emissions", safe_types=False) + tb_land_use = ds_meadow.read("global_carbon_budget_land_use_change", safe_types=False) + + # Load primary energy consumption dataset and read its main table. + ds_energy = paths.load_dataset("primary_energy_consumption") + tb_energy = ds_energy["primary_energy_consumption"].reset_index() + + # Load GDP dataset. + ds_gdp = paths.load_dataset("maddison_project_database") + + # Load population dataset. + ds_population = paths.load_dataset("population") + + # Load regions dataset. + ds_regions = paths.load_dataset("regions") + + # Load income groups dataset. + ds_income_groups = paths.load_dataset("income_groups") + + # + # Process data. + # + # Prepare fossil CO2 emissions data. + tb_co2 = prepare_fossil_co2_emissions(tb_co2=tb_co2) + + # Prepare consumption-based emission data. + tb_consumption = prepare_consumption_emissions(tb_consumption=tb_consumption) + + # Prepare production-based emission data. + tb_production = prepare_production_emissions(tb_production=tb_production) + + # Prepare land-use emission data. + tb_land_use = prepare_land_use_emissions(tb_land_use=tb_land_use) + + # Select and rename columns from primary energy data. + tb_energy = tb_energy[list(PRIMARY_ENERGY_COLUMNS)].rename(columns=PRIMARY_ENERGY_COLUMNS, errors="raise") + + # Prepare historical emissions data. + tb_historical = prepare_historical_emissions(tb_historical=tb_historical) + + # Run sanity checks on input data. + sanity_checks_on_input_data( + tb_production=tb_production, tb_consumption=tb_consumption, tb_historical=tb_historical, tb_co2=tb_co2 + ) + + # Extract global emissions, including bunker and land-use change emissions. + tb_global_emissions = extract_global_emissions( + tb_co2=tb_co2, tb_historical=tb_historical, ds_population=ds_population + ) + + # Harmonize country names. + tb_co2 = harmonize_country_names(tb=tb_co2) + tb_consumption = harmonize_country_names(tb=tb_consumption) + tb_production = harmonize_country_names(tb=tb_production) + tb_land_use = harmonize_country_names(tb=tb_land_use) + + # Fix duplicated rows for Palau. + tb_co2 = fix_duplicated_palau_data(tb_co2=tb_co2) + + # Add new variables to main table (consumption-based emissions, emission intensity, per-capita emissions, etc.). + tb_combined = combine_data_and_add_variables( + tb_co2=tb_co2, + tb_production=tb_production, + tb_consumption=tb_consumption, + tb_global_emissions=tb_global_emissions, + tb_land_use=tb_land_use, + tb_energy=tb_energy, + ds_gdp=ds_gdp, + ds_population=ds_population, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + ) + + #################################################################################################################### + # The data for emissions from other industry is quite sparse. + # This causes the share of global emissions to show spurious jumps, because in some years only a few countries are + # informed (the jumps are easy to see for China and the US). From 1990 onwards, more countries are informed and the + # data is more reliable, so we set the share of emissions from other industry to None for years before 1990. 
+ tb_combined.loc[(tb_combined["year"] < 1990), "emissions_from_other_industry_as_share_of_global"] = None + tb_combined.loc[(tb_combined["year"] < 1990), "cumulative_emissions_from_other_industry_as_share_of_global"] = None + #################################################################################################################### + + # Set an appropriate index, ensure there are no rows that only have nan, and sort conveniently. + tb_combined = tb_combined.format(sort_columns=True, short_name=paths.short_name) + + # Run sanity checks on output data. + sanity_checks_on_output_data(tb_combined) + + # + # Save outputs. + # + # Create a new garden dataset and use metadata from meadow dataset. + ds_garden = create_dataset( + dest_dir=dest_dir, tables=[tb_combined], default_metadata=ds_meadow.metadata, check_variables_metadata=True + ) + ds_garden.save() + + +def sanity_checks_on_input_data( + tb_production: Table, tb_consumption: Table, tb_historical: Table, tb_co2: Table +) -> None: + """Run sanity checks on input data files. + + These checks should be used prior to country harmonization, but after basic processing of the tables. + + Parameters + ---------- + tb_production : Table + Production-based emissions from GCP's official national emissions dataset (excel file). + tb_consumption : Table + Consumption-based emissions from GCP's official national emissions dataset (excel file). + tb_historical : Table + Historical emissions from GCP's official global emissions dataset (excel file). + tb_co2 : Table + Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). + + """ + tb_production = tb_production.copy() + tb_consumption = tb_consumption.copy() + tb_historical = tb_historical.copy() + tb_co2 = tb_co2.copy() + + # In the original data, Bunkers is included in the national data file as if it were another country, + # but it is better understood as another kind of global emission. + # In fact, bunker emissions should coincide for production and consumption emissions. + global_bunkers_emissions = ( + tb_production[tb_production["country"] == "Bunkers"][["year", "production_emissions"]] + .reset_index(drop=True) + .rename(columns={"production_emissions": "global_bunker_emissions"}, errors="raise") + ) + + # Check that we get exactly the same array of bunker emissions from the consumption emissions table + # (on years where there is data for bunker emissions in both datasets). + comparison = pr.merge( + global_bunkers_emissions, + tb_consumption[tb_consumption["country"] == "Bunkers"][["year", "consumption_emissions"]] + .reset_index(drop=True) + .rename(columns={"consumption_emissions": "global_bunker_emissions"}, errors="raise"), + how="inner", + on="year", + suffixes=("", "_check"), + ) + + error = "Bunker emissions were expected to coincide in production and consumption emissions tables." + assert (comparison["global_bunker_emissions"] == comparison["global_bunker_emissions_check"]).all(), error + + # Check that all production-based emissions are positive. + error = "There are negative emissions in tb_production (from the additional variables dataset)." + assert (tb_production.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error + + # Check that all production-based emissions from the fossil CO2 dataset are positive. + error = "There are negative emissions in tb_co2 (from the fossil CO2 dataset)." + assert (tb_co2.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error + + # Check that all consumption-based emissions are positive. 
+ error = "There are negative emissions in tb_consumption (from the national emissions dataset)." + assert (tb_consumption.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error + + # Check that, for the World, production emissions coincides with consumption emissions (on common years). + error = "Production and consumption emissions for the world were expected to be identical." + comparison = pr.merge( + tb_production[tb_production["country"] == "World"].reset_index(drop=True), + tb_consumption[tb_consumption["country"] == "World"].reset_index(drop=True), + how="inner", + on="year", + ) + assert (comparison["production_emissions"] == comparison["consumption_emissions"]).all(), error + + # Check that production emissions for the World coincide with global (historical) emissions (on common years). + comparison = pr.merge( + tb_production[tb_production["country"] == "World"][["year", "production_emissions"]].reset_index(drop=True), + tb_historical[["year", "global_fossil_emissions"]], + how="inner", + on="year", + ) + error = "Production emissions for the world were expected to coincide with global fossil emissions." + assert ( + 100 + * abs(comparison["production_emissions"] - comparison["global_fossil_emissions"]) + / (comparison["global_fossil_emissions"]) + < 0.0001 + ).all(), error + + # In the Fossil CO2 file, international transport emissions has been separated into aviation and shipping. + # Emissions are also separated by fuel. + # We'll add them to the global emissions. + global_aviation_and_shipping = ( + tb_co2[tb_co2["country"].isin(["International Aviation", "International Shipping"])] + .dropna() + .pivot(index="year", columns="country", values="emissions_total") + .reset_index() + ) + global_aviation_and_shipping["global_aviation_and_shipping"] = ( + global_aviation_and_shipping["International Aviation"] + global_aviation_and_shipping["International Shipping"] + ) + comparison = ( + tb_production[tb_production["country"] == "Bunkers"] + .reset_index(drop=True) + .rename(columns={"production_emissions": "global_bunker_emissions"}) + .merge( + global_aviation_and_shipping[["year", "global_aviation_and_shipping"]], + how="outer", + on="year", + ) + .sort_values("year") + .reset_index(drop=True) + ) + # Keep only rows where both time series are informed. + comparison = comparison.dropna( + subset=["global_bunker_emissions", "global_aviation_and_shipping"], how="any" + ).reset_index(drop=True) + error = ( + "Bunker emissions from national emissions file should coincide (within 0.0001%) with the sum of aviation" + " and shipping emissions from the Fossil CO2 file." + ) + assert ( + 100 + * abs(comparison["global_bunker_emissions"] - comparison["global_aviation_and_shipping"]) + / (comparison["global_bunker_emissions"]) + < 0.0001 + ).all(), error + + # Now check that all other emissions (that are not from bunker fuels) in tb_production (emissions from the national + # excel file) coincide with emissions in tb_co2 (from the Fossil CO2 emissions csv file). + # Since country names have not yet been harmonized, rename the only countries that are present in both datasets. 
+ comparison = pr.merge( + tb_co2[["country", "year", "emissions_total"]], + tb_production[tb_production["country"] != "Bunkers"].astype({"country": str}).replace({"World": "Global"}), + on=["country", "year"], + how="inner", + ).dropna(subset=["emissions_total", "production_emissions"], how="any") + # Since we included the emissions from the Kuwaiti oil fires in Kuwait (and they are not included in tb_production), + # omit that row in the comparison. + comparison = comparison.drop( + comparison[(comparison["country"] == "Kuwait") & (comparison["year"] == 1991)].index + ).reset_index(drop=True) + # Check that production emissions from national file coincide with the Fossil CO2 emissions dataset. + # NOTE: It seems that total emissions may have been rounded to zero decimals, which is why in the following assertion I also round production emissions. + error = "Production emissions from national file were expected to coincide with the Fossil CO2 emissions dataset." + assert ( + ( + 100 + * abs(comparison["production_emissions"].round(0) - comparison["emissions_total"]) + / (comparison["emissions_total"]) + ).fillna(0) + < 0.01 + ).all(), error + + +def sanity_checks_on_output_data(tb_combined: Table) -> None: + """Run sanity checks on output data. + + These checks should be run on the very final output table (with an index) prior to storing it as a table. + + Parameters + ---------- + tb_combined : Table + Combination of all input tables, after processing, harmonization, and addition of variables. + + """ + tb_combined = tb_combined.reset_index() + error = "All variables (except traded emissions, growth, and land-use change) should be >= 0 or nan." + positive_variables = [ + col + for col in tb_combined.columns + if col != "country" + if "traded" not in col + if "growth" not in col + if "land_use" not in col + ] + assert (tb_combined[positive_variables].fillna(0) >= 0).all().all(), error + + error = "Production emissions as a share of global emissions should be 100% for 'World'." + assert tb_combined[ + (tb_combined["country"] == "World") & (abs(tb_combined["emissions_total_as_share_of_global"] - 100) > 0.00001) + ].empty, error + + error = "Consumption emissions as a share of global emissions should be 100% for 'World'." + assert tb_combined[ + (tb_combined["country"] == "World") + & (abs(tb_combined["consumption_emissions_as_share_of_global"] - 100) > 0.0001) + ].empty, error + + error = "Population as a share of global population should be 100% for 'World'." + assert tb_combined[ + (tb_combined["country"] == "World") & (tb_combined["population_as_share_of_global"].fillna(100) != 100) + ].empty, error + + error = "All share of global emissions should be smaller than 100%." + share_variables = [ + col + for col in tb_combined.columns + if "share" in col + if col != "emissions_from_land_use_change_as_share_of_global" + ] + assert (tb_combined[share_variables].fillna(0) <= 100.001).all().all(), error + # NOTE: In previous versions, "emissions_from_land_use_change_as_share_of_global" was >101%, e.g. from Upper-middle-income countries in 1982, 1984 and 1986. This is, in principle, possible (since land use change emissions can be negative). + assert (tb_combined["emissions_from_land_use_change_as_share_of_global"].fillna(0) <= 100.00001).all().all(), error + + # Check that cumulative variables are monotonically increasing. 
+ # Firstly, list columns of cumulative variables, but ignoring cumulative columns as a share of global + # (since they are not necessarily monotonic) and land-use change (which can be negative). + cumulative_cols = [ + col for col in tb_combined.columns if "cumulative" in col if "share" not in col if "land_use" not in col + ] + # Using ".is_monotonic_increasing" can fail when differences between consecutive numbers are very small. + # Instead, sort data backwards in time, and check that consecutive values of cumulative variables always have + # a percentage change that is smaller than a certain amount. + error = ( + "Cumulative variables (not given as a share of global) should be monotonically increasing (except when " + "including land-use change emissions, which can be negative)." + ) + assert ( + tb_combined.sort_values("year", ascending=False) + .groupby("country") + .agg( + { + col: lambda x: ((x.pct_change(fill_method=None).dropna() * 100) <= 0.0001).all() + for col in cumulative_cols + } + ) + .all() + .all() + ), error + + error = ( + "Production emissions as a share of global production emissions for the World should always be 100% " + "(or larger than 98%, given small discrepancies)." + ) + # Consumption emissions as a share of global production emissions is allowed to be smaller than 100%. + share_variables = [col for col in tb_combined.columns if "share" in col if "consumption" not in col] + assert (tb_combined[tb_combined["country"] == "World"][share_variables].fillna(100) > 99.9999).all().all(), error + + error = "Traded emissions for the World should be close to zero." + world_mask = tb_combined["country"] == "World" + assert ( + abs( + 100 + * tb_combined[world_mask]["traded_emissions"].fillna(0) + / tb_combined[world_mask]["emissions_total"].fillna(1) + ) + < 0.0001 + ).all(), error + + +def prepare_fossil_co2_emissions(tb_co2: Table) -> Table: + """Prepare Fossil CO2 emissions data (basic processing).""" + # Select and rename columns from fossil CO2 data. + tb_co2 = tb_co2[list(CO2_COLUMNS)].rename(columns=CO2_COLUMNS, errors="raise") + + # Ensure all emissions are given in tonnes of CO2. + tb_co2[EMISSION_SOURCES] *= MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 + + #################################################################################################################### + # For certain years, column "emissions_from_other_industry" is not informed for "World" but it is informed + # for some countries (namely China and US). + # Note that this is not necessarily an issue in the original data: The data provider may have decided that it is + # better to leave the world uninformed where not enough countries are informed. + # However, "emissions_total" for the World seems to include those contributions from China and the US. + # This can be easily checked in the original data by selecting the year 1989 (last year for which there is data for + # China and US, but not for the World). The sum of emissions from all sources (namely coal, oil, gas, cement, and + # flaring, given that "other" is empty) does not add up to "emissions_total". But, if one includes the other + # emissions from China and US, then it does add up. + # This inconsistency causes the cumulative emissions from other industry for China and US to be larger than the + # global cumulative emissions. And the share of global emissions for those countries becomes hence larger than 100%. 
+ # To fix this issue, we aggregate the data for China and US on those years when the world's data is missing (without + # touching other years or other columns), and add that data to the global emissions from other industry. + # NOTE: This issue has been reported to the data providers, and will hopefully be fixed in a coming version. + + # Firstly, list of years for which the world has no data for emissions_from_other_industry. + world_missing_years = ( + tb_co2[(tb_co2["country"] == "Global") & (tb_co2["emissions_from_other_industry"].isnull())]["year"] + .unique() + .tolist() # type: ignore + ) + # Data that needs to be aggregated. + data_missing_in_world = tb_co2[ + tb_co2["year"].isin(world_missing_years) & (tb_co2["emissions_from_other_industry"].notnull()) + ] + # Check that there is indeed data to be aggregated (that is missing for the World). + error = ( + "Expected emissions_from_other_industry to be null for the world but not null for certain countries " + "(which was an issue in the original fossil CO2 data). The issue may be fixed and the code can be simplified." + ) + assert len(data_missing_in_world) > 0, error + # Create a table of aggregate data for the World, on those years when it's missing. + aggregated_missing_data = ( + data_missing_in_world.groupby("year") + .agg({"emissions_from_other_industry": "sum"}) + .reset_index() + .assign(**{"country": "Global"}) + ) + # Combine the new table of aggregate data with the main table. + tb_co2 = dataframes.combine_two_overlapping_dataframes( + df1=tb_co2, df2=aggregated_missing_data, index_columns=["country", "year"], keep_column_order=True + ) + # NOTE: The previous function currently does not properly propagate metadata, but keeps only the sources of the + # first table. But given that both tables combined have the same source, we don't need to manually change it. + #################################################################################################################### + + # We add the emissions from "Kuwaiti Oil Fires" (which is also included as a separate country) as part of the + # emissions of Kuwait. This ensures that they will be included in region aggregates. + error = "'Kuwaiti Oil Fires' was expected to only have not-null data for 1991." + assert tb_co2[ + (tb_co2["country"] == "Kuwaiti Oil Fires") + & (tb_co2["emissions_total"].notnull()) + & (tb_co2["emissions_total"] != 0) + ]["year"].tolist() == [1991], error + + tb_co2.loc[(tb_co2["country"] == "Kuwait") & (tb_co2["year"] == 1991), EMISSION_SOURCES] = ( + tb_co2[(tb_co2["country"] == "Kuwaiti Oil Fires") & (tb_co2["year"] == 1991)][EMISSION_SOURCES].values + + tb_co2[(tb_co2["country"] == "Kuwait") & (tb_co2["year"] == 1991)][EMISSION_SOURCES].values + ) + + # Check that "emissions_total" agrees with the sum of emissions from individual sources. + error = "The sum of all emissions should add up to total emissions (within 1%)." + assert ( + abs( + tb_co2.drop(columns=["country", "year", "emissions_total"]).sum(axis=1) + - tb_co2["emissions_total"].fillna(0) + ) + / (tb_co2["emissions_total"].fillna(0) + 1e-7) + < 1e-2 + ).all(), error + + # Many rows have zero total emissions, but actually the individual sources are nan. + # Total emissions in those cases should be nan, instead of zero. 
+ no_individual_emissions = tb_co2.drop(columns=["country", "year", "emissions_total"]).isnull().all(axis=1) + tb_co2.loc[no_individual_emissions, "emissions_total"] = np.nan + + return tb_co2 + + +def prepare_consumption_emissions(tb_consumption: Table) -> Table: + """Prepare consumption-based emissions data (basic processing).""" + # Select and rename columns. + tb_consumption = tb_consumption[list(CONSUMPTION_EMISSIONS_COLUMNS)].rename( + columns=CONSUMPTION_EMISSIONS_COLUMNS, errors="raise" + ) + + # Convert units from megatonnes of carbon per year emissions to tonnes of CO2 per year. + for column in tb_consumption.drop(columns=["country", "year"]).columns: + tb_consumption[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 + + # List indexes of rows in tb_consumption corresponding to outliers (defined above in OUTLIERS_IN_tb_consumption). + outlier_indexes = [ + tb_consumption[(tb_consumption["country"] == outlier[0]) & (tb_consumption["year"] == outlier[1])].index.item() + for outlier in OUTLIERS_IN_CONSUMPTION_DF + ] + + error = ( + "Outliers were expected to have negative consumption emissions. " + "Maybe outliers have been fixed (and should be removed from the code)." + ) + assert (tb_consumption.loc[outlier_indexes]["consumption_emissions"] < 0).all(), error + + # Remove outliers. + tb_consumption = tb_consumption.drop(outlier_indexes).reset_index(drop=True) + + return tb_consumption + + +def prepare_production_emissions(tb_production: Table) -> Table: + """Prepare production-based emissions data (basic processing).""" + # Convert units from megatonnes of carbon per year emissions to tonnes of CO2 per year. + for column in tb_production.drop(columns=["country", "year"]).columns: + tb_production[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 + + return tb_production + + +def prepare_land_use_emissions(tb_land_use: Table) -> Table: + """Prepare land-use change emissions data (basic processing).""" + # Convert units from megatonnes of carbon per year emissions to tonnes of CO2 per year. + tb_land_use["emissions"] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 + + # There are two additional regions in the land-use change file, namely Global and EU27. + # It makes sense to extract national land-use change contributions from one of the sheets of that file (we currently + # do so from the "BLUE" sheet), since there are no other national land-use change emissions in other files. + # But for global emissions, it makes more sense to take the ones estimated by GCP, which are given in the + # "Historical Budget" sheet of the global emissions file. + # So, remove the data for "Global". + # We also remove EU27 data, as explained above, since we aggregate that data ourselves. + tb_land_use = tb_land_use[~tb_land_use["country"].isin(["Global", "EU27"])].reset_index(drop=True) + + return tb_land_use + + +def prepare_historical_emissions(tb_historical: Table) -> Table: + """Prepare historical emissions data.""" + # Select and rename columns from historical emissions data. + tb_historical = tb_historical[list(HISTORICAL_EMISSIONS_COLUMNS)].rename( + columns=HISTORICAL_EMISSIONS_COLUMNS, errors="raise" + ) + + # Convert units from gigatonnes of carbon per year emissions to tonnes of CO2 per year. 
+ for column in tb_historical.drop(columns=["country", "year"]).columns: + tb_historical[column] *= BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 + + return tb_historical + + +def extract_global_emissions(tb_co2: Table, tb_historical: Table, ds_population: Dataset) -> Table: + """Extract World emissions by combining data from the Fossil CO2 emissions and the global emissions dataset. + + The resulting global emissions data includes bunker and land-use change emissions. + + NOTE: This function has to be used after selecting and renaming columns in tb_co2, but before harmonizing country + names in tb_co2 (so that "International Aviation" and "International Shipping" are still listed as countries). + + Parameters + ---------- + tb_co2 : Table + Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). + tb_historical : Table + Historical emissions from GCP's official global emissions dataset (excel file). + ds_population : Dataset + Population dataset. + + Returns + ------- + global_emissions : Table + World emissions. + + """ + # "International Aviation" and "International Shipping" are now included as separate countries. + # Combine their emissions into one variable. + global_aviation = ( + tb_co2[tb_co2["country"] == "International Aviation"].set_index(["year"]).drop(columns=["country"]) + ) + global_shipping = ( + tb_co2[tb_co2["country"] == "International Shipping"].set_index(["year"]).drop(columns=["country"]) + ) + global_transport = global_aviation + global_shipping + + # Check that total emissions for international aviation coincide with oil emissions. + # NOTE: International shipping does include emissions from gas, coal and oil. + error = "Total emissions from international aviation do not coincide with oil emissions." + assert all((global_aviation["emissions_from_oil"] - global_aviation["emissions_total"]).dropna() == 0), error + + # Keep only total emissions from international transport. + global_transport = ( + global_transport[["emissions_total"]] + .rename(columns={"emissions_total": "global_emissions_from_international_transport"}, errors="raise") + .dropna() + .reset_index() + ) + + # Create a new table of global emissions. + global_emissions = ( + tb_co2[tb_co2["country"].isin(["Global", "World"])][["year"] + EMISSION_SOURCES] + .rename(columns={column: f"global_{column}" for column in EMISSION_SOURCES}, errors="raise") + .sort_values("year") + .reset_index(drop=True) + ) + + # Add bunker fuels to global emissions. + global_emissions = pr.merge(global_emissions, global_transport, on=["year"], how="outer") + + # Add historical land-use change emissions to table of global emissions. + global_emissions = pr.merge( + global_emissions, tb_historical[["year", "global_emissions_from_land_use_change"]], how="left", on="year" + ) + + # Add variable of total emissions including fossil fuels and land use change. + global_emissions["global_emissions_total_including_land_use_change"] = ( + global_emissions["global_emissions_total"] + global_emissions["global_emissions_from_land_use_change"] + ) + + # Calculate global cumulative emissions. + for column in EMISSION_SOURCES + ["emissions_from_land_use_change", "emissions_total_including_land_use_change"]: + global_emissions[f"global_cumulative_{column}"] = global_emissions[f"global_{column}"].cumsum() + + # Add a country column and add global population. + global_emissions["country"] = "World" + + # Add global population. 
+ global_emissions = geo.add_population_to_table( + tb=global_emissions, ds_population=ds_population, population_col="global_population" + ) + + return global_emissions + + +def harmonize_country_names(tb: Table) -> Table: + """Harmonize country names, and fix known issues with certain regions. + + Parameters + ---------- + tb : Table + Emissions data (either from the fossil CO2, the production-based, consumption-based, or land-use emissions + datasets). + + Returns + ------- + tb : Table + Emissions data after harmonizing country names. + + """ + # Harmonize country names. + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + excluded_countries_file=paths.excluded_countries_path, + warn_on_missing_countries=True, + warn_on_unused_countries=False, + make_missing_countries_nan=False, + warn_on_unknown_excluded_countries=False, + ) + + return tb + + +def fix_duplicated_palau_data(tb_co2: Table) -> Table: + tb = tb_co2.copy() + # Check that there is only one data point for each country-year. + # In the fossil CO2 emissions data, after harmonization, "Pacific Islands (Palau)" is mapped to "Palau", and + # therefore there are rows with different data for the same country-year. + # However, "Pacific Islands (Palau)" has data until 1991, and "Palau" has data from 1992 onwards. + # NOTE: this is not an issue with the original data, and it's simply caused by our harmonization of names. + + # Check that duplicate rows are still there. + error = "Expected 'Palau' data to be duplicated. Remove temporary fix." + assert tb[tb.duplicated(subset=["country", "year"])]["country"].unique().tolist() == ["Palau"], error + + # Select rows corresponding to "Palau" prior to 1992, and to "Pacific Islands (Palau)" from 1992 onwards. + indexes_to_drop = ( + tb[ + (tb["country"] == "Palau") & (tb["year"] < 1992) & (tb.duplicated(subset=["country", "year"], keep="first")) + ].index.tolist() + + tb[ + (tb["country"] == "Palau") & (tb["year"] >= 1992) & (tb.duplicated(subset=["country", "year"], keep="last")) + ].index.tolist() + ) + # Check that the selected rows do not overlap. + assert len(indexes_to_drop) == len(set(indexes_to_drop)) + # Remove those rows. + tb = tb.drop(indexes_to_drop).reset_index(drop=True) + # NOTE: Do not drop empty rows yet, as they will be needed to have a complete population series. + + return tb + + +def fix_consumption_emissions_for_africa(tb_co2_with_regions: Table) -> Table: + # The calculated consumption emissions for Africa differ significantly from those in the GCP dataset. + # GCP's estimate is significantly larger. The reason may be that many African countries do not have data on + # consumption emissions, so the aggregate may be underestimated. Maybe GCP has a different way to estimate Africa's + # consumption emissions. + # We therefore replace our values for Africa (calculated by summing consumption emissions from African countries) + # with those from GCP. + # Ultimately, the reason we keep both our version and GCP's version of continents is that our definitions may + # differ. But it is unlikely that their definition of the African continent is different from ours. + # NOTE: This issue has been reported to the data providers, and will hopefully be fixed in a coming version. + + # First, check that the discrepancy exists in the current data. 
+ tb = tb_co2_with_regions.copy() + consumption_emissions_africa = tb[(tb["country"] == "Africa") & (tb["year"] == 2020)][ + "consumption_emissions" + ].item() + consumption_emissions_africa_gcp = tb[(tb["country"] == "Africa (GCP)") & (tb["year"] == 2020)][ + "consumption_emissions" + ].item() + error = ( + "Discrepancy in consumption emissions between aggregated Africa and Africa (GCP) no longer exists. " + "Remove temporary fix" + ) + assert ( + consumption_emissions_africa_gcp - consumption_emissions_africa + ) / consumption_emissions_africa_gcp > 0.23, error + + # Replace consumption emissions for "Africa" by those by "Africa (GCP)". + consumption_emissions = tb[tb["country"] != "Africa"][["country", "year", "consumption_emissions"]].reset_index( + drop=True + ) + consumption_emissions_for_africa = ( + consumption_emissions[consumption_emissions["country"] == "Africa (GCP)"] + .reset_index(drop=True) + .replace({"Africa (GCP)": "Africa"}) + ) + consumption_emissions = pr.concat([consumption_emissions, consumption_emissions_for_africa], ignore_index=True) + # Replace consumption emissions in main table by the fixed one. + tb = tb.drop(columns="consumption_emissions").merge(consumption_emissions, on=["country", "year"], how="outer") + + # Sanity checks. + # All columns except consumption_emissions should be identical to the original. + error = "Mismatch before and after fixing consumption emissions for Africa." + for col in tb.drop(columns=["consumption_emissions"]).columns: + assert ( + tb[col].dropna().reset_index(drop=True) == tb_co2_with_regions[col].dropna().reset_index(drop=True) + ).all() + # Consumption emissions should be identical to the original except for Africa. + assert ( + tb[tb["country"] != "Africa"]["consumption_emissions"].dropna().reset_index(drop=True) + == tb_co2_with_regions[tb_co2_with_regions["country"] != "Africa"]["consumption_emissions"] + .dropna() + .reset_index(drop=True) + ).all() + + return tb + + +def combine_data_and_add_variables( + tb_co2: Table, + tb_production: Table, + tb_consumption: Table, + tb_global_emissions: Table, + tb_land_use: Table, + tb_energy: Table, + ds_gdp: Dataset, + ds_population: Table, + ds_regions: Dataset, + ds_income_groups: Dataset, +) -> Table: + """Combine all relevant data into one table, add region aggregates, and add custom variables (e.g. emissions per + capita). + + Parameters + ---------- + tb_co2 : Table + Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file), after harmonization. + tb_production : Table + Production-based emissions from GCP's official national emissions dataset (excel file), after harmonization. + tb_consumption : Table + Consumption-based emissions from GCP's official national emissions dataset (excel file), after harmonization. + tb_global_emissions : Table + World emissions (including bunker and land-use change emissions). + tb_land_use : Table + National land-use change emissions from GCP's official dataset (excel file), after harmonization. + tb_energy : Table + Primary energy data. + ds_gdp : Dataset + GDP dataset. + ds_population : Dataset + Population dataset. + ds_regions : Dataset + Regions dataset. + ds_income_groups : Dataset + Income groups dataset. + + Returns + ------- + tb_co2_with_regions : Table + Combined data, with all additional variables and with region aggregates. + + """ + tb_co2_with_regions = tb_co2.copy() + + # Add region aggregates that were included in the national emissions file, but not in the Fossil CO2 emissions file. 
+ gcp_aggregates = sorted(set(tb_production["country"]) - set(tb_co2_with_regions["country"])) + # NOTE: Here, "International transport" is included. As a result, total emissions will contain data both for + # international aviation and shipping, and for international transport (which is the sum of the former two). + # International transport is emptied later on, in the columns where this duplication occurs. + tb_co2_with_regions = pr.concat( + [ + tb_co2_with_regions, + tb_production[tb_production["country"].isin(gcp_aggregates)] + .rename(columns={"production_emissions": "emissions_total"}) + .astype({"year": int}), + ], + ignore_index=True, + short_name=paths.short_name, + ).reset_index(drop=True) + + # Add consumption emissions to main table (keep only the countries of the main table). + # Given that additional GCP regions (e.g. "Africa (GCP)") have already been added to tb_co2 + # (when merging with tb_production), all countries from tb_consumption should be included in tb_co2. + error = "Some countries in tb_consumption are not included in tb_co2." + assert set(tb_consumption["country"]) < set(tb_co2_with_regions["country"]), error + tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_consumption, on=["country", "year"], how="outer") + + # Add population to original table. + tb_co2_with_regions = geo.add_population_to_table( + tb=tb_co2_with_regions, ds_population=ds_population, warn_on_missing_countries=False + ) + + # Add GDP to main table. + tb_co2_with_regions = geo.add_gdp_to_table(tb=tb_co2_with_regions, ds_gdp=ds_gdp) + + # Add primary energy to main table. + tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_energy, on=["country", "year"], how="left") + + # For convenience, rename columns in land-use change emissions data. + tb_land_use = tb_land_use.rename(columns={"emissions": "emissions_from_land_use_change"}) + + # Land-use change data does not include data for the World. Include it by merging with the global dataset. + tb_land_use = pr.concat( + [ + tb_land_use, + tb_global_emissions.rename( + columns={"global_emissions_from_land_use_change": "emissions_from_land_use_change"} + )[["year", "emissions_from_land_use_change"]] + .dropna() + .assign(**{"country": "World"}), + ], + ignore_index=True, + ).astype({"year": int}) + + # Add land-use change emissions to main table. + tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_land_use, on=["country", "year"], how="outer") + + # Add total emissions (including land-use change) for each country. + tb_co2_with_regions["emissions_total_including_land_use_change"] = ( + tb_co2_with_regions["emissions_total"] + tb_co2_with_regions["emissions_from_land_use_change"] + ) + + # Add region aggregates. + # Aggregate not only emissions data, but also population, GDP, and primary energy. + # This way we ensure that custom regions (e.g. "North America (excl. USA)") will have all required data. 
+ aggregations = {column: "sum" for column in tb_co2_with_regions.columns if column not in ["country", "year"]} + for region in REGIONS: + countries_in_region = geo.list_members_of_region( + region=region, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + additional_regions=REGIONS[region].get("additional_regions", None), + excluded_regions=REGIONS[region].get("excluded_regions", None), + additional_members=REGIONS[region].get("additional_members", None), + excluded_members=REGIONS[region].get("excluded_members", None), + include_historical_regions_in_income_groups=True, + ) + tb_co2_with_regions = geo.add_region_aggregates( + df=tb_co2_with_regions, + region=region, + countries_in_region=countries_in_region, + countries_that_must_have_data=[], + frac_allowed_nans_per_year=0.999, + aggregations=aggregations, + ) + + # Fix consumption emissions for Africa. + tb_co2_with_regions = fix_consumption_emissions_for_africa(tb_co2_with_regions=tb_co2_with_regions) + + # Temporarily add global emissions and global cumulative emissions columns to main table, to be able to calculate + # indicators in terms of global emissions. + tb_co2_with_regions = pr.merge( + tb_co2_with_regions, tb_global_emissions.drop(columns="country"), on=["year"], how="left" + ) + + # Temporarily add certain global emissions variables. + # This is done simply to be able to consider "consumption_emissions" as just another type of emission + # when creating additional variables. + tb_co2_with_regions["global_consumption_emissions"] = tb_co2_with_regions["global_emissions_total"].copy() + tb_co2_with_regions["global_cumulative_consumption_emissions"] = tb_co2_with_regions[ + "global_cumulative_emissions_total" + ].copy() + + # Ensure main table is sorted (so that cumulative emissions are properly calculated). + tb_co2_with_regions = tb_co2_with_regions.sort_values(["country", "year"]).reset_index(drop=True) + + # Add new variables for each source of emissions. + for column in EMISSION_SOURCES + [ + "consumption_emissions", + "emissions_from_land_use_change", + "emissions_total_including_land_use_change", + ]: + # Add per-capita variables. + tb_co2_with_regions[f"{column}_per_capita"] = tb_co2_with_regions[column] / tb_co2_with_regions["population"] + + # Add columns for cumulative emissions. + # Rows that had nan emissions will have nan cumulative emissions. + # But nans will not be propagated in the sum. + # This means that countries with some (not all) nans will have the cumulative sum of the informed emissions + # (treating nans as zeros), but will have nan on those rows that were not informed. + tb_co2_with_regions[f"cumulative_{column}"] = tb_co2_with_regions.groupby(["country"])[column].cumsum() + + # Add share of global emissions. + tb_co2_with_regions[f"{column}_as_share_of_global"] = ( + 100 * tb_co2_with_regions[column] / tb_co2_with_regions[f"global_{column}"] + ) + + # Add share of global cumulative emissions. + tb_co2_with_regions[f"cumulative_{column}_as_share_of_global"] = ( + 100 * tb_co2_with_regions[f"cumulative_{column}"] / tb_co2_with_regions[f"global_cumulative_{column}"] + ) + + # Add total emissions per unit energy (in kg of emissions per kWh). + tb_co2_with_regions["emissions_total_per_unit_energy"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 + * tb_co2_with_regions["emissions_total"] + / (tb_co2_with_regions["primary_energy_consumption"] * TWH_TO_KWH) + ) + + # Add total emissions (including land-use change) per unit energy (in kg of emissions per kWh). 
+ tb_co2_with_regions["emissions_total_including_land_use_change_per_unit_energy"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 + * tb_co2_with_regions["emissions_total_including_land_use_change"] + / (tb_co2_with_regions["primary_energy_consumption"] * TWH_TO_KWH) + ) + + # Add total emissions per unit GDP. + tb_co2_with_regions["emissions_total_per_gdp"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 * tb_co2_with_regions["emissions_total"] / tb_co2_with_regions["gdp"] + ) + + # Add total emissions (including land-use change) per unit GDP. + tb_co2_with_regions["emissions_total_including_land_use_change_per_gdp"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 + * tb_co2_with_regions["emissions_total_including_land_use_change"] + / tb_co2_with_regions["gdp"] + ) + + # Add total consumption emissions per unit GDP. + tb_co2_with_regions["consumption_emissions_per_gdp"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 * tb_co2_with_regions["consumption_emissions"] / tb_co2_with_regions["gdp"] + ) + + # Add variable of emissions embedded in trade. + tb_co2_with_regions["traded_emissions"] = ( + tb_co2_with_regions["consumption_emissions"] - tb_co2_with_regions["emissions_total"] + ) + tb_co2_with_regions["pct_traded_emissions"] = ( + 100 * tb_co2_with_regions["traded_emissions"] / tb_co2_with_regions["emissions_total"] + ) + tb_co2_with_regions["traded_emissions_per_capita"] = ( + tb_co2_with_regions["traded_emissions"] / tb_co2_with_regions["population"] + ) + + # Add annual percentage growth of total emissions. + tb_co2_with_regions["pct_growth_emissions_total"] = ( + tb_co2_with_regions.groupby("country", observed=True)["emissions_total"].pct_change(fill_method=None) * 100 + ) + + # Add annual percentage growth of total emissions (including land-use change). + tb_co2_with_regions["pct_growth_emissions_total_including_land_use_change"] = ( + tb_co2_with_regions.groupby("country")["emissions_total_including_land_use_change"].pct_change(fill_method=None) + * 100 + ) + + # Add annual absolute growth of total emissions. + tb_co2_with_regions["growth_emissions_total"] = tb_co2_with_regions.groupby("country")["emissions_total"].diff() + + # Add annual absolute growth of total emissions (including land-use change). + tb_co2_with_regions["growth_emissions_total_including_land_use_change"] = tb_co2_with_regions.groupby("country")[ + "emissions_total_including_land_use_change" + ].diff() + + # Create variable of population as a share of global population. + tb_co2_with_regions["population_as_share_of_global"] = ( + tb_co2_with_regions["population"] / tb_co2_with_regions["global_population"] * 100 + ) + + # Remove temporary columns of global emissions. + tb_co2_with_regions = tb_co2_with_regions.drop( + columns=[column for column in tb_co2_with_regions.columns if column.startswith("global_")] + ) + + # Empty rows of international transport if international aviation and shipping are already informed. + # First find the list of columns where this happens. + international_entities = [entity for entity in set(tb_co2_with_regions["country"]) if "International" in entity] + check = tb_co2_with_regions[tb_co2_with_regions["country"].isin(international_entities)].reset_index(drop=True) + # Check that the only columns where international transport, aviation and shipping are all informed are columns + # derived from total emissions. 
+ columns_with_redundant_international_emissions = [ + column + for column in check.drop(columns=["country", "year"]).columns + if set(check.dropna(subset=column)["country"]) == set(international_entities) + ] + error = ( + "Unexpected columns where international transport is informed as well as international aviation and shipping." + ) + assert all(["emissions_total" in column for column in columns_with_redundant_international_emissions]), error + # Now for those columns, make international transport nan. + for column in columns_with_redundant_international_emissions: + tb_co2_with_regions.loc[tb_co2_with_regions["country"] == "International transport", column] = np.nan + + # Replace infinity values (for example when calculating growth from zero to non-zero) in the data by nan. + for column in tb_co2_with_regions.drop(columns=["country", "year"]).columns: + tb_co2_with_regions.loc[np.isinf(tb_co2_with_regions[column]), column] = np.nan + + # For special GCP countries/regions (e.g. "Europe (GCP)") we should keep only the original data. + # Therefore, make nan all additional variables for those countries/regions, and keep only GCP's original data. + added_variables = tb_co2_with_regions.drop( + columns=["country", "year"] + COLUMNS_THAT_MUST_HAVE_DATA + ).columns.tolist() + tb_co2_with_regions.loc[ + (tb_co2_with_regions["country"].str.contains(" (GCP)", regex=False)), added_variables + ] = np.nan + + # Remove uninformative rows (those that have only data for, say, gdp, but not for variables related to emissions). + tb_co2_with_regions = tb_co2_with_regions.dropna(subset=COLUMNS_THAT_MUST_HAVE_DATA, how="all").reset_index( + drop=True + ) + + # Ensure that there are no rows that only have nan values. + tb_co2_with_regions = tb_co2_with_regions.dropna( + subset=tb_co2_with_regions.drop(columns=["country", "year"]).columns, how="all" + ) + + return tb_co2_with_regions diff --git a/etl/steps/data/garden/growth/2024-05-16/gdp_historical.py b/etl/steps/data/garden/growth/2024-05-16/gdp_historical.py index fc8e09264f4..71a042c3d0d 100644 --- a/etl/steps/data/garden/growth/2024-05-16/gdp_historical.py +++ b/etl/steps/data/garden/growth/2024-05-16/gdp_historical.py @@ -179,7 +179,7 @@ def create_estimations_from_growth( tb[f"{var}_estimated"] = to_adjust_value * tb[f"{var}_scalar"] # Rename the estimated variables without the suffix - tb[f"{var}"] = tb[f"{var}{to_adjust_var_suffix}"].fillna(tb[f"{var}_estimated"]) + tb[f"{var}"] = tb[f"{var}{to_adjust_var_suffix}"].astype("Float64").fillna(tb[f"{var}_estimated"]) # Specify "World" entity for each row tb["country"] = "World" diff --git a/etl/steps/data/garden/happiness/2024-06-09/happiness.py b/etl/steps/data/garden/happiness/2024-06-09/happiness.py index 3edd2c6b15e..8684a6d65e4 100644 --- a/etl/steps/data/garden/happiness/2024-06-09/happiness.py +++ b/etl/steps/data/garden/happiness/2024-06-09/happiness.py @@ -29,8 +29,8 @@ def run(dest_dir: str) -> None: ds_income_groups = paths.load_dataset("income_groups") # Read table datasets. 
- tb_this_year = ds_meadow["happiness"].reset_index() - tb_prev_years = ds_prev_years["happiness"] + tb_this_year = ds_meadow.read("happiness") + tb_prev_years = ds_prev_years.read("happiness") # combine meadow data with previous years tb_this_year["cantril_ladder_score"] = tb_this_year["ladder_score"] diff --git a/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.meta.yml b/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.meta.yml index 550e22f18f0..1e1ccd860f0 100644 --- a/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.meta.yml +++ b/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.meta.yml @@ -108,7 +108,7 @@ tables: grapher_config: hideAnnotationFieldsInTitle: time: true - hasChartTab: false + chartTypes: [] hasMapTab: true tab: map map: diff --git a/etl/steps/data/garden/hmd/2024-11-19/hfd.countries.json b/etl/steps/data/garden/hmd/2024-11-19/hfd.countries.json new file mode 100644 index 00000000000..77c18eb21c2 --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-11-19/hfd.countries.json @@ -0,0 +1,41 @@ +{ + "AUT": "Austria", + "BEL": "Belgium", + "BGR": "Bulgaria", + "BLR": "Belarus", + "CAN": "Canada", + "CHE": "Switzerland", + "CHL": "Chile", + "CZE": "Czechia", + "DEUTE": "East Germany", + "DEUTNP": "Germany", + "DEUTW": "West Germany", + "DNK": "Denmark", + "ESP": "Spain", + "EST": "Estonia", + "FRATNP": "France", + "FIN": "Finland", + "GBRTENW": "England & Wales", + "GBR_NIR": "Northern Ireland", + "GBR_SCO": "Scotland", + "GBR_NP": "United Kingdom", + "HRV": "Croatia", + "HUN": "Hungary", + "IRL": "Ireland", + "ISL": "Iceland", + "ITA": "Italy", + "JPN": "Japan", + "KOR": "South Korea", + "LTU": "Lithuania", + "NLD": "Netherlands", + "NOR": "Norway", + "POL": "Poland", + "PRT": "Portugal", + "RUS": "Russia", + "SVK": "Slovakia", + "SVN": "Slovenia", + "SWE": "Sweden", + "TWN": "Taiwan", + "UKR": "Ukraine", + "USA": "United States" +} diff --git a/etl/steps/data/garden/hmd/2024-11-19/hfd.meta.yml b/etl/steps/data/garden/hmd/2024-11-19/hfd.meta.yml new file mode 100644 index 00000000000..0e02510f341 --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-11-19/hfd.meta.yml @@ -0,0 +1,353 @@ +# NOTE: To learn more about the fields, hover over their names. 
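+# NOTE: The <% ... %> blocks and << ... >> placeholders used throughout this file are Jinja templates; they are +# rendered per indicator dimension (e.g. birth_order, age) to produce the final titles and descriptions.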
+definitions: + others: + bo_1: |- + <% if birth_order == '1' %> + first + <%- elif birth_order == '2' %> + second + <%- elif birth_order == '3' %> + third + <%- elif birth_order == '4' %> + fourth + <%- elif birth_order == '5p' %> + fifth (or greater) + <% endif %> + bo_1_m1: |- + <% if birth_order == '2' %> + second + <%- elif birth_order == '3' %> + third + <%- elif birth_order == '4' %> + fourth + <%- elif birth_order == '5p' %> + fifth (or greater) + <% endif %> + title: |- + <% if birth_order == 'total' %> + << title >> - Total + <%- elif birth_order == '5p' %> + << title >> - Birth order: ≥5 + <%- else %> + << title >> - Birth order: << birth_order >> + <%- endif %> + title_age: |- + <% set age_str = '≤12' if age == '12-' else age %> + <% if birth_order == 'total' %> + << title >> - Mother age: << age_str >> - All births + <% elif birth_order == '5p' %> + << title >> - Mother age: << age_str >> - Birth order: ≥5 + <%- else %> + << title >> - Mother age: << age_str >> - Birth order: << birth_order >> + <%- endif %> + common: + presentation: + topic_tags: + - Fertility Rate + display: + numDecimalPlaces: 2 + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + title: Human Fertility Database + +tables: + period: + variables: + tfr: + title: |- + <% set title = "Period total fertility rate" %> + {definitions.others.title} + description_short: |- + The average number of{definitions.others.bo_1} children a woman would have in her lifetime if she experienced the fertility rates of a specific year. + unit: "births per woman" + description_key: + - Assumes current age-specific fertility rates remain constant throughout a woman's lifetime. + - Does not account for potential changes in social, economic, or health conditions that could affect fertility rates. + + tfr40: + title: |- + <% set title = "Period total fertility rate by age 40" %> + {definitions.others.title} + description_short: |- + The average number of{definitions.others.bo_1} children a woman would have by age 40 if she experienced the fertility rates of a specific year. + unit: "births per woman" + description_key: + - Useful for understanding early and mid-reproductive age fertility patterns. + + adjtfr: + title: |- + <% set title = "Tempo-adjusted total fertility rate" %> + {definitions.others.title} + description_short: |- + <% if birth_order == 'total' %> + The total fertility rate adjusted to account for delays or advances in childbearing. + <%- else %> + The total fertility rate for{definitions.others.bo_1} births adjusted to account for delays or advances in having a{definitions.others.bo_1} child. + <%- endif %> + unit: "births per woman" + description_key: + - The TFR has been adjusted using a method proposed by [Bongaarts-Feeney](https://en.wikipedia.org/wiki/Sub-replacement_fertility). It sums order-specific TFRs and adjusts for changes in the mean age of order-specific fertility schedule. + - The tempo-adjusted TFR adjusts for timing shifts in childbearing, such as postponement of births. + - The tempo-adjusted TFR helps to distinguish between changes in the number of children women have and changes in the timing of when they have them. + - The tempo-adjusted TFR often displays large year-to-year fluctuations ([Sobotka 2003](https://www.demographic-research.org/articles/volume/8/6)), which can make its use for specific years problematic. 
Therefore, three- or five-year moving averages are often used to smooth out fluctuations. + - Requires careful interpretation, as the adjustment is based on specific assumptions about timing effects. + + patfr: + title: |- + <% set title = "Parity- and age-adjusted total fertility rate" %> + {definitions.others.title} + description_short: |- + <% if birth_order == 'total' %> + The total fertility rate adjusted for both the age of the mother and the number of children she already has. + <%- else %> + The total fertility rate for{definitions.others.bo_1} births adjusted for the age of the mother. + <%- endif %> + unit: "births per woman" + description_key: + - The Parity- and age-adjusted TFR accounts for both the age structure and parity distribution of the female population. + + # Mean ages at birth, and at 40 + mab: + title: |- + <% set title = "Period mean ages at birth" %> + {definitions.others.title} + description_short: |- + <% if birth_order == 'total' %> + The average age of mothers when they give birth in a specific year. It is standardized for the age structure of the female population of reproductive age. + <%- else %> + The average age of mothers when they have their{definitions.others.bo_1} child in a specific year. It is standardized for the age structure of the female population of reproductive age. + <%- endif %> + unit: "years" + + mab40: + title: |- + <% set title = "Period mean ages at birth by age 40" %> + {definitions.others.title} + description_short: |- + <% if birth_order == 'total' %> + The average age of mothers under 40 when they give birth. It is standardized for the age structure of the female population of reproductive age. + <%- else %> + The average age of mothers under 40 when they have their{definitions.others.bo_1} child in a specific year. It is standardized for the age structure of the female population of reproductive age. + <%- endif %> + unit: "years" + description_key: + - Focuses on childbearing occurring before age 40, providing insights into early fertility patterns. + + sdmab: + title: |- + <% set title = "Standard deviation in period mean ages at birth" %> + {definitions.others.title} + description_short: |- + <% if birth_order == 'total' %> + Variation in the ages of mothers when they give birth in a specific year. + <%- else %> + Variation in the ages of mothers when they have their{definitions.others.bo_1} child in a specific year. + <%- endif %> + unit: "years" + description_key: + - Measures the spread of ages at which women give birth. + - A low standard deviation indicates that most births occur around the same age, while a high standard deviation suggests a wider range of ages. + - Useful for understanding the diversity of reproductive timing. + + sdmab40: + title: |- + <% set title = "Standard deviation in period mean ages at birth by age 40" %> + {definitions.others.title} + description_short: |- + <% if birth_order == 'total' %> + Variation in the ages of mothers under 40 when they give birth. + <%- else %> + Variation in the ages of mothers under 40 when they have their{definitions.others.bo_1} child in a specific year. + <%- endif %> + unit: "years" + description_key: + - Reflects variability in the timing of births up to age 40. + - Helps to understand how concentrated or spread out early childbearing is within the population. 
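The titles and descriptions in this metadata file are Jinja templates, rendered once per combination of dimension values; `<< ... >>` delimits expressions and `<% ... %>` delimits control blocks. A minimal sketch of how the `title` definition above expands, assuming stock jinja2 with those delimiters swapped in (the actual rendering happens inside the ETL's metadata machinery):

import jinja2

# Recreate the OWID-style delimiters on a plain jinja2 environment
# (an assumption for illustration; the ETL configures this internally).
env = jinja2.Environment(
    block_start_string="<%",
    block_end_string="%>",
    variable_start_string="<<",
    variable_end_string=">>",
)
title_template = env.from_string(
    "<% if birth_order == 'total' %>"
    "<< title >> - Total"
    "<%- elif birth_order == '5p' %>"
    "<< title >> - Birth order: ≥5"
    "<%- else %>"
    "<< title >> - Birth order: << birth_order >>"
    "<%- endif %>"
)
# Prints: Period total fertility rate - Birth order: 2
print(title_template.render(title="Period total fertility rate", birth_order="2"))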
+ + tmab: + title: |- + <% set title = "Period table mean ages at birth" %> + {definitions.others.title} + description_short: |- + <% if birth_order == 'total' %> + The average age of mothers at birth, considering the number of children they already have. + <%- else %> + The average age of mothers at the birth of their{definitions.others.bo_1} child. + <%- endif %> + unit: "years" + description_key: + - Derived from fertility tables that account for both age and parity. + - Provides a more detailed understanding of the timing of births across different birth orders. + + # Births and birth rate + cbr: + title: |- + <% set title = "Crude birth rate" %> + {definitions.others.title} + description_short: |- + The number of{definitions.others.bo_1} live births per 1,000 people in a given year. It is not standardized for the age structure of the population. + unit: "births per 1,000 people" + description_key: + - The rate is calculated by dividing the total number of{definitions.others.bo_1} live births occurring in a given year by person-years lived by all population in that year. + - Simple measure of birth intensity, not adjusted for age or parity. + - It is influenced by the age structure of the population, which can make comparisons between populations challenging. + + b: + title: |- + <% set title = "Total live births" %> + {definitions.others.title} + description_short: The total number of{definitions.others.bo_1} live births recorded in a given year. + unit: "births" + description_key: + - Represents the total number of{definitions.others.bo_1} live births without considering age or parity. + - Serves as a foundational statistic for calculating other fertility measures. + - Can be influenced by population size and structure. + display: + numDecimalPlaces: 0 + + cohort: + variables: + # Fertility + + ccf: + title: |- + <% set title = "Completed cohort fertility rate" %> + {definitions.others.title} + description_short: |- + <% if birth_order == 'total' %> + The average number of children born to women in a specific cohort over their lifetime. + <%- else %> + The average number of{definitions.others.bo_1} children born to women in a specific cohort over their lifetime. + <%- endif %> + unit: "births per woman" + description_key: + - Represents the completed fertility of a cohort by the end of their reproductive years. + - Useful for comparing fertility across different cohorts. + - Calculated from age-specific fertility rates observed throughout the cohort's reproductive lifespan. + + ccf40: + title: |- + <% set title = "Completed cohort fertility rate by the age 40" %> + {definitions.others.title} + description_short: |- + <% if birth_order == 'total' %> + The average number of children born to women under 40 in a specific cohort. + <%- else %> + The average number of{definitions.others.bo_1} children born to women under 40 in a specific cohort. + <%- endif %> + unit: "births per woman" + description_key: + - Represents the fertility of a cohort up to age 40. + - Provides insights into early and mid-reproductive age fertility patterns. + + # Mean age at birth + cmab: + title: |- + <% set title = "Cohort mean ages at birth" %> + {definitions.others.title} + description_short: |- + <% if birth_order == 'total' %> + The average age at which women in a specific birth cohort have children. It is standardized for the age structure of the female population of reproductive age. + <%- else %> + The average age at which women in a specific birth cohort have their{definitions.others.bo_1} child. 
It is standardized for the age structure of the female population of reproductive age. + <%- endif %> + unit: "years" + description_key: + - Represents the average age at which women from a particular cohort give birth. + - Helps understand generational differences in fertility timing. + - Calculated from observed fertility rates across the reproductive lifespan of the cohort. + + cmab40: + title: |- + <% set title = "Cohort mean ages at birth by the age 40" %> + {definitions.others.title} + description_short: |- + <% if birth_order == 'total' %> + The average age at which women under 40 in a specific cohort have children. + <%- else %> + The average age at which women under 40 in a specific cohort have their{definitions.others.bo_1} child. + <%- endif %> + unit: "years" + description_key: + - Focuses on births occurring before age 40, providing insights into early fertility patterns for the cohort. + - Useful for comparing early reproductive behavior across different cohorts. + + sdcmab: + title: |- + <% set title = "Standard deviation in cohort mean ages at birth" %> + {definitions.others.title} + description_short: |- + <% if birth_order == 'total' %> + Variation in the ages at which women in a specific cohort have children. + <%- else %> + Variation in the ages at which women in a specific cohort have their{definitions.others.bo_1} child. + <%- endif %> + unit: "years" + description_key: + - Measures the spread of ages at which women in a cohort give birth. + - A lower value indicates concentrated timing, while a higher value suggests more variation in childbearing ages. + - Helps to understand the diversity of reproductive timing within a cohort. + + sdcmab40: + title: |- + <% set title = "Standard deviation in cohort mean ages at 40" %> + {definitions.others.title} + description_short: |- + <% if birth_order == 'total' %> + How much variation there is in the ages at which women in a specific cohort have children, calculated only up to age 40. + <%- else %> + How much variation there is in the ages at which women in a specific cohort have their{definitions.others.bo_1} child, calculated only up to age 40. + <%- endif %> + unit: "years" + description_key: + - Reflects variability in the timing of births up to age 40 within a cohort. + - Helps to understand how concentrated or spread out early childbearing is within the cohort. + + ppr: + title: |- + Cohort parity progression ratio - << (birth_order | int) - 1 >> to << (birth_order | int) >> birth + description_short: |- + <% if birth_order == '1' %> + Probability of giving birth to a first child. + <%- elif birth_order == '2' %> + Probability of giving birth to a second child, conditioned on having had a first child. + <%- elif birth_order == '3' %> + Probability of giving birth to a third child, conditioned on having had a second child. + <%- elif birth_order == '4' %> + Probability of giving birth to a fourth child, conditioned on having had a third child. + <% endif %> + unit: "" + description_key: + - Measures the likelihood that a woman with a given number of children will go on to have another child. + - It is useful for understanding family-building dynamics and changes in reproductive behavior over time. + + period_ages: + variables: + asfr_period: + title: |- + <% set title = "Period fertility rate" %> + {definitions.others.title_age} + description_short: |- + Age-specific fertility rates for each calendar year, measured in completed years of age. 
+ unit: "births per woman" + description_key: + - Represents fertility rates for each age group in a specific year. + - Useful for detailed analysis of fertility patterns by both age and year. + - Presented in the form of Lexis squares, which provide a snapshot of fertility behavior over time. + + cohort_ages: + variables: + asfr_cohort: + title: |- + <% set title = "Cohort fertility rate" %> + {definitions.others.title_age} + unit: "births per woman" + description_short: |- + Age-specific fertility rates for women in a specific birth cohort, measured by their age in completed years. + description_key: + - Represents fertility rates for a specific cohort as they age. + - Useful for understanding how fertility behavior changes across different cohorts over time. + - Presented in the form of horizontal parallelograms, allowing for the tracking of cohort-specific fertility patterns. diff --git a/etl/steps/data/garden/hmd/2024-11-19/hfd.py b/etl/steps/data/garden/hmd/2024-11-19/hfd.py new file mode 100644 index 00000000000..2ad25b7d3dd --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-11-19/hfd.py @@ -0,0 +1,772 @@ +"""There are various input tables; this step brings them together. + + +The comments below summarize the various columns in all tables. These comments are meant for maintenance, and not for the end-user. They try to: + + - Group the various tables into categories, depending on the primary index columns that they have. + - Each group is separated from the next one with '----'. + - I've tagged each category with a number + letter. E.g. "2y" meaning it has two primary indices, and one is 'year'. 'c' stands for 'cohort'. + - '*y' categories likely contain period indicators. '*c' categories likely contain cohort indicators. + - Within each group, I've listed the names of the original tables, along with their header (description of its indicators). I've also added the list of columns (indicators that the table presents). + - At the end I've added a section '=> OUTPUT' which tries to summarize the format of the output & consolidated table.
+ +------------ +P 2y: code, year [PERIOD] + + x adjtfrRR + Tempo-adjusted total fertility rates, Bongaarts-Feeney method + adjTFR + x adjtfrRRbo + Tempo-adjusted total fertility rates by birth order, Bongaarts-Feeney method + adjTFR adjTFR1 adjTFR2 adjTFR3 adjTFR4 adjTFR5p + + cbrRR + Crude birth rate + CBR + cbrRRbo + Crude birth rate by birth order + CBR CBR1 CBR2 CBR3 CBR4 CBR5p + + x mabRR + Period mean ages at birth and period mean ages at birth by age 40 + MAB MAB40 + x mabRRbo + Period mean ages at birth by birth order and period mean ages at birth by birth order by age 40 + MAB1 MAB2 MAB3 MAB4 MAB5p MAB40_1 MAB40_2 MAB40_3 MAB40_4 MAB40_5p + + patfr + Parity- and age-adjusted total fertility rate + PATFR PATFR1 PATFR2 PATFR3 PATFR4 PATFR5p + patfrc + Parity- and age-adjusted total fertility rate (based on parity distribution from population census) + PATFR PATFR1 PATFR2 PATFR3 PATFR4 PATFR5p + + x pmab + Period table mean ages at birth by birth order + TMAB TMAB1 TMAB2 TMAB3 TMAB4 TMAB5p + x pmabc + Period table mean ages at birth by birth order (based on parity distribution from population census) + TMAB TMAB1 TMAB2 TMAB3 TMAB4 TMAB5p + + x sdmabRR + Standard deviation in period mean ages at birth and standard deviation in period mean ages at birth by age 40 + sdMAB sdMAB40 + x sdmabRRbo + Standard deviation in period mean ages at birth by birth order and standard deviation in period mean ages at birth by birth order by age 40 + sdMAB sdMAB1 sdMAB2 sdMAB3 sdMAB4 sdMAB5p sdMAB40 sdMAB40_1 sdMAB40_2 sdMAB40_3 sdMAB40_4 sdMAB40_5p + + x tfrRR + Period total fertility rates and period total fertility rates by age 40 + TFR TFR40 + x tfrRRbo + Period total fertility rates by birth order and period total fertility rates by birth order by age 40 + TFR TFR1 TFR2 TFR3 TFR4 TFR5p TFR40 TFR40_1 TFR40_2 TFR40_3 TFR40_4 TFR40_5p + + x totbirthsRR + Total live births + Total + x totbirthsRRbo + Total live births by birth order + Total B1 B2 B3 B4 B5p + + => OUTPUT: + columns + adjTFR + CBR + MAB + MAB40 + sdMAB + sdMAB40 + TFR + TFR40 + PATFR + PATFR_c + TMAB + TMAB_c + B + dimensions + code, year, birth_order + +------------ +P 3y: code, year, age [PERIOD] + + asfrRR + Period fertility rates by calendar year and age (Lexis squares, age in completed years (ACY)) + ASFR + asfrRRbo + Period fertility rates by calendar year, age and birth order (Lexis squares, age in completed years (ACY)) + ASFR ASFR1 ASFR2 ASFR3 ASFR4 ASFR5p + + ? birthsRR + Live births by calendar year and age (Lexis squares, age in completed years (ACY)) + Total + ? 
birthsRRbo + Live births by calendar year, age and birth order (Lexis squares, age in completed years (ACY)) + Total B1 B2 B3 B4 B5p + + cpfrRR + Cumulative period fertility rates (Lexis squares) + CPFR + cpfrRRbo + Cumulative period fertility rates by birth order (Lexis squares) + CPFR CPFR1 CPFR2 CPFR3 CPFR4 CPFR5p + + exposRR + Female population exposure by calendar year and age (Lexis squares, age in completed years (ACY)) + Exposure + exposRRpa + Female exposure to risk by calendar year, age and parity + E0x E1x E2x E3x E4px + exposRRpac + Female exposure to risk by calendar year, age and parity (based on parity distribution from population census) + E0x E1x E2x E3x E4px + + mi + Conditional fertility rates by calendar year, age and birth order + m1x m2x m3x m4x m5px + mic + Conditional fertility rates by calendar year, age and birth order (based on parity distribution from population census) + m1x m2x m3x m4x m5px + + => OUTPUT: + columns + ASFR + B + CPFR + expos + expos_c + mi + dimensions + code, year, age, parity + +------------ +4y: code, year, age, cohort [PERIOD] + + asfrTR + Period fertility rates by calendar year, age and birth cohort (Lexis triangles) + ASFR + asfrTRbo + Period fertility rates by calendar year, age, birth cohort and birth order (Lexis triangles) + ASFR ASFR1 ASFR2 ASFR3 ASFR4 ASFR5p + + ? birthsTR + Live births by calendar year, age and birth cohort (Lexis triangles) + Total + ? birthsTRbo + Live births by calendar year, age, birth cohort and birth order (Lexis triangles) + Total B1 B2 B3 B4 B5p + + exposTR + Female population exposure by calendar year, age and birth cohort (Lexis triangles) + Exposure + + => OUTPUT + columns + ASFR + B + E + dimensions + code, year, age, cohort, parity + +------------ +C 2c: code, cohort [COHORT] + + x mabVH + Cohort mean ages at birth and cohort mean ages at birth by age 40 + CMAB CMAB40 + x mabVHbo + Cohort mean ages at birth by birth order and cohort mean ages at birth by birth order by age 40 + CMAB CMAB1 CMAB2 CMAB3 CMAB4 CMAB5p CMAB40 CMAB40_1 CMAB40_2 CMAB40_3 CMAB40_4 CMAB40_5p + + pprVHbo + Cohort parity progression ratios + PPR0_1 PPR1_2 PPR2_3 PPR3_4 + + sdmabVH + Standard deviation in cohort mean ages at birth and standard deviation in cohort mean ages at birth by age 40 + sdCMAB sdCMAB40 + sdmabVHbo + Standard deviation in cohort mean ages at birth by birth order and standard deviation in cohort mean ages at birth by birth order by age 40 + sdCMAB sdCMAB1 sdCMAB2 sdCMAB3 sdCMAB4 sdCMAB5p sdCMAB40 sdCMAB40_1 sdCMAB40_2 sdCMAB40_3 sdCMAB40_4 sdCMAB40_5p + + x tfrVH + Completed cohort fertility and completed cohort fertility by age 40 + CCF CCF40 + x tfrVHbo + Completed cohort fertility by birth order and completed cohort fertility by birth order by age 40 + CCF CCF1 CCF2 CCF3 CCF4 CCF5p CCF40 CCF40_1 CCF40_2 CCF40_3 CCF40_4 CCF40_5p + + => OUTPUT + columns + CMAB + PPR + sdCMAB + sdCMAB40 + CCF + CCF40 + dimensions + code, cohort, parity + +------------ +C 3c: code, cohort, age [COHORT] + asfrVH + Cohort fertility rates by birth cohort and age (horizontal parallelograms, age in completed years (ACY)) + ASFR + asfrVHbo + Cohort fertility rates by birth cohort, age and birth order (horizontal parallelograms, age in completed years (ACY)) + ASFR ASFR1 ASFR2 ASFR3 ASFR4 ASFR5p + + birthsVH + Live births by birth cohort and age (horizontal parallelograms, age in completed years (ACY)) + Total + birthsVHbo + Live births by birth cohort, age and birth order (horizontal parallelograms, age in completed years (ACY)) + 
Total B1 B2 B3 B4 B5p + + ccfrVH + Cumulative cohort fertility rates (horizontal parallelograms) + CCFR + ccfrVHbo + Cumulative cohort fertility rates by birth order (horizontal parallelograms) + CCFR CCFR1 CCFR2 CCFR3 CCFR4 CCFR5p + + exposVH + Female population exposure by birth cohort and age (horizontal parallelograms, age in completed years (ACY)) + Exposure + + => OUTPUT + columns + ASFR + B + CCFR + E + dimensions + code, cohort, age, parity + +------------ +C 3x COHORT FERTILITY TABLES: code, cohort, x [COHORT] + cft + Cohort fertility tables, birth orders 1 to 5+ + b1x l0x m1x q1x Sb1x b2x l1x m2x q2x Sb2x b3x l2x m3x q3x Sb3x b4x l3x m4x q4x Sb4x b5px l4x m5px q5px Sb5px chix + + => OUTPUT + columns + b1x + l0x + m1x + q1x + Sb1x + b2x + l1x + m2x + q2x + Sb2x + b3x + l2x + m3x + q3x + Sb3x + b4x + l3x + m4x + q4x + Sb4x + b5px + l4x + m5px + q5px + Sb5px + chix + dimensions + code, cohort, x + +------------ +P 3X PERIOD FERTILITY TABLES: code, year, x [PERIOD] + pft + Period fertility tables, birth orders 1 to 5+ + w0x m1x q1x l0x b1x L0x Sb1x w1x m2x q2x l1x b2x L1x Sb2x w2x m3x q3x l2x b3x L2x Sb3x w3x m4x q4x l3x b4x L3x Sb4x w4x m5px q5px l4x b5px L4x Sb5px + + pftc + Census-based period fertility tables, birth orders 1 to 5+ + w0x m1x q1x l0x b1x L0x Sb1x w1x m2x q2x l1x b2x L1x Sb2x w2x m3x q3x l2x b3x L2x Sb3x w3x m4x q4x l3x b4x L3x Sb4x w4x m5px q5px l4x b5px L4x Sb5px + + => OUTPUT + columns + w0x + m1x + q1x + l0x + b1x + L0x + Sb1x + w1x + m2x + q2x + l1x + b2x + L1x + Sb2x + w2x + m3x + q3x + l2x + b3x + L2x + Sb3x + w3x + m4x + q4x + l3x + b4x + L3x + Sb4x + w4x + m5px + q5px + l4x + b5px + L4x + Sb5px + dimensions + code, year, x + +------------ +C 4A: code, year, cohort, ardy [COHORT] + asfrVV + Period fertility rates by calendar year, age reached during the year (ARDY) and birth cohort (vertical parallelograms) + ASFR + asfrVVbo + Period fertility rates by calendar year, age reached during the year (ARDY), birth cohort and birth order (vertical parallelograms) + ASFR ASFR1 ASFR2 ASFR3 ASFR4 ASFR5p + + birthsVV + Live births by calendar year, age reached during the year (ARDY) and birth cohort (vertical parallelograms) + Total + birthsVVbo + Live births by calendar year, age reached during the year (ARDY), birth cohort and birth order (vertical parallelograms) + Total B1 B2 B3 B4 B5p + + cpfrVV + Cumulative period fertility rates (vertical parallelograms) + CPFR + cpfrVVbo + Cumulative period fertility rates by birth order (vertical parallelograms) + CPFR CPFR1 CPFR2 CPFR3 CPFR4 CPFR5p + + exposVV + Female population exposure by calendar year, age reached during the year (ARDY) and birth cohort (vertical parallelograms) + Exposure + + => OUTPUT + columns + ASFR + B + CPFR + E + dimensions + code, year, cohort, ardy +""" + +import re + +import owid.catalog.processing as pr + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + + +# CONFIG +COLUMN_RAW = "column_raw" +COLUMN_IND = "indicator_name" +COLUMNS_RENAME = { + "totbirthsrr": { + "total": "b", + } +} + +# Tables to process for PERIOD country-year +TABLES_PERIOD = [ + "adjtfrrr", + "cbrrr", + "mabrr", + "patfr", + # "patfrc", + "pmab", + # "pmabc", + "sdmabrr", + "tfrrr", + "totbirthsrr", +] +TABLES_PERIOD_W_PARITY = { + "patfr": { + "indicators": ["patfr"], + }, + # "patfrc", + "pmab": { + "indicators": ["tmab"], + }, + # "pmabc", +} +REGEX_PERIOD_BO = {} + +# Tables to process for COHORT country-cohort +TABLES_COHORT = [ + "mabvh", + "pprvhbo", + "sdmabvh", + "tfrvh", +] +TABLES_COHORT_W_PARITY = { + "pprvhbo": { + "indicators": ["ppr"], + }, +} +REGEX_COHORT_BO = { + "pprvhbo": { + "ppr": r"^ppr\d+_\d+$", + }, +} +# Tables to process for PERIOD country-year-age +TABLES_PERIOD_AGE = [ + "asfrrr", +] +TABLES_PERIOD_AGE_W_PARITY = {} +REGEX_PERIOD_AGE_BO = {} +# Tables to process for COHORT country-year-age +TABLES_COHORT_AGE = [ + "asfrvh", +] +TABLES_COHORT_AGE_W_PARITY = {} +REGEX_COHORT_AGE_BO = {} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("hfd") + + # 1/ Read period tables + consolidate in one table tb_period + ## Initial definitions + cols_index = ["country", "year"] + col_bo = "birth_order" + ## Read tables + tbs = make_table_list( + ds_meadow=ds_meadow, + table_names=TABLES_PERIOD, + tables_w_parity=TABLES_PERIOD_W_PARITY, + cols_index=cols_index, + col_bo=col_bo, + regex_bo=REGEX_PERIOD_BO, + ) + ## Merge + tb_period = consolidate_table_from_list(tbs, cols_index + [col_bo], "period") + + # 2/ Read cohort tables + consolidate in one table tb_cohort + ## Initial definitions + cols_index = ["country", "cohort"] + col_bo = "birth_order" + ## Read tables + tbs = make_table_list( + ds_meadow=ds_meadow, + table_names=TABLES_COHORT, + tables_w_parity=TABLES_COHORT_W_PARITY, + cols_index=cols_index, + col_bo=col_bo, + regex_bo=REGEX_COHORT_BO, + ) + # Quick fix: change birth_order label for PPR + tbs = _fix_ppr(tbs) + ## Merge + tb_cohort = consolidate_table_from_list(tbs, cols_index + [col_bo], "cohort") + + # 3/ Period tables (by age) + cols_index = ["country", "year", "age"] + col_bo = "birth_order" + ## Read tables + tbs = make_table_list( + ds_meadow=ds_meadow, + table_names=TABLES_PERIOD_AGE, + tables_w_parity=TABLES_PERIOD_AGE_W_PARITY, + cols_index=cols_index, + col_bo=col_bo, + regex_bo=REGEX_PERIOD_AGE_BO, + ) + ## Consolidate + tb_period_ages = consolidate_table_from_list( + tbs=tbs, + cols_index_out=cols_index + [col_bo], + short_name="period_ages", + fcn=keep_relevant_ages, + ) + tb_period_ages = tb_period_ages.rename( + columns={ + "asfr": "asfr_period", + } + ) + + # 4/ Cohort tables (by age) + cols_index = ["country", "cohort", "age"] + col_bo = "birth_order" + ## Read tables + tbs = make_table_list( + ds_meadow=ds_meadow, + table_names=TABLES_COHORT_AGE, + tables_w_parity=TABLES_COHORT_AGE_W_PARITY, + cols_index=cols_index, + col_bo=col_bo, + regex_bo=REGEX_COHORT_AGE_BO, + check_integration=False, + check_integration_limit=143, + ) + ## Consolidate + tb_cohort_ages = consolidate_table_from_list( + tbs=tbs, + cols_index_out=cols_index + [col_bo], + short_name="cohort_ages", + fcn=keep_relevant_ages, + ) + tb_cohort_ages = tb_cohort_ages.rename( + columns={ + "asfr": "asfr_cohort", + } + ) + + # + # Process data. + # + tables = [ + tb_period, + tb_cohort, + tb_period_ages, + tb_cohort_ages, + ] + + # + # Save outputs. 
+ # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=tables, + check_variables_metadata=True, + default_metadata=ds_meadow.metadata, + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def make_table_list( + ds_meadow, + table_names, + tables_w_parity, + cols_index, + col_bo, + regex_bo=None, + check_integration=True, + check_integration_limit=None, +): + """Reads relevant tables, and formats them accordingly. + + Tables come in wide format, sometimes as two-tables (main and birth order). This function consolidates them into single tables per topic. + + For instance, we have one table with total fertility rates (columns `tfr`). And then another one with fertilities broken down by birth order (columns `tfr`, `tfr1`, etc.) + Instead, we want a table in long format, which has one column `tfr` and adds the birth order as a dimension of the table. + """ + if regex_bo is None: + regex_bo = {} + + tbs = [] + for tname in table_names: + # Get custom regex for this table + regex = regex_bo.get(tname) + + # Read main table + tb = read_table(ds_meadow, tname) + + # Check if there is a birth order table for this indicator(s). If so, process it and integrate it to the main table + tname_bo = tname + "bo" + if tname_bo in ds_meadow.table_names: + # Read BO table + tb_bo = read_table(ds_meadow, tname_bo, tname) + # Get list of core indicators: These are the names of the columns that are actual indicators (and not dimensional indicators, e.g. `tfr1`, `tfr2`, etc.) + core_indicators = [col for col in tb.columns.intersection(tb_bo.columns) if col not in cols_index] + # Add BO to main table + tb = integrate_bo( + tb=tb, + tb_bo=tb_bo, + cols_index=cols_index, + core_indicators=core_indicators, + check=check_integration, + check_limit_wrong=check_integration_limit, + ) + # Consolidate table: Use long format, and add birth_order as a dimension of the main table. + tb = make_table_with_birth_order(tb, cols_index, core_indicators, col_bo, regex) + # Sometimes, the main table contains already indicators broken down by birth order. In such cases, we also need to reshape the table. + elif tname in tables_w_parity: + core_indicators = tables_w_parity[tname]["indicators"] + tb = make_table_with_birth_order(tb, cols_index, core_indicators, col_bo, regex) + + # Add formatted table to the list of tables. + tbs.append(tb) + + return tbs + + +def read_table(ds_meadow, tname, tname_base=None): + """Read table from dataset and minor cleaning: + + - Rename columns if applicable + - Harmonize country names + """ + # Read table + tb = ds_meadow.read(tname) + + # Rename columns + if tname_base is None: + tname_base = tname + if tname_base in COLUMNS_RENAME: + tb = tb.rename(columns=COLUMNS_RENAME[tname_base]) + + # Harmonize country names + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + country_col="code", + warn_on_unused_countries=False, + ) + + # Rename country column + tb = tb.rename(columns={"code": "country"}) + + return tb + + +def integrate_bo(tb, tb_bo, cols_index, core_indicators, check=True, check_limit_wrong=None): + """Merge main table with its BO counterpart. + + Some tables have a secondary table which provides the same core indicators but by birth order. + """ + # Outer join + tb = tb.merge( + tb_bo, + on=cols_index, + suffixes=["", "__bo"], + how="outer", + ) + + # Integrate core indicators + # It can happen that one table has more values for the core indicator. 
We solve this with fillna calls. + for col in core_indicators: + # Check that we can integrate them! + TOLERANCE = 5e-3 + if check: + assert ( + (((tb[col] - tb[f"{col}__bo"]) / tb[col]).dropna().abs() < TOLERANCE).all() + ), f"Integration failed for {col}. Core indicator is not equivalent between main and `bo` tables." + elif check_limit_wrong is not None: + num = (~(((tb[col] - tb[f"{col}__bo"]) / tb[col]).dropna().abs() < TOLERANCE)).sum() + assert ( + num == check_limit_wrong + ), f"Integration failed for {col}. Found {num} mismatches, expected exactly {check_limit_wrong}!" + # Actual integration + tb[col] = tb[col].fillna(tb[f"{col}__bo"]) + tb = tb.drop(columns=[f"{col}__bo"]) + + return tb + + +def make_table_with_birth_order(tb, cols_index, core_indicators, col_bo="birth_order", regex_bo=None): + """Change the format of a table from wide to long, to incorporate the birth order as a dimension.""" + + if regex_bo is None: + regex_bo = {} + + def _generate_regex(name): + if re.search(r"\d$", string=name): # Check if the name ends with a number + return rf"^{name}_?(\d+|(\d+p)?)$" + else: + return rf"^{name}(\d+|(\d+p)?)$" + + regex_patterns = {name: regex_bo.get(name, _generate_regex(name)) for name in core_indicators} + + tb = tb.melt( + cols_index, + var_name=COLUMN_RAW, + value_name="value", + ) + + tb["indicator_name"] = None + tb[col_bo] = None + for name, pattern in regex_patterns.items(): + # print(f"> {name}") + + # Set indicator name + flag_0 = (~tb[COLUMN_RAW].isin(core_indicators)) | (tb[COLUMN_RAW] == name) + flag = tb[COLUMN_RAW].str.match(pattern) & flag_0 + assert tb.loc[flag, COLUMN_IND].isna().all(), "Multiple columns assigned to the same indicator!" + tb.loc[flag, COLUMN_IND] = name + + # Get birth order + tb.loc[flag, col_bo] = tb.loc[flag, COLUMN_RAW].replace({f"{name}_?": ""}, regex=True) + tb.loc[tb[COLUMN_RAW] == name, col_bo] = "total" + + # Sanity check + assert tb[COLUMN_IND].notna().all(), "Some NaNs found in column `indicator_name`" + assert tb[col_bo].notna().all(), f"Some NaNs found in column `{col_bo}`" + + # Final reshape + tb = tb.drop(columns=[COLUMN_RAW]) + tb = tb.pivot(index=cols_index + [col_bo], columns=COLUMN_IND, values="value").reset_index() + tb = tb.rename_axis(None, axis=1) + + # Drop NaNs + tb = tb.dropna(subset=core_indicators) + + return tb + + +def consolidate_table_from_list(tbs, cols_index_out, short_name, fcn=None) -> geo.Table: + ## Sanity check: no column is named the same + _sanity_check_colnames(tbs, cols_index_out) + + # Merge + tb = pr.multi_merge(tbs, on=cols_index_out, how="outer") + + # Optional function + if fcn is not None: + tb = fcn(tb) + + # Format + tb = tb.format(cols_index_out, short_name=short_name) + return tb + + +def _fix_ppr(tbs): + for tb in tbs: + if tb.m.short_name == "pprvhbo": + tb["birth_order"] = tb["birth_order"].str.split("_").str[-1] + return tbs + + +def _sanity_check_colnames(tbs, cols_index_out): + colnames = [col for t in tbs for col in t.columns if col not in cols_index_out] + assert len(colnames) == len(set(colnames)), "Some columns are named the same!"
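For intuition, here is a toy version of the wide-to-long reshape that `make_table_with_birth_order` performs, written with plain pandas and made-up data (the real function additionally resolves indicator names via regexes, propagates metadata, and runs sanity checks):

import pandas as pd

# Wide input: a total column plus per-birth-order columns, as in tfrRR/tfrRRbo.
tb = pd.DataFrame(
    {
        "country": ["Spain", "Spain"],
        "year": [2000, 2001],
        "tfr": [1.23, 1.24],
        "tfr1": [0.60, 0.61],
        "tfr5p": [0.02, 0.02],
    }
)

# Melt to long format, then derive the birth order from the column name.
tb = tb.melt(["country", "year"], var_name="column_raw", value_name="value")
tb["birth_order"] = tb["column_raw"].str.replace("tfr", "", regex=False).replace("", "total")
tb["indicator_name"] = "tfr"

# Pivot back so `tfr` is a single column indexed by (country, year, birth_order).
tb = tb.pivot(index=["country", "year", "birth_order"], columns="indicator_name", values="value").reset_index()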
+ + +def keep_relevant_ages(tb): + AGES_RELEVANT = [ + "12-", + "20", + "30", + "40", + "50", + "55+", + ] + tb = tb.loc[tb["age"].isin(AGES_RELEVANT)] + return tb diff --git a/etl/steps/data/garden/hmd/2024-12-01/hmd.countries.json b/etl/steps/data/garden/hmd/2024-12-01/hmd.countries.json new file mode 100644 index 00000000000..c88a67232d4 --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-12-01/hmd.countries.json @@ -0,0 +1,48 @@ +{ + "Australia": "Australia", + "Austria": "Austria", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Bulgaria": "Bulgaria", + "Canada": "Canada", + "Chile": "Chile", + "Croatia": "Croatia", + "Czechia": "Czechia", + "Denmark": "Denmark", + "East Germany": "East Germany", + "Estonia": "Estonia", + "Finland": "Finland", + "Germany": "Germany", + "Greece": "Greece", + "Hong Kong": "Hong Kong", + "Hungary": "Hungary", + "Iceland": "Iceland", + "Ireland": "Ireland", + "Japan": "Japan", + "Latvia": "Latvia", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Norway": "Norway", + "Poland": "Poland", + "Portugal": "Portugal", + "Republic of Korea": "South Korea", + "Russia": "Russia", + "Slovenia": "Slovenia", + "Spain": "Spain", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Taiwan": "Taiwan", + "Ukraine": "Ukraine", + "United Kingdom": "United Kingdom", + "West Germany": "West Germany", + "England and Wales, Total Population": "England and Wales", + "France, Total Population": "France", + "Israel, Total Population": "Israel", + "Italy ": "Italy", + "Northern Ireland": "Northern Ireland", + "Scotland": "Scotland", + "Slovakia ": "Slovakia", + "The United States of America": "United States" +} diff --git a/etl/steps/data/garden/hmd/2024-12-01/hmd.excluded_countries.json b/etl/steps/data/garden/hmd/2024-12-01/hmd.excluded_countries.json new file mode 100644 index 00000000000..cf4a0297e10 --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-12-01/hmd.excluded_countries.json @@ -0,0 +1,6 @@ +[ + "England and Wales, Civilian National Population", + "France, Civilian Population", + "New Zealand -- Maori", + "New Zealand -- Non-Maori" +] diff --git a/etl/steps/data/garden/hmd/2024-12-01/hmd.meta.yml b/etl/steps/data/garden/hmd/2024-12-01/hmd.meta.yml new file mode 100644 index 00000000000..d846e9b485c --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-12-01/hmd.meta.yml @@ -0,0 +1,404 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + attribution_short: HMD + topic_tags: + - Life Expectancy + + others: + display_name_dim: |- + at << 'birth' if (age == '0') else age >><< ', ' + sex + 's' if (sex != 'total') >>, << type >> + title_public_dim: |- + <% if age != 'total' %>at << age if age != '0' else 'birth'>><% endif %> + global: + life_expectancy: + point_1: |- + <%- if type == "period" -%> + Period life expectancy is a metric that summarizes death rates across all age groups in one particular year. + <%- else -%> + Cohort life expectancy is the average lifespan of a group of people, usually a birth cohort – people born in the same year. + <%- endif -%> + point_2: |- + <%- if type == "period" -%> + <%- if age == '0' -%> + For a given year, it represents the average lifespan for a hypothetical group of people, if they experienced the same age-specific death rates throughout their whole lives as the age-specific death rates seen in that particular year. 
+ <%- else -%> + For a given year, it represents the remaining average lifespan for a hypothetical group of people, if they experienced the same age-specific death rates throughout the rest of their lives as the age-specific death rates seen in that particular year. + <%- endif -%> + <%- else -%> + <%- if age == '0' -%> + It is calculated by tracking individuals from that cohort throughout their lives until death, and calculating their average lifespan. + <%- else -%> + It is calculated by tracking individuals from that cohort throughout the rest of their lives until death, and calculating their average remaining lifespan. + <%- endif -%> + <%- endif -%> + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 365 + description: |- + The Human Mortality Database (HMD) is a collaborative project sponsored by the University of California, Berkeley (in the United States of America) and the Max Planck Institute for Demographic Research (in Germany). + + It provides researchers with comprehensive data on mortality from around 40 countries around the world, which have very high coverage and quality of data at the national level, through vital registration and potentially census data. + + Data is given in terms of period or cohort estimates: + + - **Period data** refers to a snapshot estimated with data at a particular interval. For period life expectancy at birth, this refers to the estimated life expectancy at birth based on a synthetic cohort created using mortality rates across age groups in a given year. + - **Cohort data** refers to estimates of a particular birth cohort. For cohort life expectancy at birth, this refers to the average number of years that people in the birth cohort survived. Cohort data may use birth cohorts that are ‘almost extinct’ rather than entirely extinct. + + 'Interval' refers to the specific age- and time- period of the estimate. An interval can be a one year period for a single-age group, or it can be wider. For example, the life expectancy of a 40 year old in 2019 corresponds to an interval of 1 single-age group in 1 year. The central death rate of 5–9 year olds in 2020 corresponds to an interval of a 5 year age group in 1 year. + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + life_tables: + common: + presentation: + title_variant: << sex + 's, ' if sex != 'total' >><< type + ' tables'>> + topic_tags: + - Life Expectancy + + variables: + central_death_rate: + title: Central death rate + description_short: |- + The death rate, calculated as the number of deaths divided by the average number of people alive during the interval. + description_key: + - "The death rate is measured using the number of person-years lived during the interval." + - "Person-years refers to the combined total time that a group of people has lived. For example, if 10 people each live for 2 years, they collectively contribute 20 person-years." + - "The death rate is slightly different from the 'probability of death' during the interval, because the 'probability of death' metric uses a different denominator: the number of people alive at that age at the start of the interval, while this indicator uses the average number of people alive during the interval." + unit: deaths per 1,000 people + processing_level: minor + description_processing: |- + The original metric is given as a fraction between 0 and 1 (i.e. 
per-capita). We multiply this by 1,000 to get a per-1,000 people rate. + display: + name: |- + {tables.life_tables.variables.central_death_rate.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.life_tables.variables.central_death_rate.title} {definitions.others.title_public_dim} + topic_tags: + - Life Expectancy + + probability_of_death: + title: Probability of death + unit: "%" + description_short: |- + The probability of dying in a given interval, among people who survived to the start of that interval. + description_key: + - "For example, the probability of death for a 50 year old in a given year is found by: dividing the number of deaths in 50 year olds that year, by the number of people alive at the age of 50 at the start of the year." + processing_level: minor + description_processing: |- + The original metric is given as a fraction between 0 and 1 (i.e. per-capita). We multiply this by 100 to get a percentage. + display: + name: |- + {tables.life_tables.variables.probability_of_death.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.life_tables.variables.probability_of_death.title} {definitions.others.title_public_dim} + topic_tags: + - Life Expectancy + + average_survival_length: + title: Average survival length + short_unit: years + unit: years + description_short: Average length of survival between ages x and x+n for persons dying in the interval. + display: + name: |- + {tables.life_tables.variables.average_survival_length.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.life_tables.variables.average_survival_length.title} {definitions.others.title_public_dim} + + number_survivors: + title: Number of survivors + unit: survivors + description_short: Number of survivors at a given age, assuming survivors at 0 years old is 100,000. + display: + name: |- + {tables.life_tables.variables.number_survivors.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.life_tables.variables.number_survivors.title} {definitions.others.title_public_dim} + + number_deaths: + title: Number of deaths + short_unit: deaths + unit: deaths + description_short: Number of deaths between ages x and x+n. + display: + name: |- + {tables.life_tables.variables.number_deaths.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.life_tables.variables.number_deaths.title} {definitions.others.title_public_dim} + topic_tags: + - Life Expectancy + + number_person_years_lived: + title: Number of person-years lived + unit: person-years + description_short: Number of person-years lived between ages x and x+n. + display: + name: |- + {tables.life_tables.variables.number_person_years_lived.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.life_tables.variables.number_person_years_lived.title} {definitions.others.title_public_dim} + + number_person_years_remaining: + title: Number of person-years remaining + unit: person-years + description_short: Number of person-years remaining after a given age. 
+ display: + name: |- + {tables.life_tables.variables.number_person_years_remaining.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.life_tables.variables.number_person_years_remaining.title} {definitions.others.title_public_dim} + + life_expectancy: + title: Life expectancy + short_unit: years + unit: years + description_short: |- + <%- if age == '0' -%> + <%- if sex == 'total' -%> + The << type >> life expectancy at birth, in a given year. + <%- else -%> + The << type >> life expectancy at birth among << sex + 's' >>, in a given year. + <%- endif -%> + <%- else -%> + <%- if sex == 'total' -%> + The remaining << type >> life expectancy at age << age >>, in a given year. + <%- else -%> + The remaining << type >> life expectancy at age << age >> among << sex + 's' >>, in a given year. + <%- endif -%> + <%- endif -%> + description_key: + - |- + {definitions.global.life_expectancy.point_1} + - |- + {definitions.global.life_expectancy.point_2} + - |- + <%- if age != '0' -%> + <%- if type == "period" -%> + This shows the remaining period life expectancy among people who have already reached the age << age >>, using death rates from their age group and older age groups. + <%- else -%> + This shows the remaining cohort life expectancy of people who have reached the age << age >>. + <%- endif -%> + <%- endif -%> + display: + numDecimalPlaces: 1 + name: |- + {tables.life_tables.variables.life_expectancy.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.life_tables.variables.life_expectancy.title} {definitions.others.title_public_dim} + + exposures: + common: + presentation: + title_variant: << sex + 's, ' if sex != 'total' >><< type + ' tables'>> + topic_tags: + - Life Expectancy + + variables: + exposure: + title: Exposure-to-risk + unit: person-years + description_short: The total number of person-years lived within a given interval. + description_key: + - It is equivalent to the average number of people living in that age group during the period. + description_from_producer: |- + Estimates of the population exposed to the risk of death during some age-time interval are based on annual (January 1st) population estimates, with small corrections that reflect the timing of deaths during the interval. Period exposure estimations are based on assumptions of uniformity in the distribution of events except when historical monthly birth data are available. + display: + name: |- + {tables.exposures.variables.exposure.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.exposures.variables.exposure.title} {definitions.others.title_public_dim} + + deaths: + common: + presentation: + topic_tags: + - Global Health + title_variant: << sex + 's, ' if sex != 'total' >> + + variables: + deaths: + title: Number of deaths + unit: deaths + description_short: |- + <% if sex == 'total' %> + The total number of deaths at age << age >> in a given year. + <%- else %> + The total number of << sex >> deaths at age << age >> in a given year. 
+ <%- endif %> + display: + name: |- + {tables.deaths.variables.deaths.title} at << 'birth' if (age == '0') else age >><< ', ' + sex + 's' if (sex != 'total') >> + presentation: + title_public: |- + {tables.deaths.variables.deaths.title} {definitions.others.title_public_dim} + + population: + common: + presentation: + topic_tags: + - Population Growth + + variables: + population: + title: Population + unit: people + description_short: |- + <% if age == 'total' %> + <%- if sex == 'total' %> + The total number of people living in a country. + <%- else %> + The total number of << sex + 's' >> living in a country. + <%- endif %> + <%- else %> + <% if sex == 'total' %> + The total number of people aged << age >> living in a country. + <%- else %> + The total number of << sex + 's' >> aged << age >> living in a country. + <%- endif %> + <%- endif %> + description_processing: |- + From HMD Notes: For populations with territorial changes, two sets of population estimates are given for years in which a territorial change occurred. The first set of estimates (identified as year "19xx-") refers to the population just before the territorial change, whereas the second set (identified as year "19xx+") refers to the population just after the change. For example, in France, the data for "1914-" cover the previous territory (i.e., as of December 31, 1913), whereas the data for "1914+" reflect the territorial boundaries as of January 1, 1914. + + We have used the "19xx+" population estimates for the year of the territorial change. + display: + name: |- + {tables.population.variables.population.title}<< ' aged ' + age if (age != 'total') >><< ', ' + sex + 's' if (sex != 'total') >> + presentation: + title_public: |- + {tables.population.variables.population.title} {definitions.others.title_public_dim} + title_variant: << sex + 's, ' if sex != 'total' >> + + births: + common: + presentation: + topic_tags: + - Fertility Rate + title_variant: << sex + 's, ' if sex != 'total' >> + + variables: + births: + title: Births + unit: births + description_short: |- + <% if sex == 'total' %> + The total number of births in a given year. + <%- else %> + The total number of << sex >> births in a given year. + <%- endif %> + display: + name: |- + Births, sex: << sex >> + presentation: + title_public: |- + {tables.births.variables.births.title}, + <%- if sex == 'total' %> + total + <%- else %> + << sex >>s + <%- endif %> + birth_rate: + title: Birth rate + unit: births per 1,000 people + description_short: |- + <% if sex == 'total' %> + The total number of births per 1,000 people in a given year. + <%- else %> + The total number of << sex >> births per 1,000 people in a given year. + <%- endif %> + display: + name: |- + Birth rate, sex: << sex >> + presentation: + title_public: |- + {tables.births.variables.birth_rate.title}, + <%- if sex == 'total' %> + total + <%- else %> + << sex >>s + <%- endif %> + + diff_ratios: + common: + presentation: + topic_tags: + - Life Expectancy + + variables: + central_death_rate_mf_ratio: + title: Central death rate ratio (m/f) + unit: "" + description_short: |- + The ratio of the << type >> central death rate (males to females) at age << age >>. 
+ processing_level: major + display: + name: |- + Central death rate (male-to-female ratio) at << 'birth' if (age == '0') else age >>, << type >> + presentation: + title_public: Central death rate {definitions.others.title_public_dim} + title_variant: |- + male-to-female ratio, << type >> tables + topic_tags: + - Life Expectancy + - Gender Ratio + + life_expectancy_fm_diff: + title: Life expectancy difference (f-m) + short_unit: years + unit: years + description_short: |- + The difference in the << type >> life expectancy (females - males) at age << age >>. + processing_level: major + description_key: + - Higher values indicate longer life expectancy among females than males. + - |- + {definitions.global.life_expectancy.point_1} + - |- + {definitions.global.life_expectancy.point_2} + display: + numDecimalPlaces: 1 + name: |- + Life expectancy (female-male difference) at << 'birth' if (age == '0') else age >>, << type >> + presentation: + title_public: Life expectancy at << age if age != '0' else 'birth'>> + title_variant: female-male difference, << type >> tables + topic_tags: + - Life Expectancy + - Gender Ratio + + life_expectancy_fm_ratio: + title: Life expectancy ratio (f/m) + unit: "" + short_unit: "" + description_short: |- + The ratio of the << type >> life expectancy (females to males) at age << age >>. + processing_level: major + description_key: + - Higher values indicate longer life expectancy among females than males. + - |- + {definitions.global.life_expectancy.point_1} + - |- + {definitions.global.life_expectancy.point_2} + display: + numDecimalPlaces: 1 + name: |- + Life expectancy (female-to-male ratio) at << 'birth' if (age == '0') else age >>, << type >> + presentation: + title_public: Life expectancy at << age if age != '0' else 'birth'>> + title_variant: female-to-male ratio, << type >> tables + topic_tags: + - Life Expectancy + - Gender Ratio diff --git a/etl/steps/data/garden/hmd/2024-12-01/hmd.py b/etl/steps/data/garden/hmd/2024-12-01/hmd.py new file mode 100644 index 00000000000..471502c0299 --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-12-01/hmd.py @@ -0,0 +1,240 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import numpy as np +from owid.catalog import Table +from owid.catalog import processing as pr + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("hmd") + + # Read table from meadow dataset. + paths.log.info("reading tables") + tb_lt = ds_meadow.read("life_tables") + tb_exp = ds_meadow.read("exposures") + tb_mort = ds_meadow.read("deaths") + tb_pop = ds_meadow.read("population") + tb_births = ds_meadow.read("births") + + # Drop NaNs + tb_exp = tb_exp.dropna(subset="exposure") + tb_births = tb_births.dropna(subset="births") + + # + # Process data. + # + paths.log.info("processing tables") + + # 1/ Life tables + def _sanity_check_lt(tb): + summary = tb.groupby(["country", "year", "sex", "type", "age"], as_index=False).size().sort_values("size") + row_dups = summary.loc[summary["size"] != 1] + assert row_dups.shape[0] <= 19, "Found duplicated rows in life tables!" + assert (row_dups["country"].unique() == "Switzerland").all() & ( + row_dups["year"] <= 1931 + ).all(), "Unexpected duplicates in life tables!" 
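+ # The duplicated rows tolerated above correspond to Swiss male cohort life tables at age 110+ for the years 1913-1931; the filter below drops those rows entirely.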
+ + flag = ( + (tb_lt["country"] == "Switzerland") + & (tb_lt["age"] == "110+") + & (tb_lt["type"] == "cohort") + & (tb_lt["sex"] == "male") + & (tb_lt["year"] <= 1931) + & (tb_lt["year"] >= 1913) + ) + tb = tb.loc[~flag] + + return tb + + tb_lt = process_table( + tb=tb_lt, + col_index=["country", "year", "sex", "age", "type"], + sex_expected={"females", "males", "total"}, + callback_post=_sanity_check_lt, + ) + # Scale central death rates + tb_lt["central_death_rate"] = tb_lt["central_death_rate"] * 1_000 + tb_lt["probability_of_death"] = tb_lt["probability_of_death"] * 100 + + # 2/ Exposures + tb_exp = process_table( + tb=tb_exp, + col_index=["country", "year", "sex", "age", "type"], + ) + + # 3/ Mortality + tb_mort = process_table( + tb=tb_mort, + col_index=["country", "year", "sex", "age", "type"], + ) + assert set(tb_mort["type"].unique()) == {"period"}, "Unexpected values in column 'type' in mortality tables!" + tb_mort = tb_mort.drop(columns="type") + + # 4/ Population + tb_pop = process_table( + tb=tb_pop, + col_index=["country", "year", "sex", "age"], + ) + tb_pop = add_total_population(tb_pop) + + # 5/ Births + tb_births = process_table( + tb=tb_births, + col_index=["country", "year", "sex"], + ) + + def add_birth_rate(tb_pop, tb_births): + tb_pop_agg = tb_pop[tb_pop["age"] == "total"].drop(columns="age") + tb_births = tb_births.merge(tb_pop_agg, on=["country", "year", "sex"], how="left") + tb_births["birth_rate"] = tb_births["births"] / tb_births["population"] * 1_000 + tb_births["birth_rate"] = tb_births["birth_rate"].replace([np.inf, -np.inf], np.nan) + tb_births = tb_births.drop(columns=["population"]) + return tb_births + + tb_births = add_birth_rate(tb_pop, tb_births) + + # 6/ Create table with differences and ratios + tb_ratios = make_table_diffs_ratios(tb_lt) + + # Create list with tables + paths.log.info("saving tables") + tables = [ + tb_lt.format(["country", "year", "sex", "age", "type"]), + tb_exp.format(["country", "year", "sex", "age", "type"]), + tb_mort.format(["country", "year", "sex", "age"]), + tb_pop.format(["country", "year", "sex", "age"]), + tb_births.format(["country", "year", "sex"]), + tb_ratios.format(["country", "year", "age", "type"], short_name="diff_ratios"), + ] + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=tables, + check_variables_metadata=True, + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def process_table(tb, col_index, sex_expected=None, callback_post=None): + """Reshape a table. + + Input table has column `format`, which is sort-of redundant. This function ensures we can safely drop it (i.e. no duplicate rows). + + Additionally, it standardizes the dimension values. + """ + paths.log.info(f"processing table {tb.m.short_name}") + + if sex_expected is None: + sex_expected = {"female", "male", "total"} + + # Standardize dimension values + tb = standardize_sex_cat_names(tb, sex_expected) + + # Drop duplicate rows + tb = tb.sort_values("format").drop_duplicates(subset=[col for col in tb.columns if col != "format"], keep="first") + + # Check no duplicates + if callback_post is not None: + tb = callback_post(tb) + else: + summary = tb.groupby(col_index, as_index=False).size().sort_values("size") + row_dups = summary.loc[summary["size"] != 1] + assert row_dups.empty, "Found duplicated rows in life tables!" 
+ + # Final dropping of columns + tb = tb.drop(columns="format") + + # Country name standardization + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + excluded_countries_file=paths.excluded_countries_path, + ) + + # Make year column integer + tb["year"] = tb["year"].astype(int) + + return tb + + +def standardize_sex_cat_names(tb, sex_expected): + # Define expected sex categories + sex_expected = {s.lower() for s in sex_expected} + + # Set sex categories to lowercase + tb["sex"] = tb["sex"].str.lower() + + # Sanity check categories + sex_found = set(tb["sex"].unique()) + assert sex_found == sex_expected, f"Unexpected sex categories! Found {sex_found} but expected {sex_expected}" + + # Rename + tb["sex"] = tb["sex"].replace({"females": "female", "males": "male"}) + + return tb + + +def add_total_population(tb_pop): + flag = tb_pop["age"].str.match(r"^(\d{1,3}|\d{3}\+)$") + tb_pop_total = tb_pop[flag] + tb_pop_total = tb_pop_total.groupby(["country", "year", "sex"], as_index=False)["population"].sum() + tb_pop_total["age"] = "total" + tb_pop = pr.concat([tb_pop, tb_pop_total], ignore_index=True) + return tb_pop + + +def make_table_diffs_ratios(tb: Table) -> Table: + """Create table with metric differences and ratios. + + Currently, we estimate: + + - female - male: life expectancy + - female/male: life expectancy + - male/female: central death rate + """ + # Pivot & obtain differences and ratios + cols_index = ["country", "year", "age", "type"] + tb_new = ( + tb.pivot_table( + index=cols_index, + columns="sex", + values=["life_expectancy", "central_death_rate"], + ) + .assign( + life_expectancy_fm_diff=lambda df: df[("life_expectancy", "female")] - df[("life_expectancy", "male")], + life_expectancy_fm_ratio=lambda df: df[("life_expectancy", "female")] / df[("life_expectancy", "male")], + central_death_rate_mf_ratio=lambda df: df[("central_death_rate", "male")] + / df[("central_death_rate", "female")], + ) + .reset_index() + ) + + # Keep relevant columns + cols = [col for col in tb_new.columns if col[1] == ""] + tb_new = tb_new.loc[:, cols] + + # Rename columns + tb_new.columns = [col[0] for col in tb_new.columns] + + # Add metadata back + for col in tb_new.columns: + if col not in cols_index: + tb_new[col].metadata.origins = tb["life_expectancy"].m.origins.copy() + tb_new[col] = tb_new[col].replace([np.inf, -np.inf], np.nan) + + return tb_new diff --git a/etl/steps/data/garden/hmd/2024-12-03/hmd_country.countries.json b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.countries.json new file mode 100644 index 00000000000..26b62193714 --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.countries.json @@ -0,0 +1,47 @@ +{ + "AUS": "Australia", + "AUT": "Austria", + "BEL": "Belgium", + "BGR": "Bulgaria", + "BLR": "Belarus", + "CAN": "Canada", + "CHE": "Switzerland", + "CHL": "Chile", + "CZE": "Czechia", + "DNK": "Denmark", + "ESP": "Spain", + "EST": "Estonia", + "FIN": "Finland", + "GRC": "Greece", + "HKG": "Hong Kong", + "HRV": "Croatia", + "HUN": "Hungary", + "IRL": "Ireland", + "ISL": "Iceland", + "ISR": "Israel", + "ITA": "Italy", + "JPN": "Japan", + "KOR": "South Korea", + "LTU": "Lithuania", + "LUX": "Luxembourg", + "LVA": "Latvia", + "NLD": "Netherlands", + "NOR": "Norway", + "POL": "Poland", + "PRT": "Portugal", + "RUS": "Russia", + "SVK": "Slovakia", + "SVN": "Slovenia", + "SWE": "Sweden", + "UKR": "Ukraine", + "USA": "United States", + "DEUTE": "East Germany", + "DEUTNP": "Germany", + "DEUTW": "West Germany", + "FRATNP": "France", + "GBRTENW":
"England and Wales", + "GBR_NIR": "Northern Ireland", + "GBR_NP": "United Kingdom", + "GBR_SCO": "Scotland", + "NZL_NP": "New Zealand" +} diff --git a/etl/steps/data/garden/hmd/2024-12-03/hmd_country.excluded_countries.json b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.excluded_countries.json new file mode 100644 index 00000000000..3a7f14126b5 --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.excluded_countries.json @@ -0,0 +1,4 @@ +[ + "FRACNP", + "GBRCENW" +] diff --git a/etl/steps/data/garden/hmd/2024-12-03/hmd_country.meta.yml b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.meta.yml new file mode 100644 index 00000000000..96e7e2a8e04 --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.meta.yml @@ -0,0 +1,111 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Fertility Rate + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + title: Birth rate by month (HMD) + update_period_days: 365 + +tables: + birth_rate: + variables: + birth_rate: + title: Birth rate, on a monthly basis + unit: births per 1,000 people + description_short: |- + The total number of births per 1,000 people in a given month. + display: + name: |- + Birth rate + + birth_rate_per_day: + title: Birth rate per day, on a monthly basis + unit: births per million people + description_short: |- + The average daily number of births, per million people, calculated monthly. + display: + name: |- + Birth rate, per day + + birth_rate_lead_9months: + title: Birth rate, on a monthly basis, by estimated month of conception + unit: births per 1,000 people + description_short: |- + The total number of births per 1,000 people in a given month. + display: + name: |- + Birth rate + + birth_rate_per_day_lead_9months: + title: Birth rate per day, on a monthly basis, by estimated month of conception + unit: births per 1,000 people + description_short: |- + The average daily number of births, per million people, calculated monthly. + display: + name: |- + Birth rate, per day + + birth_rate_month: + variables: + birth_rate: + title: Birth rate, in << month >> + unit: births per 1,000 people + description_short: |- + The total number of births per 1,000 people in <>. + display: + name: |- + Birth rate + + birth_rate_per_day: + title: Birth rate per day, in << month >> + unit: births per million people + description_short: |- + The average daily number of births, per million people, calculated for <>. + display: + name: |- + Birth rate, per day + + birth_rate_month_max: + variables: + month_max: + title: Month ordinal with peak daily birth rate + unit: "" + description_short: |- + Number corresponding to the month with the highest daily birth rate. + month_max_name: + title: Month name with peak daily birth rate + unit: "" + description_short: |- + Month with the highest daily birth rate. + birth_rate_per_day_max: + title: Peak birth rate per day, on a monthly basis + unit: births per million people + description_short: |- + The highest average daily number of births, per million people, recorded in the given year. + display: + name: |- + Maximum birth rate, per day + + month_max_lead_9months: + title: Month ordinal with peak daily birth rate in 9 months + unit: "" + description_short: |- + Number corresponding to the month with the highest daily birth rate. 
+ month_max_name_lead_9months: + title: Month name with peak daily birth rate in 9 months + unit: "" + description_short: |- + Month with the highest daily birth rate. + birth_rate_per_day_max_lead_9months: + title: Peak birth rate per day, on a monthly basis, in 9 months + unit: births per million people + description_short: |- + The highest average daily number of births, per million people, recorded in the given year. + display: + name: |- + Maximum birth rate, per day diff --git a/etl/steps/data/garden/hmd/2024-12-03/hmd_country.py b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.py new file mode 100644 index 00000000000..87a46550a1b --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.py @@ -0,0 +1,187 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import calendar + +import numpy as np +import pandas as pd + +from etl.data_helpers import geo +from etl.data_helpers.misc import interpolate_table +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("hmd_country") + ds_hmd = paths.load_dataset("hmd") + + # Read table from meadow dataset. + tb = ds_meadow.read("monthly") + tb_pop = ds_hmd.read("population") + + # + # Process data. + # + tb = make_main_table(tb, tb_pop) + tb_month_long, tb_month_dimensions, tb_month_max = make_tables(tb) + + tables = [ + tb_month_long.format(["country", "date"], short_name="birth_rate"), + tb_month_dimensions.format(["country", "year", "month"], short_name="birth_rate_month"), + tb_month_max.format(["country", "year"], short_name="birth_rate_month_max"), + ] + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=tables, + check_variables_metadata=True, + default_metadata=ds_meadow.metadata, + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() + + +def make_main_table(tb, tb_pop): + ## Discard unknown/total values + tb = clean_table(tb) + + # Add population to monthly birth data table + tb = add_population_column(tb, tb_pop) + + # Estimate metrics + tb = estimate_metrics(tb) + + # Fix date (use last day instead of first) + tb["date"] = tb["date"] + pd.to_timedelta(tb["days_in_month"] - 1, unit="D") + + # Sort rows + tb = tb.sort_values(["country", "date"]) + + return tb + + +def make_tables(tb): + # Classic time-series, with date-values + tb_long = tb[["country", "date", "birth_rate", "birth_rate_per_day"]] + ## Add 9-month lead + tb_long["date_lead"] = (tb_long[["date"]] - pd.DateOffset(months=9) + pd.offsets.MonthEnd(0)).squeeze() + cols = [col for col in tb_long.columns if col != "date_lead"] + cols_lead = [col for col in tb_long.columns if col != "date"] + tb_long = tb_long[cols].merge( + tb_long[cols_lead], + left_on=["country", "date"], + right_on=["country", "date_lead"], + suffixes=("", "_lead_9months"), + how="outer", + ) + tb_long["date"] = tb_long["date"].fillna(tb_long["date_lead"]) + tb_long = tb_long.drop(columns="date_lead") + + # Month as a dimension + tb_dimensions = tb[["country", "year", "month", "birth_rate", "birth_rate_per_day"]] + tb_dimensions["month"] = tb_dimensions["month"].apply(lambda x: calendar.month_name[x]) + + # For each year, ID of the month with highest birth rate per day + def find_peak_month(tb): + tb = tb.loc[ + tb.groupby(["country", "year"])["birth_rate_per_day"].idxmax(), + ["country", "year", "month", "birth_rate_per_day"], + ].rename(columns={"month": "month_max", "birth_rate_per_day": "birth_rate_per_day_max"}) + tb["month_max_name"] = tb["month_max"].apply(lambda x: calendar.month_name[x]) + + return tb + + tb_month_max = find_peak_month(tb) + # Get, for a given year, the month that led to the highest monthly birth rate 9 months later + tb_pre9m = tb.copy() + tb_pre9m["date"] = (tb_pre9m[["date"]] - pd.DateOffset(months=9) + pd.offsets.MonthEnd(0)).squeeze() + tb_pre9m["year"] = tb_pre9m["date"].dt.year + tb_pre9m["year"] = tb_pre9m["year"].copy_metadata(tb["year"]) + tb_pre9m["month"] = tb_pre9m["date"].dt.month + tb_pre9m["month"] = tb_pre9m["month"].copy_metadata(tb["month"]) + + tb_pre9m = find_peak_month(tb_pre9m) + # Merge + tb_month_max = tb_month_max.merge(tb_pre9m, on=["country", "year"], how="outer", suffixes=("", "_lead_9months")) + return tb_long, tb_dimensions, tb_month_max + + +def clean_table(tb): + """Filter rows, harmonize country names, add date column.""" + # Filter unwanted month categories, set dtype + tb = tb.loc[~tb["month"].isin(["TOT", "UNK"])] + tb["month"] = tb["month"].astype("Int64") + ## Create date column.
TODO: check what day of the month to assign + tb["date"] = pd.to_datetime(tb[["year", "month"]].assign(day=1)) + # Harmonize country names + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + excluded_countries_file=paths.excluded_countries_path, + warn_on_unknown_excluded_countries=False, + ) + + return tb + + +def add_population_column(tb, tb_pop): + """Add population column to main table for each date.""" + # Prepare population table + tb_pop = _prepare_population_table(tb_pop) + # Merge population table with main table + tb = tb.merge(tb_pop, on=["country", "date"], how="outer") + tb = tb.sort_values(["country", "date"]) + # Interpolate to get monthly population estimates + tb_ = interpolate_table( + tb[["country", "date", "population"]], + entity_col="country", + time_col="date", + time_mode="none", + ) + tb = tb.drop(columns="population").merge(tb_, on=["country", "date"], how="left") + # Drop rows that only exist in the population table (no year/births data) + tb = tb.dropna(subset=["year"]) + return tb + + +def _prepare_population_table(tb): + """Prepare population table to merge with main table. + + Original table is given in years, but we need it at daily resolution! We use linear interpolation for that. + """ + tb_aux = tb.loc[(tb["age"] == "total") & (tb["sex"] == "total"), ["country", "year", "population"]] + ## Assign a day to population. TODO: Check if this is true + tb_aux["date"] = pd.to_datetime(tb_aux["year"].astype(str) + "-01-01") + tb_aux = tb_aux.drop(columns="year") + + return tb_aux + + +def estimate_metrics(tb): + """Estimate metrics: birth rate and birth rate per day.""" + # Get days in month + tb["days_in_month"] = tb.apply(lambda row: calendar.monthrange(row["year"], row["month"])[1], axis=1) + # Estimate rates (birth_rate per 1,000 people; birth_rate_per_day per million people) + tb["birth_rate"] = tb["births"] / tb["population"] * 1_000 + tb["birth_rate_per_day"] = tb["birth_rate"] / tb["days_in_month"] * 1_000 + # Sanity check: rates should contain no NaNs at this point + assert tb[["birth_rate", "birth_rate_per_day"]].notna().all().all() + # Replace INF values with NAs + tb[["birth_rate", "birth_rate_per_day"]] = tb[["birth_rate", "birth_rate_per_day"]].replace( + [np.inf, -np.inf], pd.NA + ) + # Drop NAs + tb = tb.dropna(subset=["birth_rate", "birth_rate_per_day"]) + + return tb diff --git a/etl/steps/data/garden/homicide/2024-10-30/unodc.countries.json b/etl/steps/data/garden/homicide/2024-10-30/unodc.countries.json new file mode 100644 index 00000000000..7eeb99a2648 --- /dev/null +++ b/etl/steps/data/garden/homicide/2024-10-30/unodc.countries.json @@ -0,0 +1,215 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "American Samoa": "American Samoa", + "Andorra": "Andorra", + "Angola": "Angola", + "Anguilla": "Anguilla", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Bermuda": "Bermuda", + "Bhutan": "Bhutan", + "Bolivia (Plurinational State of)": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "British Virgin Islands": "British Virgin Islands", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cayman Islands": "Cayman Islands", + "Chile":
"Chile", + "China": "China", + "Colombia": "Colombia", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cura\u00e7ao": "Curacao", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "Denmark": "Denmark", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "French Guiana": "French Guiana", + "French Polynesia": "French Polynesia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Gibraltar": "Gibraltar", + "Greece": "Greece", + "Greenland": "Greenland", + "Grenada": "Grenada", + "Guadeloupe": "Guadeloupe", + "Guam": "Guam", + "Guatemala": "Guatemala", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Holy See": "Vatican", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Isle of Man": "Isle of Man", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Martinique": "Martinique", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mayotte": "Mayotte", + "Melanesia": "Melanesia", + "Mexico": "Mexico", + "Micronesia": "Micronesia", + "Micronesia (Federated States of)": "Micronesia (country)", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Montserrat": "Montserrat", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nepal": "Nepal", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Polynesia": "Polynesia", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "R\u00e9union": "Reunion", + "Saint Helena": "Saint Helena", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Martin (French Part)": "Saint Martin (French part)", + "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": 
"Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "State of Palestine": "Palestine", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkmenistan": "Turkmenistan", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Republic of Tanzania": "Tanzania", + "United States Virgin Islands": "United States Virgin Islands", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "Viet Nam": "Vietnam", + "World": "World", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "All Americas": "Americas (UN)", + "All Asia": "Asia (UN)", + "All Europe": "Europe (UN)", + "All Oceania": "Oceania (UN)", + "China, Hong Kong Special Administrative Region": "Hong Kong", + "China, Macao Special Administrative Region": "Macao", + "Kosovo under UNSCR 1244": "Kosovo", + "Latin America and the Caribbean": "Latin America and the Caribbean (UN)", + "Netherlands (Kingdom of the)": "Netherlands", + "T\u00fcrkiye": "Turkey", + "United Kingdom (England and Wales)": "England and Wales", + "United Kingdom (Northern Ireland)": "Northern Ireland", + "United Kingdom (Scotland)": "Scotland", + "All Africa": "Africa (UN)" +} diff --git a/etl/steps/data/garden/homicide/2024-10-30/unodc.excluded_countries.json b/etl/steps/data/garden/homicide/2024-10-30/unodc.excluded_countries.json new file mode 100644 index 00000000000..78ed6516098 --- /dev/null +++ b/etl/steps/data/garden/homicide/2024-10-30/unodc.excluded_countries.json @@ -0,0 +1,17 @@ +[ + "Australia and New Zealand", + "Iraq (Kurdistan Region)", + "Iraq (Central Iraq)", + "Central Asia", + "Eastern Asia", + "Eastern Europe", + "Northern Africa", + "Northern America", + "Northern Europe", + "South-eastern Asia", + "Southern Asia", + "Southern Europe", + "Sub-Saharan Africa", + "Western Asia", + "Western Europe" +] diff --git a/etl/steps/data/garden/homicide/2024-10-30/unodc.meta.yml b/etl/steps/data/garden/homicide/2024-10-30/unodc.meta.yml new file mode 100644 index 00000000000..67b9918497e --- /dev/null +++ b/etl/steps/data/garden/homicide/2024-10-30/unodc.meta.yml @@ -0,0 +1,113 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Homicides + description_processing: |- + Values for the United Kingdom are calcuated by Our World in Data from UNODC data for England, Wales, Scotland and Northern Ireland. 
+ + metric: |- + <% if unit_of_measurement == "Counts" %> + Number of homicides + <%- elif unit_of_measurement == "Rate per 100,000 population" %> + Homicide rate per 100,000 population + <%- endif %> + sex: |- + <% if sex == "Total" %> + all victims + <%- elif sex == "Male" -%> + male victims + <%- elif sex == "Female" -%> + female victims + <% endif %> + age: |- + <% if age == "Total" %> + in all age-groups + <%- elif age == "30-44" %> + aged 30-44 years + <%- elif age == "45-59" %> + aged 45-59 years + <%- elif age == "60 and older" %> + aged over 60 years + <%- elif age == "0-9" %> + aged 0-9 years + <%- elif age == "10 -14" %> + aged 10-14 years + <%- elif age == "15 -17" %> + aged 15-17 years + <%- elif age == "18-19" %> + aged 18-19 years + <%- elif age == "20-24" %> + aged 20-24 years + <%- elif age == "25-29" %> + aged 25-29 years + <%- elif age == "Unknown" %> + of unknown age + <%- endif %> + unit: |- + <% if unit_of_measurement == "Counts" %> + homicides + <%- elif unit_of_measurement == "Rate per 100,000 population" %> + homicides per 100,000 population + <%- endif %> +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + by_mechanisms: + variables: + value: + title: |- + {definitions.metric} - << category >> + description_short: |- + {definitions.metric} of {definitions.sex} {definitions.age} where the weapon was << category.lower() >> + unit: |- + {definitions.unit} + presentation: + title_public: |- + {definitions.metric} of {definitions.sex} {definitions.age} where the weapon was << category.lower() >> + display: + numDecimalPlaces: <%- if unit_of_measurement == "Counts" -%> 0<%- elif unit_of_measurement == "Rate per 100,000 population" -%> 1<%- endif -%> + by_relationship_to_perpetrator: + variables: + value: + title: |- + {definitions.metric} - << category >> - sex: << sex >> + description_short: |- + {definitions.metric} of {definitions.sex} {definitions.age} where the << category.lower() >> + unit: |- + {definitions.unit} + presentation: + title_public: |- + {definitions.metric} of {definitions.sex} {definitions.age} where the << category.lower() >> + display: + numDecimalPlaces: <%- if unit_of_measurement == "Counts" -%> 0<%- elif unit_of_measurement == "Rate per 100,000 population" -%> 1<%- endif -%> + by_situational_context: + variables: + value: + title: |- + {definitions.metric} - << category >> - sex: << sex >> + description_short: |- + {definitions.metric} of {definitions.sex} {definitions.age} where the situation was << category.lower() >> + unit: |- + {definitions.unit} + presentation: + title_public: |- + {definitions.metric} of {definitions.sex} {definitions.age} where the situation was << category.lower() >> + display: + numDecimalPlaces: <%- if unit_of_measurement == "Counts" -%> 0<%- elif unit_of_measurement == "Rate per 100,000 population" -%> 1<%- endif -%> + total: + variables: + value: + title: |- + {definitions.metric} - sex: << sex >> - age: << age >> + description_short: |- + {definitions.metric} of {definitions.sex} {definitions.age} + unit: |- + {definitions.unit} + display: + numDecimalPlaces: <%- if unit_of_measurement == "Counts" -%> 0<%- elif unit_of_measurement == "Rate per 100,000 population" -%> 1<%- endif -%> diff --git a/etl/steps/data/garden/homicide/2024-10-30/unodc.py b/etl/steps/data/garden/homicide/2024-10-30/unodc.py new file mode 100644 index 00000000000..8e479671f82 --- /dev/null +++ b/etl/steps/data/garden/homicide/2024-10-30/unodc.py @@
-0,0 +1,160 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from owid.catalog import Dataset, Table +from owid.catalog import processing as pr +from owid.catalog.utils import underscore + +from etl.data_helpers import geo +from etl.data_helpers.geo import add_population_to_table +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("unodc") + # Load population dataset. + ds_population = paths.load_dataset("population") + # Read table from meadow dataset. + tb = ds_meadow["unodc"].reset_index() + + # + # Process data. + # + tb = geo.harmonize_countries( + df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path + ) + + tb = clean_up_categories(tb) + tb = calculate_united_kingdom(tb, ds_population) + tables = clean_data(tb) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def clean_data(tb: Table) -> list[Table]: + """ + Split the data into four tables based on the dimension column: + * Total + * by mechanism + * by relationship to perpetrator + * by situational context + """ + tb_mech = create_table(tb, table_name="by mechanisms") + tb_perp = create_table(tb, table_name="by relationship to perpetrator") + tb_situ = create_table(tb, table_name="by situational context") + tb_tot = create_total_table(tb) + + tb_garden_list = [tb_mech, tb_tot, tb_perp, tb_situ] + + return tb_garden_list + + +def create_table(tb: Table, table_name: str) -> Table: + """ + Create a table for the given dimension (e.g. "by mechanisms"), with homicides/homicide rates + disaggregated by category, sex and age. + """ + assert any(tb["dimension"] == table_name), "table_name must be a dimension in the table" + tb_filter = tb[tb["dimension"] == table_name] + tb_filter = tb_filter.drop(columns=["indicator", "source", "dimension"]) + + tb_filter = tb_filter.format( + ["country", "year", "category", "sex", "age", "unit_of_measurement"], + short_name=underscore(table_name), + ) + + return tb_filter + + +def create_total_table(tb: Table) -> Table: + """ + Create the total homicides table, with total homicides/homicide rates + disaggregated by age and sex + """ + tb_tot = tb[tb["dimension"] == "Total"] + + # There are some duplicates when sex is unknown so let's remove those rows + tb_tot = tb_tot[tb_tot["sex"] != "Unknown"] + + tb_tot = tb_tot.drop(columns=["indicator", "source", "dimension"]) + + tb_tot = tb_tot.format( + ["country", "year", "category", "sex", "age", "unit_of_measurement"], + short_name="total", + ) + + return tb_tot + + +def clean_up_categories(tb: Table) -> Table: + """ + Make the categories used in the dataset a bit more readable.
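+ + For example, "Another weapon - sharp object" becomes "a sharp object", so that + generated descriptions read naturally: "... where the weapon was a sharp object".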
+ + """ + category_dict = { + "Another weapon - sharp object": "a sharp object", + "Unspecified means": "unspecified means", + "Without a weapon/ other Mechanism": " without a weapon or by another mechanism", + "Firearms or explosives": "firearms or explosives", + "Another weapon": "sharp or blunt object, including motor vehicles", + "Intimate partner or family member": "Perpetrator is an intimate partner or family member of the victim", + "Intimate partner or family member: Intimate partner": "Perpetrator is an intimate partner", + "Intimate partner or family member: Family member": "Perpetrator is a family member", + "Other Perpetrator known to the victim": "Another known perpetrator", + "Perpetrator unknown to the victim": "Perpetrator is unknown to victim", + "Perpetrator to victim relationship unknown": "the relationship to the victim is not known", + "Socio-political homicide - terrorist offences": "Terrorist offences", + "Unknown types of homicide": "Unknown situational context", + } + + for key in category_dict.keys(): + assert key in tb["category"].values, f"{key} not in table" + tb["category"] = tb["category"].cat.rename_categories(category_dict) + + assert tb["category"].isna().sum() == 0 + return tb + + +def calculate_united_kingdom(tb: Table, ds_population: Dataset) -> Table: + """ + Calculate data for the UK as it is reported by the constituent countries + """ + + countries = ["England and Wales", "Scotland", "Northern Ireland"] + tb_uk = tb[(tb["country"].isin(countries)) & (tb["unit_of_measurement"] == "Counts")] + + tb_uk = ( + tb_uk.groupby(["year", "indicator", "dimension", "category", "sex", "age", "unit_of_measurement"]) + .agg(value=("value", "sum"), count=("value", "size")) + .reset_index() + ) + # Use only rows where all three entites are in the data + tb_uk = tb_uk[tb_uk["count"] == 3] + tb_uk["country"] = "United Kingdom" + tb_uk = tb_uk.drop(columns="count") + + # Add in UK population to calculate rates + tb_uk_rate = tb_uk.copy() + tb_uk_rate = add_population_to_table(tb_uk_rate, ds_population) + tb_uk_rate["value"] = tb_uk_rate["value"] / tb_uk_rate["population"] * 100000 + tb_uk_rate["unit_of_measurement"] = "Rate per 100,000 population" + tb_uk_rate = tb_uk_rate.drop(columns=["population"]) + + tb = pr.concat([tb, tb_uk, tb_uk_rate]) + return tb diff --git a/etl/steps/data/garden/iea/2024-11-20/fossil_fuel_subsidies.countries.json b/etl/steps/data/garden/iea/2024-11-20/fossil_fuel_subsidies.countries.json new file mode 100644 index 00000000000..37c3e86047f --- /dev/null +++ b/etl/steps/data/garden/iea/2024-11-20/fossil_fuel_subsidies.countries.json @@ -0,0 +1,51 @@ +{ + "Algeria": "Algeria", + "Angola": "Angola", + "Argentina": "Argentina", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Bolivia": "Bolivia", + "Brunei": "Brunei", + "China": "China", + "Colombia": "Colombia", + "Croatia": "Croatia", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "France": "France", + "Gabon": "Gabon", + "Ghana": "Ghana", + "Hungary": "Hungary", + "India": "India", + "Indonesia": "Indonesia", + "Iran": "Iran", + "Iraq": "Iraq", + "Kazakhstan": "Kazakhstan", + "Kuwait": "Kuwait", + "Libya": "Libya", + "Malaysia": "Malaysia", + "Mexico": "Mexico", + "Nigeria": "Nigeria", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Poland": "Poland", + "Qatar": "Qatar", + "Russia": "Russia", + "Slovak Republic": "Slovakia", + "Thailand": "Thailand", + "Turkmenistan": "Turkmenistan", + "UAE": "United Arab Emirates", + "Ukraine": 
"Ukraine", + "United Kingdom": "United Kingdom", + "Uzbekistan": "Uzbekistan", + "Venezuela": "Venezuela", + "Vietnam": "Vietnam", + "World": "World", + "ElSalvador": "El Salvador", + "SaudiArabia": "Saudi Arabia", + "SouthAfrica": "South Africa", + "SriLanka": "Sri Lanka", + "Taipei": "Taiwan", + "TrinidadandTobago": "Trinidad and Tobago" +} \ No newline at end of file diff --git a/etl/steps/data/garden/iea/2024-11-20/fossil_fuel_subsidies.meta.yml b/etl/steps/data/garden/iea/2024-11-20/fossil_fuel_subsidies.meta.yml new file mode 100644 index 00000000000..772f0f01622 --- /dev/null +++ b/etl/steps/data/garden/iea/2024-11-20/fossil_fuel_subsidies.meta.yml @@ -0,0 +1,45 @@ +definitions: + common: + presentation: + topic_tags: + - Energy + - Fossil Fuels + grapher_config: + note: This data is expressed in constant {dollar_year} US$. + processing_level: minor + unit: constant {dollar_year} US$ + short_unit: "$" + description_short: |- + This data is expressed in US dollars. It is adjusted for inflation but does not account for differences in living costs between countries. + +dataset: + update_period_days: 365 + +tables: + fossil_fuel_subsidies: + variables: + coal_subsidy: + title: Subsidies to coal + electricity_subsidy: + title: Subsidies to electricity + gas_subsidy: + title: Subsidies to gas + oil_subsidy: + title: Subsidies to oil + total_subsidy: + title: Total subsidies + transport_oil_subsidy: + title: Subsidies to oil in transport + subsidization_rate: + title: Subsidization rate + unit: "%" + short_unit: "%" + subsidy_per_capita: + title: Subsidy per capita + unit: constant {dollar_year} US$ per person + short_unit: "$/person" + subsidy_as_share_of_gdp: + title: Subsidy as share of GDP + unit: "%" + short_unit: "%" + diff --git a/etl/steps/data/garden/iea/2024-11-20/fossil_fuel_subsidies.py b/etl/steps/data/garden/iea/2024-11-20/fossil_fuel_subsidies.py new file mode 100644 index 00000000000..986ad16ad85 --- /dev/null +++ b/etl/steps/data/garden/iea/2024-11-20/fossil_fuel_subsidies.py @@ -0,0 +1,58 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Assumed USD year. +DOLLAR_YEAR = 2023 + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("fossil_fuel_subsidies") + + # Read tables from meadow dataset. + tb = ds_meadow.read("fossil_fuel_subsidies") + tb_indicators = ds_meadow.read("fossil_fuel_subsidies_indicators") + tb_transport = ds_meadow.read("fossil_fuel_subsidies_transport_oil") + + # + # Process data. + # + # Convert units from millions of dollars to dollars. + tb["subsidy"] *= 1e6 + + # Transpose table. + tb = tb.pivot(index=["country", "year"], columns="product", values="subsidy", join_column_levels_with="_") + + # Rename conveniently. + tb = tb.rename( + columns={column: f"{column}_subsidy" for column in tb.drop(columns=["country", "year"]).columns}, errors="raise" + ) + + # Harmonize country names. + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + # Include additional indicators from the other tables. + tb = tb.merge(tb_indicators, on=["country", "year"], how="outer") + tb = tb.merge(tb_transport, on=["country", "year"], how="outer") + + # Improve format. + tb = tb.format() + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. 
+ ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, yaml_params={"dollar_year": DOLLAR_YEAR} + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_cause.meta.yml b/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_cause.meta.yml index c23476b6a72..bc201974600 100644 --- a/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_cause.meta.yml +++ b/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_cause.meta.yml @@ -120,6 +120,8 @@ definitions: <%- endif -%> footnote: |- <% if age == "Age-standardized" %>To allow for comparisons between countries and over time, this metric is [age-standardized](#dod:age_standardized).<%- endif -%> + annotation: |- + <% if cause == 'Maternal disorders' %>United States: Values from 2003–2017 affected by measurement change<%- endif -%> dataset: update_period_days: 1460 non_redistributable: true @@ -147,6 +149,8 @@ tables: display: numDecimalPlaces: |- <% if metric == 'Number' %>0<% else %>1<%- endif -%> + entityAnnotationsMap: |- + {definitions.annotation} gbd_cause_dalys: variables: value: diff --git a/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_risk.py b/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_risk.py index 12e02cf7ab2..c2aa92c0ddb 100644 --- a/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_risk.py +++ b/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_risk.py @@ -26,7 +26,7 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset("gbd_risk") # Read table from meadow dataset. - tb = ds_meadow.read_table("gbd_risk", reset_index=True) + tb = ds_meadow.read("gbd_risk", reset_index=True) ds_regions = paths.load_dataset("regions") # # Process data. diff --git a/etl/steps/data/garden/imf/2024-11-25/world_economic_outlook.countries.json b/etl/steps/data/garden/imf/2024-11-25/world_economic_outlook.countries.json new file mode 100644 index 00000000000..7b3c8a6d039 --- /dev/null +++ b/etl/steps/data/garden/imf/2024-11-25/world_economic_outlook.countries.json @@ -0,0 +1,198 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", 
+ "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hong Kong SAR": "Hong Kong", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Kyrgyz Republic": "Kyrgyzstan", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia": "Micronesia (country)", + "Moldova": "Moldova", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Republic of Congo": "Congo", + "Romania": "Romania", + "Russia": "Russia", + "Rwanda": "Rwanda", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovak Republic": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "St. Kitts and Nevis": "Saint Kitts and Nevis", + "St. Lucia": "Saint Lucia", + "St. 
Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syria": "Syria", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "The Bahamas": "Bahamas", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkmenistan": "Turkmenistan", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United States": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela": "Venezuela", + "Vietnam": "Vietnam", + "West Bank and Gaza": "Palestine", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Islamic Republic of Iran": "Iran", + "Korea": "South Korea", + "Lao P.D.R.": "Laos", + "Macao SAR": "Macao", + "S\u00e3o Tom\u00e9 and Pr\u00edncipe": "Sao Tome and Principe", + "Taiwan Province of China": "Taiwan", + "The Gambia": "Gambia", + "T\u00fcrkiye": "Turkey" +} \ No newline at end of file diff --git a/etl/steps/data/garden/imf/2024-11-25/world_economic_outlook.meta.yml b/etl/steps/data/garden/imf/2024-11-25/world_economic_outlook.meta.yml new file mode 100644 index 00000000000..a14cc9b7857 --- /dev/null +++ b/etl/steps/data/garden/imf/2024-11-25/world_economic_outlook.meta.yml @@ -0,0 +1,58 @@ +definitions: + common: + presentation: + topic_tags: + - Economic Growth + processing_level: minor + +dataset: + update_period_days: 183 + +tables: + world_economic_outlook: + variables: + # GDP growth + gross_domestic_product__constant_prices__percent_change_observation: + title: Gross domestic product, constant prices - Percent change - Observations + unit: "%" + short_unit: "%" + description_short: "Annual percent change in gross domestic product. This data is adjusted for inflation." + description_from_producer: "Gross domestic product, constant prices. Percent change. Annual percentages of constant price GDP are year-on-year changes; the base year is country-specific. Expenditure-based GDP is total final expenditures at purchasers' prices (including the f.o.b. value of exports of goods and services), less the f.o.b. value of imports of goods and services. [SNA 1993]" + display: + numDecimalPlaces: 1 + tableDisplay: + hideRelativeChange: True + + gross_domestic_product__constant_prices__percent_change_forecast: + title: Gross domestic product, constant prices - Percent change - Forecasts + unit: "%" + short_unit: "%" + description_short: "Near-term projections of the annual percent change in gross domestic product. This data is adjusted for inflation." + description_from_producer: "Gross domestic product, constant prices. Percent change. Annual percentages of constant price GDP are year-on-year changes; the base year is country-specific. Expenditure-based GDP is total final expenditures at purchasers' prices (including the f.o.b. value of exports of goods and services), less the f.o.b. value of imports of goods and services. 
[SNA 1993]" + display: + numDecimalPlaces: 1 + tableDisplay: + hideRelativeChange: True + + # Unemployment rate + unemployment_rate__percent_of_total_labor_force_observation: + title: Unemployment rate - Percent of total labor force - Observations + unit: "%" + short_unit: "%" + description_short: "Unemployment refers to the share of the labor force that is without work but available for and seeking employment." + description_from_producer: "Unemployment rate can be defined by either the national definition, the ILO harmonized definition, or the OECD harmonized definition. The OECD harmonized unemployment rate gives the number of unemployed persons as a percentage of the labor force (the total number of people employed plus unemployed). [OECD Main Economic Indicators, OECD, monthly] As defined by the International Labour Organization, unemployed workers are those who are currently not working but are willing and able to work for pay, currently available to work, and have actively searched for work. [ILO, http://www.ilo.org/public/english/bureau/stat/res/index.htm]" + display: + numDecimalPlaces: 1 + tableDisplay: + hideRelativeChange: True + + unemployment_rate__percent_of_total_labor_force_forecast: + title: Unemployment rate - Percent of total labor force - Forecasts + unit: "%" + short_unit: "%" + description_short: "Near-term projections. Unemployment refers to the share of the labor force that is without work but available for and seeking employment." + description_from_producer: "Unemployment rate can be defined by either the national definition, the ILO harmonized definition, or the OECD harmonized definition. The OECD harmonized unemployment rate gives the number of unemployed persons as a percentage of the labor force (the total number of people employed plus unemployed). [OECD Main Economic Indicators, OECD, monthly] As defined by the International Labour Organization, unemployed workers are those who are currently not working but are willing and able to work for pay, currently available to work, and have actively searched for work. [ILO, http://www.ilo.org/public/english/bureau/stat/res/index.htm]" + display: + numDecimalPlaces: 1 + tableDisplay: + hideRelativeChange: True \ No newline at end of file diff --git a/etl/steps/data/garden/imf/2024-11-25/world_economic_outlook.py b/etl/steps/data/garden/imf/2024-11-25/world_economic_outlook.py new file mode 100644 index 00000000000..a528b98f01c --- /dev/null +++ b/etl/steps/data/garden/imf/2024-11-25/world_economic_outlook.py @@ -0,0 +1,38 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("world_economic_outlook") + + # Read table from meadow dataset. + tb = ds_meadow.read("world_economic_outlook") + + # + # Process data. + # + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + ) + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() diff --git a/etl/steps/data/garden/irena/2024-10-29/renewable_power_generation_costs.py b/etl/steps/data/garden/irena/2024-10-29/renewable_power_generation_costs.py index c88f7363483..252b73b8119 100644 --- a/etl/steps/data/garden/irena/2024-10-29/renewable_power_generation_costs.py +++ b/etl/steps/data/garden/irena/2024-10-29/renewable_power_generation_costs.py @@ -13,8 +13,8 @@ def run(dest_dir: str) -> None: # # Load meadow dataset and read its tables. ds_meadow = paths.load_dataset("renewable_power_generation_costs") - tb = ds_meadow.read_table("renewable_power_generation_costs") - tb_solar_pv = ds_meadow.read_table("solar_photovoltaic_module_prices", reset_index=False) + tb = ds_meadow.read("renewable_power_generation_costs") + tb_solar_pv = ds_meadow.read("solar_photovoltaic_module_prices", reset_index=False) # # Process data. diff --git a/etl/steps/data/garden/irena/2024-11-01/renewable_capacity_statistics.countries.json b/etl/steps/data/garden/irena/2024-11-01/renewable_capacity_statistics.countries.json new file mode 100644 index 00000000000..d470988c911 --- /dev/null +++ b/etl/steps/data/garden/irena/2024-11-01/renewable_capacity_statistics.countries.json @@ -0,0 +1,236 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Anguilla": "Anguilla", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Faroe Islands": "Faroe Islands", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Greenland": "Greenland", + "Grenada": "Grenada", + "Guadeloupe": "Guadeloupe", + "Guam": "Guam", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Martinique": 
"Martinique", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mayotte": "Mayotte", + "Mexico": "Mexico", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Montserrat": "Montserrat", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Nigeria": "Nigeria", + "Niue": "Niue", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Poland": "Poland", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Romania": "Romania", + "Rwanda": "Rwanda", + "Samoa": "Samoa", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Togo": "Togo", + "Tokelau": "Tokelau", + "Tonga": "Tonga", + "Tunisia": "Tunisia", + "Turkmenistan": "Turkmenistan", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Viet Nam": "Vietnam", + "World": "World", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Central America and the Caribbean": "Central America and the Caribbean (IRENA)", + "Chinese Taipei": "Taiwan", + "Middle East": "Middle East (IRENA)", + "T\u00fcrkiye": "Turkey", + "Africa": "Africa (IRENA)", + "Asia": "Asia (IRENA)", + "Europe": "Europe (IRENA)", + "Oceania": "Oceania (IRENA)", + "Eurasia": "Eurasia (IRENA)", + "American Samoa": "American Samoa", + "Antigua and Barbuda": "Antigua and Barbuda", + "Bahamas (the)": "Bahamas", + "Bolivia (Plurinational State of)": "Bolivia", + "Bonaire, Sint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "British Virgin Islands": "British Virgin Islands", + "Brunei Darussalam": "Brunei", + "Cayman Islands": "Cayman Islands", + "Central African Republic (the)": "Central African Republic", + "Comoros (the)": "Comoros", + "Congo (the)": "Congo", + "Cook Islands (the)": "Cook Islands", + "Cura\u00e7ao": "Curacao", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Dominican Republic (the)": "Dominican Republic", + "Equatorial Guinea": "Equatorial Guinea", + "Falkland Islands (Malvinas)": "Falkland Islands", + "French Guiana": "French Guiana", + "French Polynesia": "French Polynesia", + "Gambia (the)": "Gambia", + "Guinea-Bissau": "Guinea-Bissau", + "Iran (Islamic Republic of)": "Iran", + "Kosovo": "Kosovo", + "Lao People's Democratic Republic (the)": "Laos", + "Micronesia (Federated States of)": "Micronesia (country)", + "New Caledonia": "New Caledonia", + "Niger (the)": "Niger", + "North America": "North America (IRENA)", + "Papua New Guinea": "Papua New Guinea", + "Philippines (the)": "Philippines", + "Russian Federation (the)": "Russia", + "R\u00e9union": "Reunion", + "Saint Barth\u00e9lemy": "Saint Barthelemy", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Martin (French Part)": "Saint 
Martin (French part)", + "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Sao Tome and Principe": "Sao Tome and Principe", + "Sint Maarten (Dutch Part)": "Sint Maarten (Dutch part)", + "Solomon Islands": "Solomon Islands", + "South America": "South America (IRENA)", + "South Georgia and the South Sandwich Islands": "South Georgia and the South Sandwich Islands", + "Sudan (the)": "Sudan", + "Syrian Arab Republic (the)": "Syria", + "Timor-Leste": "East Timor", + "Trinidad and Tobago": "Trinidad and Tobago", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "United States Virgin Islands": "United States Virgin Islands", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "China, Hong Kong Special Administrative Region": "Hong Kong", + "Democratic People's Republic of Korea (the)": "North Korea", + "Democratic Republic of the Congo (the)": "Democratic Republic of Congo", + "Marshall Islands (the)": "Marshall Islands", + "Netherlands (Kingdom of the)": "Netherlands", + "Republic of Korea (the)": "South Korea", + "Republic of Moldova (the)": "Moldova", + "State of Palestine (the)": "Palestine", + "United Arab Emirates (the)": "United Arab Emirates", + "United Kingdom of Great Britain and Northern Ireland (the)": "United Kingdom", + "United Republic of Tanzania (the)": "Tanzania", + "United States of America (the)": "United States" +} diff --git a/etl/steps/data/garden/irena/2024-11-01/renewable_capacity_statistics.meta.yml b/etl/steps/data/garden/irena/2024-11-01/renewable_capacity_statistics.meta.yml new file mode 100644 index 00000000000..8319a3ae4ad --- /dev/null +++ b/etl/steps/data/garden/irena/2024-11-01/renewable_capacity_statistics.meta.yml @@ -0,0 +1,231 @@ +definitions: + common: + presentation: + topic_tags: + - Energy + short_unit: MW + unit: megawatts + +dataset: + update_period_days: 365 + +tables: + renewable_capacity_statistics: + title: Renewable electricity capacity + variables: + bioenergy__total: + title: Bioenergy capacity (total) + description_short: |- + Total bioenergy (on- and off-grid) electricity installed capacity, measured in megawatts. This includes biogas, liquid biofuels, solid biofuels, and renewable municipal waste. + display: + name: Bioenergy + presentation: + title_public: Total bioenergy capacity + biogas: + title: Biogas capacity (on-grid) + description_short: |- + Biogas (on-grid) electricity installed capacity, measured in megawatts. + display: + name: Biogas + presentation: + title_public: Biogas capacity + biogas__off_grid: + title: Biogas capacity (off-grid) + description_short: |- + Biogas (off-grid) electricity installed capacity, measured in megawatts. + display: + name: Biogas (off-grid) + presentation: + title_public: Biogas capacity (off-grid) + geothermal: + title: Geothermal capacity (on-grid) + description_short: |- + Geothermal (on-grid) electricity installed capacity, measured in megawatts. + display: + name: Geothermal + presentation: + title_public: Geothermal capacity + geothermal__off_grid: + title: Geothermal capacity (off-grid) + description_short: |- + Geothermal (off-grid) electricity installed capacity, measured in megawatts. + display: + name: Geothermal (off-grid) + presentation: + title_public: Geothermal capacity (off-grid) + geothermal__total: + title: Geothermal capacity (total) + description_short: |- + Total geothermal (on- and off-grid) electricity installed capacity, measured in megawatts. 
+ display: + name: Geothermal (total) + presentation: + title_public: Total geothermal capacity + hydropower: + title: Hydropower capacity (on-grid) + description_short: |- + Hydropower (on-grid) electricity installed capacity, measured in megawatts. + display: + name: Hydropower + presentation: + title_public: Hydropower capacity + hydropower__off_grid: + title: Hydropower capacity (off-grid) + description_short: |- + Hydropower (off-grid) electricity installed capacity, measured in megawatts. + display: + name: Hydropower (off-grid) + presentation: + title_public: Hydropower capacity (off-grid) + hydropower__total: + title: Hydropower capacity (total) + description_short: |- + Total hydropower (on- and off-grid) electricity installed capacity, including pumped storage, measured in megawatts. This includes mixed hydro plants. + display: + name: Hydropower (incl. pumped storage) + presentation: + title_public: Total hydropower capacity + hydropower__total__excl__pumped_storage: + title: Hydropower capacity (total, excluding pumped storage) + description_short: |- + Total hydropower (on- and off-grid) electricity installed capacity, measured in megawatts. This includes mixed hydro plants, but excludes pumped storage. + display: + name: Hydropower (excl. pumped storage) + presentation: + title_public: Total hydropower capacity (excluding pumped storage) + liquid_biofuels: + title: Liquid biofuels capacity (on-grid) + description_short: |- + Liquid biofuels (on-grid) electricity installed capacity, measured in megawatts. + display: + name: Liquid biofuels + presentation: + title_public: Liquid biofuels capacity + liquid_biofuels__off_grid: + title: Liquid biofuels capacity (off-grid) + description_short: |- + Liquid biofuels (off-grid) electricity installed capacity, measured in megawatts. + display: + name: Liquid biofuels (off-grid) + presentation: + title_public: Liquid biofuels capacity (off-grid) + marine: + title: Marine capacity (on-grid) + description_short: |- + Marine (on-grid) electricity installed capacity, measured in megawatts. + display: + name: Marine + presentation: + title_public: Marine capacity + mixed_hydro_plants: + title: Mixed hydro plants capacity (on-grid) + description_short: |- + Mixed hydro plants (on-grid) electricity installed capacity, measured in megawatts. + display: + name: Mixed hydro plants + presentation: + title_public: Mixed hydro plants capacity + offshore_wind: + title: Offshore wind capacity (on-grid) + description_short: |- + Offshore wind (on-grid) electricity installed capacity, measured in megawatts. + display: + name: Offshore wind + presentation: + title_public: Offshore wind capacity + onshore_wind: + title: Onshore wind capacity (on-grid) + description_short: |- + Onshore wind (on-grid) electricity installed capacity, measured in megawatts. + display: + name: Onshore wind + presentation: + title_public: Onshore wind capacity + onshore_wind__off_grid: + title: Onshore wind capacity (off-grid) + description_short: |- + Onshore wind (off-grid) electricity installed capacity, measured in megawatts. + display: + name: Onshore wind (off-grid) + presentation: + title_public: Onshore wind capacity (off-grid) + pumped_storage: + title: Pumped storage capacity (on-grid) + description_short: |- + Pumped storage (on-grid) electricity installed capacity, measured in megawatts. 
+ display: + name: Pumped storage + presentation: + title_public: Pumped storage capacity + renewable_municipal_waste: + title: Renewable municipal waste capacity (on-grid) + description_short: |- + Renewable municipal waste (on-grid) electricity installed capacity, measured in megawatts. + display: + name: Renewable municipal waste + presentation: + title_public: Renewable municipal waste capacity + renewables__total: + title: Renewable electricity capacity (total) + description_short: |- + Total renewable (on- and off-grid) electricity installed capacity, measured in megawatts. This includes bioenergy, geothermal, hydropower (excluding pumped storage), solar, wind, and marine energy. + display: + name: Renewable electricity + presentation: + title_public: Total renewable capacity + solar__total: + title: Solar capacity (total) + description_short: |- + Total solar (on- and off-grid) electricity installed capacity, measured in megawatts. This includes solar photovoltaic and concentrated solar power. + display: + name: Solar + presentation: + title_public: Total solar capacity + solar_photovoltaic: + title: Solar photovoltaic capacity (on-grid) + description_short: |- + Solar photovoltaic (on-grid) electricity installed capacity, measured in megawatts. + display: + name: Solar photovoltaic + presentation: + title_public: Solar photovoltaic capacity + solar_photovoltaic__off_grid: + title: Solar photovoltaic capacity (off-grid) + description_short: |- + Solar photovoltaic (off-grid) electricity installed capacity, measured in megawatts. + display: + name: Solar photovoltaic (off-grid) + presentation: + title_public: Solar photovoltaic capacity (off-grid) + concentrated_solar_power: + title: Concentrated solar power capacity (on-grid) + description_short: |- + Concentrated solar power (on-grid) electricity installed capacity, measured in megawatts. + display: + name: Concentrated solar power + presentation: + title_public: Concentrated solar power capacity + solid_biofuels: + title: Solid biofuels capacity (on-grid) + description_short: |- + Solid biofuels (on-grid) electricity installed capacity, measured in megawatts. + display: + name: Solid biofuels + presentation: + title_public: Solid biofuels capacity + solid_biofuels__off_grid: + title: Solid biofuels capacity (off-grid) + description_short: |- + Solid biofuels (off-grid) electricity installed capacity, measured in megawatts. + display: + name: Solid biofuels (off-grid) + presentation: + title_public: Solid biofuels capacity (off-grid) + wind__total: + title: Wind capacity (total) + description_short: |- + Total wind (on- and off-grid) electricity installed capacity, measured in megawatts. This includes onshore and offshore wind. + display: + name: Wind + presentation: + title_public: Total wind capacity diff --git a/etl/steps/data/garden/irena/2024-11-01/renewable_capacity_statistics.py b/etl/steps/data/garden/irena/2024-11-01/renewable_capacity_statistics.py new file mode 100644 index 00000000000..6884b7c3eb7 --- /dev/null +++ b/etl/steps/data/garden/irena/2024-11-01/renewable_capacity_statistics.py @@ -0,0 +1,408 @@ +"""Create a dataset of renewable electricity capacity using IRENA's Renewable Electricity Capacity and Generation. 
+ +We will map the input data as follows (to generate the following mapping, uncomment the DEBUGGING section below): + +[Old categories] -> [New categories] +Renewable or not | Producer type | Group technology | Technology | Sub-technology -> Producer type | Technology + +No |Off-grid|Fossil fuels |Coal and peat |Coal and peat -> Off-grid|Coal and peat +No |Off-grid|Fossil fuels |Other fossil fuels |Fossil fuels n.e.s. -> Off-grid|Other fossil fuels +No |Off-grid|Fossil fuels |Natural gas |Natural gas -> Off-grid|Natural gas +No |Off-grid|Fossil fuels |Oil |Oil -> Off-grid|Oil +No |On-grid |Fossil fuels |Coal and peat |Coal and peat -> On-grid |Coal and peat +No |On-grid |Fossil fuels |Other fossil fuels |Fossil fuels n.e.s. -> On-grid |Other fossil fuels +No |On-grid |Fossil fuels |Natural gas |Natural gas -> On-grid |Natural gas +No |On-grid |Fossil fuels |Oil |Oil -> On-grid |Oil +No |On-grid |Nuclear |Nuclear |Nuclear -> On-grid |Nuclear +No |On-grid |Other non-renewable |Other non-renewable |Other non-renewable energ -> On-grid |Other non-renewable +No |On-grid |Pumped storage |Pumped storage |Pumped storage -> On-grid |Pumped storage +Yes|Off-grid|Bioenergy |Biogas |Biogas n.e.s. -> Off-grid|Biogas +Yes|Off-grid|Bioenergy |Biogas |Biogases from thermal pro -> Off-grid|Biogas +Yes|Off-grid|Bioenergy |Biogas |Landfill gas -> Off-grid|Biogas +Yes|Off-grid|Bioenergy |Biogas |Other biogases from anaer -> Off-grid|Biogas +Yes|Off-grid|Bioenergy |Liquid biofuels |Other liquid biofuels -> Off-grid|Liquid biofuels +Yes|Off-grid|Bioenergy |Solid biofuels |Animal waste -> Off-grid|Solid biofuels +Yes|Off-grid|Bioenergy |Solid biofuels |Bagasse -> Off-grid|Solid biofuels +Yes|Off-grid|Bioenergy |Solid biofuels |Black liquor -> Off-grid|Solid biofuels +Yes|Off-grid|Bioenergy |Solid biofuels |Energy crops -> Off-grid|Solid biofuels +Yes|Off-grid|Bioenergy |Solid biofuels |Other primary solid biofu -> Off-grid|Solid biofuels +Yes|Off-grid|Bioenergy |Solid biofuels |Other vegetal and agricul -> Off-grid|Solid biofuels +Yes|Off-grid|Bioenergy |Solid biofuels |Rice husks -> Off-grid|Solid biofuels +Yes|Off-grid|Bioenergy |Solid biofuels |Wood fuel -> Off-grid|Solid biofuels +Yes|Off-grid|Bioenergy |Solid biofuels |Wood waste -> Off-grid|Solid biofuels +Yes|Off-grid|Geothermal |Geothermal |Geothermal energy -> Off-grid|Geothermal +Yes|Off-grid|Hydropower |Hydropower |Renewable hydropower -> Off-grid|Hydropower +Yes|Off-grid|Solar |Solar photovoltaic |Off-grid Solar photovolta -> Off-grid|Solar photovoltaic +Yes|Off-grid|Wind |Onshore wind |Onshore wind energy -> Off-grid|Onshore wind +Yes|On-grid |Bioenergy |Biogas |Biogas n.e.s. 
-> On-grid |Biogas +Yes|On-grid |Bioenergy |Biogas |Biogases from thermal pro -> On-grid |Biogas +Yes|On-grid |Bioenergy |Biogas |Landfill gas -> On-grid |Biogas +Yes|On-grid |Bioenergy |Biogas |Other biogases from anaer -> On-grid |Biogas +Yes|On-grid |Bioenergy |Biogas |Sewage sludge gas -> On-grid |Biogas +Yes|On-grid |Bioenergy |Liquid biofuels |Advanced biodiesel -> On-grid |Liquid biofuels +Yes|On-grid |Bioenergy |Liquid biofuels |Advanced biogasoline -> On-grid |Liquid biofuels +Yes|On-grid |Bioenergy |Liquid biofuels |Conventional biodiesel -> On-grid |Liquid biofuels +Yes|On-grid |Bioenergy |Liquid biofuels |Other liquid biofuels -> On-grid |Liquid biofuels +Yes|On-grid |Bioenergy |Renewable municipal waste|Renewable municipal waste -> On-grid |Renewable municipal waste +Yes|On-grid |Bioenergy |Solid biofuels |Animal waste -> On-grid |Solid biofuels +Yes|On-grid |Bioenergy |Solid biofuels |Bagasse -> On-grid |Solid biofuels +Yes|On-grid |Bioenergy |Solid biofuels |Biomass pellets and briqu -> On-grid |Solid biofuels +Yes|On-grid |Bioenergy |Solid biofuels |Black liquor -> On-grid |Solid biofuels +Yes|On-grid |Bioenergy |Solid biofuels |Energy crops -> On-grid |Solid biofuels +Yes|On-grid |Bioenergy |Solid biofuels |Other primary solid biofu -> On-grid |Solid biofuels +Yes|On-grid |Bioenergy |Solid biofuels |Other vegetal and agricul -> On-grid |Solid biofuels +Yes|On-grid |Bioenergy |Solid biofuels |Renewable industrial wast -> On-grid |Solid biofuels +Yes|On-grid |Bioenergy |Solid biofuels |Rice husks -> On-grid |Solid biofuels +Yes|On-grid |Bioenergy |Solid biofuels |Straw -> On-grid |Solid biofuels +Yes|On-grid |Bioenergy |Solid biofuels |Wood fuel -> On-grid |Solid biofuels +Yes|On-grid |Bioenergy |Solid biofuels |Wood waste -> On-grid |Solid biofuels +Yes|On-grid |Geothermal |Geothermal |Geothermal energy -> On-grid |Geothermal +Yes|On-grid |Hydropower |Mixed hydro plants |Mixed Hydro Plants -> On-grid |Mixed hydro plants +Yes|On-grid |Hydropower |Hydropower |Renewable hydropower -> On-grid |Hydropower +Yes|On-grid |Marine |Marine |Marine energy -> On-grid |Marine +Yes|On-grid |Solar |Solar photovoltaic |On-grid Solar photovoltai -> On-grid |Solar photovoltaic +Yes|On-grid |Solar |Solar thermal |Concentrated solar power -> On-grid |Concentrated solar power +Yes|On-grid |Wind |Offshore wind |Offshore wind energy -> On-grid |Offshore wind +Yes|On-grid |Wind |Onshore wind |Onshore wind energy -> On-grid |Onshore wind + +""" +import owid.catalog.processing as pr +from owid.catalog import Table +from owid.datautils.dataframes import map_series + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Select and rename columns. +# NOTE: IRENA includes non-renewable technologies and heat indicators, but for now we will only consider renewable electricity. +COLUMNS = { + "country": "country", + "year": "year", + "re_or_non_re": "is_renewable", + "group_technology": "group_technology", + "technology": "technology", + "sub_technology": "sub_technology", + "producer_type": "producer_type", + "electricity_installed_capacity__mw": "capacity", +} + +# Mapping of different categories. 
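+# NOTE: Each sub-dictionary below maps the raw labels in IRENA's data (keys) to the harmonized labels used in this step (values). +# The mapping is applied in remap_categories() via map_series, which warns both when a raw label is missing from the mapping +# and when a mapped label is never used, so category changes in a future update will be flagged. 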
+CATEGORY_MAPPING = { + "is_renewable": { + "Total Non-Renewable": "No", + "Total Renewable": "Yes", + }, + "producer_type": { + "Off-grid electricity": "Off-grid", + "On-grid electricity": "On-grid", + }, + "group_technology": { + "Fossil fuels": "Fossil fuels", + "Nuclear": "Nuclear", + "Other non-renewable energy": "Other non-renewable", + "Pumped storage": "Pumped storage", + "Bioenergy": "Bioenergy", + "Geothermal energy": "Geothermal", + "Hydropower (excl. Pumped Storage)": "Hydropower", + "Solar energy": "Solar", + "Wind energy": "Wind", + "Marine energy": "Marine", + }, + "technology": { + "Coal and peat": "Coal and peat", + "Fossil fuels n.e.s.": "Other fossil fuels", + "Natural gas": "Natural gas", + "Oil": "Oil", + "Nuclear": "Nuclear", + "Other non-renewable energy": "Other non-renewable", + "Pumped storage": "Pumped storage", + "Biogas": "Biogas", + "Liquid biofuels": "Liquid biofuels", + "Solid biofuels": "Solid biofuels", + "Geothermal energy": "Geothermal", + "Renewable hydropower": "Hydropower", + "Solar photovoltaic": "Solar photovoltaic", + "Onshore wind energy": "Onshore wind", + "Renewable municipal waste": "Renewable municipal waste", + "Mixed Hydro Plants": "Mixed hydro plants", + "Marine energy": "Marine", + "Solar thermal energy": "Concentrated solar power", + "Offshore wind energy": "Offshore wind", + }, + # NOTE: Sub-technologies will not be stored (we will keep data aggregated at the technology level). + # However, we keep this mapping just to be warned in case the data changes in a future update. + "sub_technology": { + "Onshore wind energy": "Onshore wind energy", + "Straw": "Straw", + "Pumped storage": "Pumped storage", + "Advanced biodiesel": "Advanced biodiesel", + "Oil": "Oil", + "Energy crops": "Energy crops", + "Rice husks": "Rice husks", + "Renewable industrial waste": "Renewable industrial waste", + "Coal and peat": "Coal and peat", + "Renewable hydropower": "Renewable hydropower", + "Advanced biogasoline": "Advanced biogasoline", + "Natural gas": "Natural gas", + "On-grid Solar photovoltaic": "On-grid Solar photovoltaic", + "Biogas n.e.s.": "Biogas n.e.s.", + "Sewage sludge gas": "Sewage sludge gas", + "Bagasse": "Bagasse", + "Offshore wind energy": "Offshore wind energy", + "Biogases from thermal processes": "Biogases from thermal processes", + "Other biogases from anaerobic fermentation": "Other biogases from anaerobic fermentation", + "Renewable municipal waste": "Renewable municipal waste", + "Biomass pellets and briquettes": "Biomass pellets and briquettes", + "Marine energy": "Marine energy", + "Nuclear": "Nuclear", + "Geothermal energy": "Geothermal energy", + "Black liquor": "Black liquor", + "Fossil fuels n.e.s.": "Fossil fuels n.e.s.", + "Other liquid biofuels": "Other liquid biofuels", + "Conventional biodiesel": "Conventional biodiesel", + "Off-grid Solar photovoltaic": "Off-grid Solar photovoltaic", + "Other vegetal and agricultural waste": "Other vegetal and agricultural waste", + "Animal waste": "Animal waste", + "Concentrated solar power": "Concentrated solar power", + "Mixed Hydro Plants": "Mixed Hydro Plants", + "Other primary solid biofuels n.e.s.": "Other primary solid biofuels n.e.s.", + "Landfill gas": "Landfill gas", + "Wood waste": "Wood waste", + "Other non-renewable energy": "Other non-renewable energy", + "Wood fuel": "Wood fuel", + }, +} +# Create new groups for total capacity of each technology. +# NOTE: The following groups will include both on-grid and off-grid. The new producer type will be "Both". 
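+# For example, for a given country and year, "Solar (total)" is the sum of "Solar photovoltaic" and "Concentrated solar power" +# capacity across all producer types (on- and off-grid), and the resulting row is assigned the new producer type "Both". 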
+NEW_GROUPS = { + "Fossil fuels (total)": ["Coal and peat", "Other fossil fuels", "Natural gas", "Oil"], + "Bioenergy (total)": ["Biogas", "Liquid biofuels", "Solid biofuels", "Renewable municipal waste"], + # In the PDF of IRENA's Renewable Capacity Statistics, they show: + # * "Renewable hydropower (including mixed plants)" which includes Hydropower + Mixed hydro plants. + # * "Hydropower" which includes Hydropower + Mixed hydro plants + Pumped storage. + # * "Total renewable energy" which includes all renewables, but excludes Pumped storage. + # So, for consistency with them, we will create one hydropower total group that includes pumped storage, and another that doesn't. + # And, when constructing the total of renewables, pumped storage will not be included. + # Also note that other totals seem to include off-grid capacity. + # For example, "Solar" in the PDF is the sum of on- and off-grid "Solar photovoltaic" and "Concentrated solar power". + "Hydropower (total)": ["Hydropower", "Mixed hydro plants", "Pumped storage"], + "Hydropower (total, excl. pumped storage)": ["Hydropower", "Mixed hydro plants"], + "Solar (total)": ["Solar photovoltaic", "Concentrated solar power"], + "Wind (total)": ["Onshore wind", "Offshore wind"], + "Renewables (total)": [ + "Bioenergy (total)", + "Geothermal", + "Hydropower (total, excl. pumped storage)", + "Solar (total)", + "Wind (total)", + "Marine", + ], + "Geothermal (total)": ["Geothermal"], + # Other groups could be created, but since they each contain a single item (for one producer type), they would be redundant. + # "Nuclear": ["Nuclear"], + # "Other non-renewable": ["Other non-renewable"], + # "Pumped storage": ["Pumped storage"], + # "Marine": ["Marine"], +} + +# We will keep only technologies that appear explicitly in the Renewable Capacity Statistics 2024 document. +# So we will exclude the rest. +# NOTE: We do this after creating all aggregates, in case in the future we decide to include them. +EXCLUDE_TECHNOLOGIES = [ + "Fossil fuels (total)", + "Coal and peat", + "Other fossil fuels", + "Natural gas", + "Oil", + "Nuclear", + "Other non-renewable", +] + +# Regions for which aggregates will be created. +REGIONS = [ + "North America", + "South America", + "Europe", + "European Union (27)", + "Africa", + "Asia", + "Oceania", + "Low-income countries", + "Upper-middle-income countries", + "Lower-middle-income countries", + "High-income countries", + "World", +] + + +def remove_original_regional_and_global_data(tb: Table, tb_global: Table) -> Table: + # The spreadsheet doesn't explicitly say whether global data corresponds to off-grid, on-grid, or both. + # After inspection, it seems to be only on-grid. + # Check that adding up the capacity of all on-grid technologies, sub-technologies and countries reproduces global data + # (within a certain percentage error). + aggregates = ["World"] + [region for region in set(tb["country"]) if "(IRENA)" in region] + _tb_global = ( + tb[(tb["producer_type"] == "On-grid electricity") & (~tb["country"].isin(aggregates))] + .groupby(["group_technology", "year"], observed=True, as_index=False) + .agg({"capacity": "sum"}) + ) + check = tb_global.merge(_tb_global, on=["group_technology", "year"], suffixes=("", "_sum"), validate="1:1") + error = "Adding up on-grid capacity for all countries does not add up to global data." 
+ assert check[(100 * abs(check["capacity_sum"] - check["capacity"]) / check["capacity"]) > 6].empty, error + + # Drop global and regional data (they will be recalculated afterwards consistently). + tb = tb.loc[~tb["country"].isin(aggregates)].reset_index(drop=True) + + # Check that the only index columns strictly required are producer type and sub-technology. + error = "Expected producer type and sub-technology (together with country-year) to form a unique index." + assert len( + tb[["is_renewable", "group_technology", "technology", "sub_technology", "producer_type"]].drop_duplicates() + ) == len(tb[["producer_type", "sub_technology"]].drop_duplicates()), error + + return tb + + +def remap_categories(tb: Table) -> Table: + # Store the number of unique categories and unique combinations (up to the technology level) before mapping. + n_categories = { + category: len(set(tb[category])) + for category in ["is_renewable", "group_technology", "technology", "sub_technology", "producer_type"] + } + # NOTE: Do not wrap the table in set() here: set(df) iterates over column names, which would make this check vacuous. + n_combinations = len(tb[["is_renewable", "group_technology", "technology", "producer_type"]].drop_duplicates()) + # Rename categories conveniently. + for category in CATEGORY_MAPPING: + tb[category] = map_series( + tb[category], + mapping=CATEGORY_MAPPING[category], + warn_on_missing_mappings=True, + warn_on_unused_mappings=True, + show_full_warning=True, + ) + # Check that the number of unique categories and unique combinations (up to the technology level) are the same as before mapping. + error = "Unexpected number of unique categories after mapping." + assert { + category: len(set(tb[category])) + for category in ["is_renewable", "group_technology", "technology", "sub_technology", "producer_type"] + } == n_categories, error + assert ( + len(tb[["is_renewable", "group_technology", "technology", "producer_type"]].drop_duplicates()) + == n_combinations + ), error + + # We will group at the technology level. + # DEBUGGING: Print the final mapping. + # _technologies = ["is_renewable", "producer_type", "group_technology", "technology", "sub_technology"] + # for _, row in tb.sort_values(_technologies)[_technologies].drop_duplicates().iterrows(): + # print(f"{row['is_renewable']:<3}|{row['producer_type']:<8}|{row['group_technology']:<20}|{row['technology']:<20}|{row['sub_technology'][:25]:<25} -> {row['producer_type']:<8}|{row['technology']:<20}") + + # Group by producer type and technology (therefore dropping the sub-technology level). + tb = tb.groupby(["country", "year", "producer_type", "technology"], observed=True, as_index=False).agg( + {"capacity": "sum"} + ) + + return tb + + +def sanity_check_outputs(tb: Table, tb_global: Table) -> None: + # Just for peace of mind, check again that the resulting global data (for on-grid technologies) matches (within a small error) the original global data. + _tb_global = ( + tb[(tb["producer_type"] == "On-grid") & (tb["country"] == "World")] + .groupby(["year"], observed=True, as_index=False) + .agg({"capacity": "sum"}) + ) + check = ( + tb_global.groupby("year", observed=True, as_index=False) + .agg({"capacity": "sum"}) + .merge(_tb_global, on="year", suffixes=("", "_sum"), validate="1:1") + ) + error = "Adding up on-grid capacity for all countries does not add up to global data." + assert check[(100 * abs(check["capacity_sum"] - check["capacity"]) / check["capacity"]) > 1].empty, error + + # Check that there are no missing values or negative values. + error = "Unexpected missing values." 
+ assert tb.notnull().all().all(), error + error = "Unexpected negative values." + assert (tb["capacity"] >= 0).all(), error + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load dataset from Meadow and read its main table. + ds_meadow = paths.load_dataset("renewable_capacity_statistics") + tb = ds_meadow.read("renewable_capacity_statistics") + + # Load regions dataset. + ds_regions = paths.load_dataset("regions") + + # Load income groups dataset. + ds_income_groups = paths.load_dataset("income_groups") + + # + # Process data. + # + # Select and rename columns. + tb = tb[list(COLUMNS)].rename(columns=COLUMNS, errors="raise") + + # Drop empty rows. + tb = tb.dropna(subset="capacity").reset_index(drop=True) + + # Harmonize country names. + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + # Get original global data (used for sanity checks). + tb_global = tb[(tb["country"] == "World")][["group_technology", "year", "capacity"]].reset_index(drop=True) + + # Remove original regional and global data, and perform some sanity checks. + tb = remove_original_regional_and_global_data(tb=tb, tb_global=tb_global) # type: ignore + + # Remap categories. + tb = remap_categories(tb=tb) + + # Add region aggregates. + tb = geo.add_regions_to_table( + tb, + index_columns=["country", "year", "producer_type", "technology"], + regions=REGIONS, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + min_num_values_per_year=1, + ) + + # Add groups with total capacity (e.g. "Solar (total)"). + # NOTE: This loop relies on the insertion order of NEW_GROUPS: composite groups like "Renewables (total)" are built from totals created in earlier iterations. + for group_name, group_members in NEW_GROUPS.items(): + _tb = ( + tb[(tb["technology"].isin(group_members))] + .groupby(["country", "year"], observed=True, as_index=False) + .agg({"capacity": "sum"}) + .assign(**{"technology": group_name, "producer_type": "Both"}) + ) + tb = pr.concat([tb, _tb], ignore_index=True) + + # Sanity check outputs. + sanity_check_outputs(tb=tb, tb_global=tb_global) # type: ignore + + # Exclude technologies that are not explicitly mentioned in IRENA's Renewable Capacity Statistics 2024 document. + tb = tb[~tb["technology"].isin(EXCLUDE_TECHNOLOGIES)].reset_index(drop=True) + + # Change from long to wide format. + off_grid_filter = tb["producer_type"] == "Off-grid" + tb["technology"] = tb["technology"].astype(str) + tb.loc[off_grid_filter, "technology"] = tb[off_grid_filter]["technology"] + " (off-grid)" + tb = tb.drop(columns="producer_type").pivot( + index=["country", "year"], columns="technology", values="capacity", join_column_levels_with="_" + ) + + # Set an appropriate index and sort conveniently. + tb = tb.format(keys=["country", "year"], sort_columns=True) + + # + # Save outputs. + # + # Create a new garden dataset. 
+ ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/irena/2024-11-15/renewable_power_generation_costs.countries.json b/etl/steps/data/garden/irena/2024-11-15/renewable_power_generation_costs.countries.json new file mode 100644 index 00000000000..649968e89e6 --- /dev/null +++ b/etl/steps/data/garden/irena/2024-11-15/renewable_power_generation_costs.countries.json @@ -0,0 +1,49 @@ +{ + "Argentina": "Argentina", + "Australia": "Australia", + "Austria": "Austria", + "Brazil": "Brazil", + "Canada": "Canada", + "Chile": "Chile", + "China": "China", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cyprus": "Cyprus", + "Denmark": "Denmark", + "Dominican Republic": "Dominican Republic", + "Egypt": "Egypt", + "Ethiopia": "Ethiopia", + "Finland": "Finland", + "France": "France", + "Germany": "Germany", + "Greece": "Greece", + "India": "India", + "Indonesia": "Indonesia", + "Ireland": "Ireland", + "Italy": "Italy", + "Japan": "Japan", + "Mexico": "Mexico", + "Morocco": "Morocco", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Norway": "Norway", + "Pakistan": "Pakistan", + "Panama": "Panama", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Republic of Korea": "South Korea", + "Russian Federation": "Russia", + "Saudi Arabia": "Saudi Arabia", + "South Africa": "South Africa", + "Spain": "Spain", + "Sweden": "Sweden", + "Turkey": "Turkey", + "Ukraine": "Ukraine", + "United Kingdom": "United Kingdom", + "United States": "United States", + "Uruguay": "Uruguay", + "Viet Nam": "Vietnam", + "World": "World" +} diff --git a/etl/steps/data/garden/irena/2024-11-15/renewable_power_generation_costs.meta.yml b/etl/steps/data/garden/irena/2024-11-15/renewable_power_generation_costs.meta.yml new file mode 100644 index 00000000000..43eb368cb5e --- /dev/null +++ b/etl/steps/data/garden/irena/2024-11-15/renewable_power_generation_costs.meta.yml @@ -0,0 +1,64 @@ +definitions: + common: + processing_level: minor + presentation: + topic_tags: + - Energy + +dataset: + update_period_days: 365 + +tables: + renewable_power_generation_costs: + title: Renewable power generation costs + common: + description_key: + - Levelized cost of energy (LCOE) estimates the average cost per unit of energy generated across the lifetime of a new power plant. It is measured in US$ per kilowatt-hour. 
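+ # As a rough sketch (not necessarily IRENA's exact methodology): LCOE = (discounted sum of lifetime investment, + # operation and fuel costs) / (discounted sum of lifetime electricity generation). 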
+ variables: + bioenergy: + title: Bioenergy levelized cost of energy + display: + name: Bioenergy + presentation: + title_public: Bioenergy levelized cost of energy + concentrated_solar_power: + title: Concentrated solar power levelized cost of energy + display: + name: Concentrated solar power + presentation: + title_public: Concentrated solar power levelized cost of energy + geothermal: + title: Geothermal levelized cost of energy + display: + name: Geothermal + presentation: + title_public: Geothermal levelized cost of energy + hydropower: + title: Hydropower levelized cost of energy + display: + name: Hydropower + presentation: + title_public: Hydropower levelized cost of energy + offshore_wind: + title: Offshore wind levelized cost of energy + display: + name: Offshore wind + presentation: + title_public: Offshore wind levelized cost of energy + onshore_wind: + title: Onshore wind levelized cost of energy + display: + name: Onshore wind + presentation: + title_public: Onshore wind levelized cost of energy + solar_photovoltaic: + title: Solar photovoltaic levelized cost of energy + display: + name: Solar photovoltaic + presentation: + title_public: Solar photovoltaic levelized cost of energy + solar_photovoltaic_module_prices: + title: Solar photovoltaic module prices + variables: + cost: + title: Solar photovoltaic module prices diff --git a/etl/steps/data/garden/irena/2024-11-15/renewable_power_generation_costs.py b/etl/steps/data/garden/irena/2024-11-15/renewable_power_generation_costs.py new file mode 100644 index 00000000000..2fc99d47fe7 --- /dev/null +++ b/etl/steps/data/garden/irena/2024-11-15/renewable_power_generation_costs.py @@ -0,0 +1,35 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its tables. + ds_meadow = paths.load_dataset("renewable_power_generation_costs") + tb = ds_meadow.read("renewable_power_generation_costs", safe_types=False) + tb_solar_pv = ds_meadow.read("solar_photovoltaic_module_prices", reset_index=False, safe_types=False) + + # + # Process data. + # + # Harmonize country names. + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + # Improve table formatting. + tb = tb.format() + + # + # Save outputs. + # + # Create a new garden dataset. 
+ ds_garden = create_dataset( + dest_dir, tables=[tb, tb_solar_pv], default_metadata=ds_meadow.metadata, check_variables_metadata=True + ) + ds_garden.save() diff --git a/etl/steps/data/garden/itopf/2024-10-16/oil_spills.py b/etl/steps/data/garden/itopf/2024-10-16/oil_spills.py index 7da88d81656..5578924f81b 100644 --- a/etl/steps/data/garden/itopf/2024-10-16/oil_spills.py +++ b/etl/steps/data/garden/itopf/2024-10-16/oil_spills.py @@ -30,7 +30,7 @@ def run(dest_dir: str) -> None: ) # set NaN everywhere except start of a decade tb.loc[mask, "decadal_" + str(column)] = tb.loc[mask, "decadal_" + str(column)].where( - tb.loc[mask, "year"] % 10 == 0, np.nan + tb.loc[mask, "year"].astype(int) % 10 == 0, np.nan ) # Replace any '__' in column names with a space (done because of double _ in some variable names) diff --git a/etl/steps/data/garden/iucn/2022-12-08/threatened_and_evaluated_species.py b/etl/steps/data/garden/iucn/2022-12-08/threatened_and_evaluated_species.py index a4968d65180..02d9f00c5ef 100644 --- a/etl/steps/data/garden/iucn/2022-12-08/threatened_and_evaluated_species.py +++ b/etl/steps/data/garden/iucn/2022-12-08/threatened_and_evaluated_species.py @@ -11,7 +11,7 @@ def run(dest_dir: str) -> None: # Load data from snapshot. # snap = paths.load_snapshot() - tb = snap.read().set_index(["country", "year"]) + tb = snap.read(safe_types=False).set_index(["country", "year"]) # # Save outputs. diff --git a/etl/steps/data/garden/lgbt_rights/2023-04-27/lgbti_policy_index.py b/etl/steps/data/garden/lgbt_rights/2023-04-27/lgbti_policy_index.py index 159b6cc0de6..991d924a498 100644 --- a/etl/steps/data/garden/lgbt_rights/2023-04-27/lgbti_policy_index.py +++ b/etl/steps/data/garden/lgbt_rights/2023-04-27/lgbti_policy_index.py @@ -56,7 +56,7 @@ def run(dest_dir: str) -> None: ds_population = paths.load_dataset("population") # Read table from meadow dataset. - tb = ds_meadow["lgbti_policy_index"].reset_index() + tb = ds_meadow.read("lgbti_policy_index") # # Process data. 
diff --git a/etl/steps/data/garden/lis/2024-06-13/luxembourg_income_study.py b/etl/steps/data/garden/lis/2024-06-13/luxembourg_income_study.py index 3941e80aba1..028cc1ed7a4 100644 --- a/etl/steps/data/garden/lis/2024-06-13/luxembourg_income_study.py +++ b/etl/steps/data/garden/lis/2024-06-13/luxembourg_income_study.py @@ -145,7 +145,7 @@ def make_table_wide(tb: Table, cols_to_wide: List[str]) -> Table: tb = tb.drop(columns=["dataset"]) # Change names of equivalized variable, to create a distinguishable name - tb["eq"] = tb["eq"].replace({1: "eq", 0: "pc"}) + tb["eq"] = tb["eq"].astype(int).replace({1: "eq", 0: "pc"}).astype("string[pyarrow]") # Create pivot table and join different levels of column tb = tb.pivot(index=["country", "year"], columns=cols_to_wide, join_column_levels_with="_").reset_index(drop=True) @@ -155,7 +155,7 @@ def make_table_wide(tb: Table, cols_to_wide: List[str]) -> Table: # Load `keyvars` meadow dataset, rename and drop variables def load_keyvars(age: str, ds_meadow: Dataset) -> Table: - tb_keyvars = ds_meadow[f"lis_keyvars{age}"].reset_index() + tb_keyvars = ds_meadow.read(f"lis_keyvars{age}", safe_types=False) # Use less technical names for some variables tb_keyvars.columns = tb_keyvars.columns.str.replace("fgt0", "headcount_ratio") @@ -214,7 +214,7 @@ def create_relative_pov_variables(tb_keyvars: Table, relative_povlines: List[int # Load `abs_poverty` meadow dataset, rename variables def load_abs_poverty(tb_keyvars: Table, age: str, ds_meadow: Dataset) -> Table: - tb_abs_poverty = ds_meadow[f"lis_abs_poverty{age}"].reset_index() + tb_abs_poverty = ds_meadow.read(f"lis_abs_poverty{age}", safe_types=False) # Add population variable from keyvars tb_abs_poverty = pr.merge( @@ -260,7 +260,7 @@ def create_absolute_pov_variables(tb_abs_poverty: Table) -> Table: # Load `distribution` meadow dataset, rename variables def load_distribution(age: str, ds_meadow: Dataset) -> Table: - tb_distribution = ds_meadow[f"lis_distribution{age}"].reset_index() + tb_distribution = ds_meadow.read(f"lis_distribution{age}", safe_types=False) # Transform percentile variable to `pxx` tb_distribution["percentile"] = "p" + tb_distribution["percentile"].astype(str) @@ -327,13 +327,13 @@ def create_distributional_variables(tb_distribution: Table, age: str, ds_meadow: def percentiles_table(tb_name: str, ds_meadow: Dataset, tb_keyvars: Table) -> Table: # Read table from meadow dataset. 
- tb = ds_meadow[tb_name].reset_index() + tb = ds_meadow.read(tb_name, safe_types=False) # Drop dataset variable tb = tb.drop(columns=["dataset"]) # Change names of equivalized variable, to create a distinguishable name - tb["eq"] = tb["eq"].replace({1: "eq", 0: "pc"}) + tb["eq"] = tb["eq"].astype(int).replace({1: "eq", 0: "pc"}).astype("string[pyarrow]") # Rename variable column to welfare tb = tb.rename(columns={"variable": "welfare", "eq": "equivalization"}) diff --git a/etl/steps/data/garden/maternal_mortality/2024-07-08/maternal_mortality.meta.yml b/etl/steps/data/garden/maternal_mortality/2024-07-08/maternal_mortality.meta.yml index 768460ae604..21f6c02e771 100644 --- a/etl/steps/data/garden/maternal_mortality/2024-07-08/maternal_mortality.meta.yml +++ b/etl/steps/data/garden/maternal_mortality/2024-07-08/maternal_mortality.meta.yml @@ -37,8 +37,10 @@ tables: short_unit: "" display: numDecimalPlaces: 1 + entityAnnotationsMap: 'United States: Values from 2003–2017 affected by measurement change' description_key: - "{definitions.description_maternal_mortality}" + mm_rate: title: Maternal mortality rate description_short: |- diff --git a/etl/steps/data/garden/met_office_hadley_centre/2024-11-18/near_surface_temperature.meta.yml b/etl/steps/data/garden/met_office_hadley_centre/2024-11-18/near_surface_temperature.meta.yml new file mode 100644 index 00000000000..464ce66829d --- /dev/null +++ b/etl/steps/data/garden/met_office_hadley_centre/2024-11-18/near_surface_temperature.meta.yml @@ -0,0 +1,28 @@ +definitions: + common: + description_key: + - Temperature anomalies are given in degrees Celsius relative to the average temperature over the period 1961-1990. + - Temperature anomalies are available for the Northern Hemisphere and the Southern Hemisphere. + - The global mean is calculated by averaging anomalies for northern and southern hemispheres. + presentation: + topic_tags: + - Climate Change + +dataset: + update_period_days: 90 + +tables: + near_surface_temperature: + variables: + temperature_anomaly: + title: Global average temperature anomaly relative to 1961-1990 + short_unit: °C + unit: degrees Celsius + upper_limit: + title: Upper bound of the annual temperature anomaly (95% confidence interval) + short_unit: °C + unit: degrees Celsius + lower_limit: + title: Lower bound of the annual temperature anomaly (95% confidence interval) + short_unit: °C + unit: degrees Celsius diff --git a/etl/steps/data/garden/met_office_hadley_centre/2024-11-18/near_surface_temperature.py b/etl/steps/data/garden/met_office_hadley_centre/2024-11-18/near_surface_temperature.py new file mode 100644 index 00000000000..1fcefd7510f --- /dev/null +++ b/etl/steps/data/garden/met_office_hadley_centre/2024-11-18/near_surface_temperature.py @@ -0,0 +1,22 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("near_surface_temperature") + tb_meadow = ds_meadow["near_surface_temperature"] + + # + # Save outputs. + # + # Create a new garden dataset. 
+ ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_meadow], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/migration/2024-11-18/migration_between_regions.meta.yml b/etl/steps/data/garden/migration/2024-11-18/migration_between_regions.meta.yml new file mode 100644 index 00000000000..92551c5a449 --- /dev/null +++ b/etl/steps/data/garden/migration/2024-11-18/migration_between_regions.meta.yml @@ -0,0 +1,17 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Migration + description_processing: We aggregated migrants between different regions, using our definition of continents and the World Bank definition of regions. + processing_level: major + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + title: Migration between regions + + diff --git a/etl/steps/data/garden/migration/2024-11-18/migration_between_regions.py b/etl/steps/data/garden/migration/2024-11-18/migration_between_regions.py new file mode 100644 index 00000000000..e053657eb5d --- /dev/null +++ b/etl/steps/data/garden/migration/2024-11-18/migration_between_regions.py @@ -0,0 +1,57 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +REGIONS = [reg for reg in geo.REGIONS.keys() if reg != "European Union (27)"] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_migration = paths.load_dataset("migrant_stock") + ds_regions = paths.load_dataset("regions") + ds_income_groups = paths.load_dataset("income_groups") + + # Read table from meadow dataset. + tb = ds_migration["migrant_stock_dest_origin"].reset_index() + + # Aggregate regions (twice, once for destination and once for origin). + tb_reg = geo.add_regions_to_table( + tb, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + index_columns=["country_destination", "country_origin", "year"], + country_col="country_destination", + frac_allowed_nans_per_year=0.1, + ) + + tb_reg = geo.add_regions_to_table( + tb_reg, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + index_columns=["country_destination", "country_origin", "year"], + country_col="country_origin", + frac_allowed_nans_per_year=0.1, + ) + + # Filter only on regions + tb_reg = tb_reg[tb_reg["country_destination"].isin(REGIONS) & tb_reg["country_origin"].isin(REGIONS)] + + tb_reg = tb_reg.format(["country_destination", "country_origin", "year"], short_name="migration_between_regions") + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb_reg], check_variables_metadata=True, default_metadata=ds_migration.metadata + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() diff --git a/etl/steps/data/garden/migration/2024-11-19/migration_distance.countries.json b/etl/steps/data/garden/migration/2024-11-19/migration_distance.countries.json new file mode 100644 index 00000000000..e3cbe39c0a3 --- /dev/null +++ b/etl/steps/data/garden/migration/2024-11-19/migration_distance.countries.json @@ -0,0 +1,260 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Angola": "Angola", + "Antarctica": "Antarctica", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bangladesh": "Bangladesh", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Congo": "Congo", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Greenland": "Greenland", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Laos": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Mali": "Mali", + "Mauritania": "Mauritania", + "Mexico": "Mexico", + "Moldova": "Moldova", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "North Korea": "North Korea", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palestine": "Palestine", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Romania": "Romania", + "Russia": "Russia", + "Rwanda": "Rwanda", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Sierra Leone": "Sierra Leone", + "Slovakia": 
"Slovakia", + "Slovenia": "Slovenia", + "Somalia": "Somalia", + "Somaliland": "Somaliland", + "South Africa": "South Africa", + "South Korea": "South Korea", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syria": "Syria", + "Taiwan": "Taiwan", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "Togo": "Togo", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela": "Venezuela", + "Vietnam": "Vietnam", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Fr. S. Antarctic Lands": "French Southern Territories", + "American Samoa": "American Samoa", + "Andorra": "Andorra", + "Anguilla": "Anguilla", + "Antigua and Barbuda": "Antigua and Barbuda", + "Aruba": "Aruba", + "Bahrain": "Bahrain", + "Barbados": "Barbados", + "Bermuda": "Bermuda", + "British Indian Ocean Territory": "British Indian Ocean Territory", + "British Virgin Islands": "British Virgin Islands", + "Cayman Islands": "Cayman Islands", + "Comoros": "Comoros", + "Cook Islands": "Cook Islands", + "Dominica": "Dominica", + "Faroe Islands": "Faroe Islands", + "French Polynesia": "French Polynesia", + "Gibraltar": "Gibraltar", + "Grenada": "Grenada", + "Guam": "Guam", + "Guernsey": "Guernsey", + "Isle of Man": "Isle of Man", + "Jersey": "Jersey", + "Kiribati": "Kiribati", + "Liechtenstein": "Liechtenstein", + "Maldives": "Maldives", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritius": "Mauritius", + "Monaco": "Monaco", + "Nauru": "Nauru", + "Niue": "Niue", + "Northern Mariana Islands": "Northern Mariana Islands", + "Palau": "Palau", + "Saint Barthelemy": "Saint Barthelemy", + "Saint Helena": "Saint Helena", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Seychelles": "Seychelles", + "Singapore": "Singapore", + "Tonga": "Tonga", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "United States Virgin Islands": "United States Virgin Islands", + "Vatican": "Vatican", + "Wallis and Futuna": "Wallis and Futuna", + "Aland Islands": "Aland Islands", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Cape Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Cote d'Ivoire": "Cote d'Ivoire", + "Curacao": "Curacao", + "Democratic Republic of Congo": "Democratic Republic of Congo", + "Dominican Republic": "Dominican Republic", + "East Timor": "East Timor", + "Equatorial Guinea": "Equatorial Guinea", + "Eswatini": "Eswatini", + "Falkland Islands": "Falkland Islands", + "Heard Island and McDonald Islands": "Heard Island and McDonald Islands", + "Hong Kong": "Hong Kong", + "Macao": "Macao", + "Micronesia (country)": "Micronesia (country)", + "Montserrat": "Montserrat", + "Norfolk Island": "Norfolk Island", + "Northern Cyprus": "Northern Cyprus", + "Pitcairn": "Pitcairn", + "Saint Martin (French part)": "Saint Martin (French part)", + "Sint Maarten 
(Dutch part)": "Sint Maarten (Dutch part)", + "Solomon Islands": "Solomon Islands", + "South Sudan": "South Sudan", + "United States": "United States", + "Western Sahara": "Western Sahara", + "USNB Guantanamo Bay": "Guantanamo Bay", + "Akrotiri": "Akrotiri", + "Serranilla Bank": "Serranilla Bank", + "S. Geo. and the Is.": "South Georgia and the South Sandwich Islands", + "Bir Tawil": "Bir Tawil", + "Scarborough Reef": "Scarborough Reef", + "Spratly Is.": "Spratly Islands", + "Coral Sea Is.": "Coral Sea Islands", + "U.S. Minor Outlying Is.": "U.S. Minor Outlying Islands", + "Siachen Glacier": "Siachen Glacier", + "Indian Ocean Ter.": "Indian Ocean Territories", + "Southern Patagonian Ice Field": "Southern Patagonian Ice Field", + "Cyprus U.N. Buffer Zone": "Cyprus U.N. Buffer Zone", + "Dhekelia": "Dhekelia", + "Clipperton I.": "Clipperton Island", + "Ashmore and Cartier Is.": "Ashmore and Cartier Islands", + "Bajo Nuevo Bank": "Bajo Nuevo Bank", + "Baikonur": "Baikonur", + "Brazilian I.": "Brazilian Island" +} diff --git a/etl/steps/data/garden/migration/2024-11-19/migration_distance.meta.yml b/etl/steps/data/garden/migration/2024-11-19/migration_distance.meta.yml new file mode 100644 index 00000000000..23950136cf1 --- /dev/null +++ b/etl/steps/data/garden/migration/2024-11-19/migration_distance.meta.yml @@ -0,0 +1,37 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Migration + processing_level: major + description_processing: >- + - The migration distance indicator measures the median distance between the borders of the origin and destination country of migrants. It is calculated by first taking the minimum distance between the borders of origin and destination countries for each migrant, and then taking the median of these minimum distances. + - The indicator is based on the UN DESA International Migrant Stock data, which provides the number of international migrants by country of origin and destination. The geospatial data for the borders of countries is retrieved from the Natural Earth project (https://www.naturalearthdata.com/downloads/110m-cultural-vectors/), specifically the "Admin 0 - countries and tiny country points" datasets. + + +dataset: + update_period_days: 365 + title: Median migration distance (between borders) + description: |- + This data set includes the median distance between the borders of the origin and destination country of migrants. It is calculated by first taking the minimum distance between the borders of origin and destination countries for each migrant, and then calculating the median of these minimum distances. + + The number of international migrants and their origin and destination countries are based on the UN DESA International Migrant Stock data. The geospatial data for the borders of countries is retrieved from the Natural Earth project (https://www.naturalearthdata.com/downloads/110m-cultural-vectors/), specifically the admin 0 - countries and tiny country points datasets. + + + +tables: + migration_distance: + variables: + median_distance: + title: Median migration distance for emigrants leaving this country + description_short: |- + The median distance between the borders of the origin and destination country of migrants. It is calculated by first taking the minimum distance between the borders of origin and destination countries for each migrant, and then calculating the median of these minimum distances. 
+ unit: kilometers + short_unit: km + total_emigrants: + title: Total number of migrants leaving this country + description_short: The total number of international migrants leaving this country. + unit: persons + short_unit: "" + diff --git a/etl/steps/data/garden/migration/2024-11-19/migration_distance.py b/etl/steps/data/garden/migration/2024-11-19/migration_distance.py new file mode 100644 index 00000000000..f48fd00d872 --- /dev/null +++ b/etl/steps/data/garden/migration/2024-11-19/migration_distance.py @@ -0,0 +1,139 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import warnings + +import geopandas as gpd +import pandas as pd +import structlog +from geopy.distance import geodesic +from shapely import wkt +from shapely.ops import nearest_points +from tqdm import tqdm + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +LOG = structlog.get_logger() + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("migrant_stock") + ds_nat_earth = paths.load_dataset("nat_earth_110") + + # Read table from meadow dataset. + tb = ds_meadow["migrant_stock_dest_origin"].reset_index() + tb_countries = ds_nat_earth["nat_earth_110"].reset_index() + + # Read natural earth data + # Convert the geometry string column to a Shapely object. + tb_countries["geometry"] = tb_countries["geometry"].apply(wkt.loads) + world = gpd.GeoDataFrame(tb_countries, geometry="geometry") + + # use World Geodetic System 1984 as projection + world = world.set_crs("EPSG:4326") + + # harmonize country names + world = geo.harmonize_countries( + df=world, # type: ignore + country_col="name", + countries_file=paths.country_mapping_path, + ) + + # Calculate distance matrix (in km) (catch warnings to ignore "invalid value encountered" warning) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=RuntimeWarning) + distance_matrix = calculate_distance_matrix(world) + + ## Add distances to migration flows table + # Remove countries not included in nat earth data and "Other" from country destination or country origin columns + cty_no_data = [ + "Other", + "Tokelau", + "Bonaire Sint Eustatius and Saba", + "French Guiana", + "Guadeloupe", + "Martinique", + "Reunion", + "Channel Islands", + "Mayotte", + ] + + cty_data = [cty for cty in tb["country_origin"].unique() if cty not in cty_no_data] + tb = tb[(tb["country_destination"].isin(cty_data)) & (tb["country_origin"].isin(cty_data))] + + # Add distance to the table + tb["distance"] = tb.apply( + lambda row: distance_matrix.loc[row["country_origin"], row["country_destination"]], axis=1 + ) + tb["distance"] = tb["distance"].apply(get_min_distance).astype("Float64") + + migrant_groups = tb.groupby(["country_origin", "year"]) + med_distance = migrant_groups.apply(calc_median).reset_index() + med_distance["median_distance"] = med_distance[0].apply(lambda x: x[0]) + med_distance["total_emigrants"] = med_distance[0].apply(lambda x: x[1]) + med_distance = med_distance.drop(columns=[0]).copy_metadata(tb) + + med_distance.metadata.dataset.short_name = "migration_distance" + med_distance.metadata.short_name = "migration_distance" + + for col in med_distance.columns: + med_distance[col].metadata.origins = tb["country_origin"].m.origins + tb_countries["name"].m.origins + + med_distance = med_distance.format(["country_origin", "year"]) + + # + # Save outputs. 
+ # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset(dest_dir, tables=[med_distance], check_variables_metadata=True, default_metadata=None) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def get_min_distance(distance): + """Get the minimum distance between two countries. + Sometimes the distance is a DataFrame or Series (if country is represented by multiple polygons). Then we take the minimum distance.""" + if isinstance(distance, pd.Series): + return distance.min() + elif isinstance(distance, pd.DataFrame): + return distance.min().min() + else: + return distance + + +def calc_median(group, col="distance"): + """Calculate the median distance for each country origin and year.""" + group = group.sort_values(by=col) + group["cumulative_journeys"] = group["migrants_all_sexes"].cumsum() + total_journeys = group["migrants_all_sexes"].sum() + median_journey = total_journeys / 2 + median_dist = group[group["cumulative_journeys"] >= median_journey].iloc[0]["distance"] + return median_dist, total_journeys + + +def calculate_distance_matrix(world): + # Create an empty distance matrix + distance_matrix = pd.DataFrame(index=world["name"], columns=world["name"]) + + for i, row1 in tqdm(world.iterrows(), total=len(world), desc="Calculating distance matrix"): + for j, row2 in world.iterrows(): + if i == j: + distance_matrix.iloc[i, j] = 0 # Distance to itself + elif i > j: + distance_matrix.iloc[i, j] = distance_matrix.iloc[j, i] # Distance is symmetric + else: + # Get the nearest points between two geometries + point1, point2 = nearest_points(row1.geometry, row2.geometry) # type: ignore + + # Calculate geodesic distance between the nearest points + distance_matrix.iloc[i, j] = geodesic((point1.y, point1.x), (point2.y, point2.x)).kilometers # type: ignore + + return distance_matrix diff --git a/etl/steps/data/garden/minerals/2024-07-15/minerals.meta.yml b/etl/steps/data/garden/minerals/2024-07-15/minerals.meta.yml index 21a8f682331..51a5d64a43b 100644 --- a/etl/steps/data/garden/minerals/2024-07-15/minerals.meta.yml +++ b/etl/steps/data/garden/minerals/2024-07-15/minerals.meta.yml @@ -999,6 +999,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of aluminum, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Aluminum + shortUnit: $ presentation: title_public: Aluminum unit value unit_value_antimony_mine_constant_1998_usd_per_tonne: @@ -1007,6 +1010,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [antimony](#dod:antimony), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Antimony + shortUnit: $ presentation: title_public: Antimony unit value unit_value_arsenic_processing_constant_1998_usd_per_tonne: @@ -1015,6 +1021,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of arsenic, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Arsenic + shortUnit: $ presentation: title_public: Arsenic unit value unit_value_asbestos_mine_constant_1998_usd_per_tonne: @@ -1023,6 +1032,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of asbestos, in constant 1998 US$ per tonne. 
description_key: *unit-value-description-key + display: + name: Asbestos + shortUnit: $ presentation: title_public: Asbestos unit value unit_value_barite_mine_constant_1998_usd_per_tonne: @@ -1031,6 +1043,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [barite](#dod:barite), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Barite + shortUnit: $ presentation: title_public: Barite unit value unit_value_bauxite_mine_constant_1998_usd_per_tonne: @@ -1039,6 +1054,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [bauxite](#dod:bauxite), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Bauxite + shortUnit: $ presentation: title_public: Bauxite unit value unit_value_beryllium_mine_constant_1998_usd_per_tonne: @@ -1047,6 +1065,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [beryllium](#dod:beryllium), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Beryllium + shortUnit: $ presentation: title_public: Beryllium unit value unit_value_bismuth_mine_constant_1998_usd_per_tonne: @@ -1055,6 +1076,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [bismuth](#dod:bismuth), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Bismuth + shortUnit: $ presentation: title_public: Bismuth unit value unit_value_boron_mine_constant_1998_usd_per_tonne: @@ -1063,6 +1087,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [boron](#dod:boron), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Boron + shortUnit: $ presentation: title_public: Boron unit value unit_value_cadmium_refinery_constant_1998_usd_per_tonne: @@ -1071,6 +1098,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [cadmium](#dod:cadmium), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Cadmium + shortUnit: $ presentation: title_public: Refined cadmium unit value unit_value_cement_processing_constant_1998_usd_per_tonne: @@ -1079,6 +1109,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of cement, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Cement + shortUnit: $ presentation: title_public: Cement unit value # unit_value_chromium_mine_constant_1998_usd_per_tonne: @@ -1087,6 +1120,9 @@ tables: # short_unit: $/t # description_short: Value of 1 tonne of chromium, in constant 1998 US$ per tonne. # description_key: *unit-value-description-key + # display: + # name: Chromium + # shortUnit: $ # presentation: # title_public: Chromium unit value unit_value_cobalt_value_constant_1998_usd_per_tonne: @@ -1095,6 +1131,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [cobalt](#dod:cobalt), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Cobalt + shortUnit: $ presentation: title_public: Cobalt unit value unit_value_copper_mine_constant_1998_usd_per_tonne: @@ -1103,6 +1142,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of copper, in constant 1998 US$ per tonne. 
description_key: *unit-value-description-key + display: + name: Copper + shortUnit: $ presentation: title_public: Copper unit value unit_value_diamond_mine_and_synthetic__industrial_constant_1998_usd_per_tonne: @@ -1111,6 +1153,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of diamond, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Diamond + shortUnit: $ presentation: title_public: Diamond unit value unit_value_feldspar_mine_constant_1998_usd_per_tonne: @@ -1119,6 +1164,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [feldspar](#dod:feldspar), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Feldspar + shortUnit: $ presentation: title_public: Feldspar unit value unit_value_fluorspar_mine_constant_1998_usd_per_tonne: @@ -1127,6 +1175,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [fluorspar](#dod:fluorspar), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Fluorspar + shortUnit: $ presentation: title_public: Fluorspar unit value unit_value_gallium_refinery_constant_1998_usd_per_tonne: @@ -1135,6 +1186,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [gallium](#dod:gallium), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Gallium + shortUnit: $ presentation: title_public: Refined gallium unit value unit_value_garnet_mine_constant_1998_usd_per_tonne: @@ -1143,6 +1197,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [garnet](#dod:garnet), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Garnet + shortUnit: $ presentation: title_public: Garnet unit value unit_value_germanium_refinery_constant_1998_usd_per_tonne: @@ -1151,6 +1208,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [germanium](#dod:germanium), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Germanium + shortUnit: $ presentation: title_public: Refined germanium unit value unit_value_gold_mine_constant_1998_usd_per_tonne: @@ -1159,6 +1219,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of gold, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Gold + shortUnit: $ presentation: title_public: Gold unit value unit_value_graphite_mine_constant_1998_usd_per_tonne: @@ -1167,6 +1230,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [graphite](#dod:graphite), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Graphite + shortUnit: $ presentation: title_public: Graphite unit value unit_value_gypsum_mine_constant_1998_usd_per_tonne: @@ -1175,6 +1241,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [gypsum](#dod:gypsum), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Gypsum + shortUnit: $ presentation: title_public: Gypsum unit value unit_value_helium_mine_constant_1998_usd_per_tonne: @@ -1183,6 +1252,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of helium, in constant 1998 US$ per tonne. 
description_key: *unit-value-description-key + display: + name: Helium + shortUnit: $ presentation: title_public: Helium unit value unit_value_iron_ore_mine__crude_ore_constant_1998_usd_per_tonne: @@ -1191,6 +1263,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of iron ore, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Iron ore + shortUnit: $ presentation: title_public: Iron ore (crude ore) unit value unit_value_iron_smelter__pig_iron_constant_1998_usd_per_tonne: @@ -1199,6 +1274,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of iron, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Iron + shortUnit: $ presentation: title_public: Iron unit value unit_value_lead_mine_constant_1998_usd_per_tonne: @@ -1207,6 +1285,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of lead, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Lead + shortUnit: $ presentation: title_public: Lead unit value unit_value_lime_processing_constant_1998_usd_per_tonne: @@ -1215,14 +1296,31 @@ tables: short_unit: $/t description_short: Value of 1 tonne of lime, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Lime + shortUnit: $ presentation: title_public: Lime unit value + unit_value_lithium_mine_constant_1998_usd_per_tonne: + title: unit value|Lithium|Mine|constant 1998 US$ per tonne + unit: constant 1998 US$ per tonne + short_unit: $/t + description_short: Value of 1 tonne of lithium, in constant 1998 US$ per tonne. + description_key: *unit-value-description-key + display: + name: Lithium + shortUnit: $ + presentation: + title_public: Lithium unit value unit_value_magnesium_compounds_mine_constant_1998_usd_per_tonne: title: unit value|Magnesium compounds|Mine|constant 1998 US$ per tonne unit: constant 1998 US$ per tonne short_unit: $/t description_short: Value of 1 tonne of magnesium compounds, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Magnesium compounds + shortUnit: $ presentation: title_public: Magnesium compounds unit value unit_value_magnesium_metal_smelter_constant_1998_usd_per_tonne: @@ -1231,6 +1329,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of magnesium metal, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Magnesium metal + shortUnit: $ presentation: title_public: Magnesium metal unit value unit_value_manganese_mine_constant_1998_usd_per_tonne: @@ -1239,6 +1340,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of manganese, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Manganese + shortUnit: $ presentation: title_public: Manganese unit value unit_value_mercury_mine_constant_1998_usd_per_tonne: @@ -1247,6 +1351,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of mercury, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Mercury + shortUnit: $ presentation: title_public: Mercury unit value unit_value_mica_mine__scrap_and_flake_constant_1998_usd_per_tonne: @@ -1255,6 +1362,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [mica](#dod:mica) (scrap and flake), in constant 1998 US$ per tonne. 
description_key: *unit-value-description-key + display: + name: Mica (scrap and flake) + shortUnit: $ presentation: title_public: Mica (scrap and flake) unit value unit_value_mica_mine__sheet_constant_1998_usd_per_tonne: @@ -1263,6 +1373,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [mica](#dod:mica) (sheet), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Mica (sheet) + shortUnit: $ presentation: title_public: Mica (sheet) unit value unit_value_molybdenum_mine_constant_1998_usd_per_tonne: @@ -1271,6 +1384,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [molybdenum](#dod:molybdenum), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Molybdenum + shortUnit: $ presentation: title_public: Molybdenum unit value unit_value_nickel_mine_constant_1998_usd_per_tonne: @@ -1279,6 +1395,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of nickel, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Nickel + shortUnit: $ presentation: title_public: Nickel unit value unit_value_niobium_mine_constant_1998_usd_per_tonne: @@ -1287,6 +1406,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [niobium](#dod:niobium), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Niobium + shortUnit: $ presentation: title_public: Niobium unit value unit_value_nitrogen_fixed_ammonia_constant_1998_usd_per_tonne: @@ -1295,6 +1417,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of nitrogen, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Nitrogen + shortUnit: $ presentation: title_public: Nitrogen unit value unit_value_phosphate_rock_mine_constant_1998_usd_per_tonne: @@ -1303,14 +1428,31 @@ tables: short_unit: $/t description_short: Value of 1 tonne of phosphate rock, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Phosphate rock + shortUnit: $ presentation: title_public: Phosphate rock unit value + unit_value_rare_earths_mine_constant_1998_usd_per_tonne: + title: unit value|Rare earths|Mine|constant 1998 US$ per tonne + unit: constant 1998 US$ per tonne + short_unit: $/t + description_short: Value of 1 tonne of rare earths, in constant 1998 US$ per tonne. + description_key: *unit-value-description-key + display: + name: Rare earths + shortUnit: $ + presentation: + title_public: Rare earths unit value unit_value_salt_mine_constant_1998_usd_per_tonne: title: unit value|Salt|Mine|constant 1998 US$ per tonne unit: constant 1998 US$ per tonne short_unit: $/t description_short: Value of 1 tonne of salt, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Salt + shortUnit: $ presentation: title_public: Salt unit value unit_value_sand_and_gravel_mine__construction_constant_1998_usd_per_tonne: @@ -1319,6 +1461,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of sand and gravel, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Sand and gravel + shortUnit: $ presentation: title_public: Sand and gravel (construction) unit value unit_value_sand_and_gravel_mine__industrial_constant_1998_usd_per_tonne: @@ -1327,6 +1472,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of sand and gravel, in constant 1998 US$ per tonne. 
description_key: *unit-value-description-key + display: + name: Sand and gravel + shortUnit: $ presentation: title_public: Sand and gravel (industrial) unit value unit_value_selenium_refinery_constant_1998_usd_per_tonne: @@ -1335,6 +1483,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of selenium, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Selenium + shortUnit: $ presentation: title_public: Refined selenium unit value unit_value_silicon_processing_constant_1998_usd_per_tonne: @@ -1343,6 +1494,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of silicon, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Silicon + shortUnit: $ presentation: title_public: Silicon unit value unit_value_silver_mine_constant_1998_usd_per_tonne: @@ -1351,6 +1505,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of silver, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Silver + shortUnit: $ presentation: title_public: Silver unit value unit_value_steel_processing__crude_constant_1998_usd_per_tonne: @@ -1359,6 +1516,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of steel, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Steel + shortUnit: $ presentation: title_public: Steel unit value unit_value_strontium_mine_constant_1998_usd_per_tonne: @@ -1367,6 +1527,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [strontium](#dod:strontium), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Strontium + shortUnit: $ presentation: title_public: Strontium unit value unit_value_sulfur_processing_constant_1998_usd_per_tonne: @@ -1375,6 +1538,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of sulfur, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Sulfur + shortUnit: $ presentation: title_public: Sulfur unit value unit_value_talc_and_pyrophyllite_mine_constant_1998_usd_per_tonne: @@ -1383,6 +1549,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of talc and pyrophyllite, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Talc and pyrophyllite + shortUnit: $ presentation: title_public: Talc and pyrophyllite unit value unit_value_tantalum_mine_constant_1998_usd_per_tonne: @@ -1391,6 +1560,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [tantalum](#dod:tantalum), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Tantalum + shortUnit: $ presentation: title_public: Tantalum unit value unit_value_tin_mine_constant_1998_usd_per_tonne: @@ -1399,6 +1571,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of tin, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Tin + shortUnit: $ presentation: title_public: Tin unit value unit_value_tungsten_mine_constant_1998_usd_per_tonne: @@ -1407,6 +1582,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [tungsten](#dod:tungsten), in constant 1998 US$ per tonne. 
description_key: *unit-value-description-key + display: + name: Tungsten + shortUnit: $ presentation: title_public: Tungsten unit value unit_value_vanadium_mine_constant_1998_usd_per_tonne: @@ -1415,6 +1593,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of [vanadium](#dod:vanadium), in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Vanadium + shortUnit: $ presentation: title_public: Vanadium unit value unit_value_zinc_mine_constant_1998_usd_per_tonne: @@ -1423,6 +1604,9 @@ tables: short_unit: $/t description_short: Value of 1 tonne of zinc, in constant 1998 US$ per tonne. description_key: *unit-value-description-key + display: + name: Zinc + shortUnit: $ presentation: title_public: Zinc unit value share_of_global_production_aluminum_smelter_tonnes: diff --git a/etl/steps/data/garden/minerals/2024-07-15/minerals.py b/etl/steps/data/garden/minerals/2024-07-15/minerals.py index 4629110de23..60af1fc0c1c 100644 --- a/etl/steps/data/garden/minerals/2024-07-15/minerals.py +++ b/etl/steps/data/garden/minerals/2024-07-15/minerals.py @@ -237,9 +237,7 @@ ] # Columns to plot with the individual data sources differentiated. PLOT_TO_COMPARE_DATA_SOURCES = [ - # "production|Chromium|Mine|tonnes", - # 'production|Titanium|Mine, ilmenite|tonnes', - # 'production|Helium|Mine|tonnes', + # "production|Rhenium|Mine|tonnes", ] @@ -607,6 +605,10 @@ def combine_data( (tb["country"] != "World") & (tb["year"].isin([1992])), "production|Cobalt|Refinery|tonnes", ] = None + tb.loc[ + (tb["country"] != "World") & (tb["year"].isin([1977, 1978, 1979, 1983])), + "production|Iodine|Mine|tonnes", + ] = None #################################################################################################################### @@ -627,7 +629,7 @@ def combine_data( # # Visually compare the resulting Coal and Oil global data with the ones from the Statistical Review of World Energy. # from etl.paths import DATA_DIR - # tb_sr = Dataset(DATA_DIR / "garden/energy_institute/2024-06-20/statistical_review_of_world_energy").read_table("statistical_review_of_world_energy") + # tb_sr = Dataset(DATA_DIR / "garden/energy_institute/2024-06-20/statistical_review_of_world_energy").read("statistical_review_of_world_energy") # tb_sr = tb_sr[tb_sr["country"]=="World"][["country", "year", 'coal_production_mt', 'oil_production_mt']].rename(columns={"coal_production_mt": "production|Coal|Mine|tonnes", "oil_production_mt": "production|Petroleum|Crude|tonnes"}) # tb_sr[["production|Coal|Mine|tonnes", "production|Petroleum|Crude|tonnes"]] *= 1e6 # for column in ["production|Coal|Mine|tonnes", "production|Petroleum|Crude|tonnes"]: @@ -703,11 +705,9 @@ def run(dest_dir: str) -> None: ds_usgs = paths.load_dataset("mineral_commodity_summaries") # Read tables. - tb_usgs_historical_flat = ds_usgs_historical.read_table( - "historical_statistics_for_mineral_and_material_commodities_flat" - ) - tb_usgs_flat = ds_usgs.read_table("mineral_commodity_summaries_flat") - tb_bgs_flat = ds_bgs.read_table("world_mineral_statistics_flat") + tb_usgs_historical_flat = ds_usgs_historical.read("historical_statistics_for_mineral_and_material_commodities_flat") + tb_usgs_flat = ds_usgs.read("mineral_commodity_summaries_flat") + tb_bgs_flat = ds_bgs.read("world_mineral_statistics_flat") # Load regions dataset. # NOTE: It will only be used for sanity checks. 
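A note on the weighted-median logic in migration_distance.py above: calc_median sorts each origin-year group of journeys by distance and picks the distance at which the cumulative migrant count first reaches half of the total, i.e. a migrant-weighted median. Here is a minimal, self-contained sketch of that computation, using the same column names as the step; the toy numbers are invented for illustration:

import pandas as pd

def weighted_median_distance(group: pd.DataFrame) -> tuple:
    # Sort journeys by distance, then take the distance at which the
    # cumulative number of migrants first reaches half of the total.
    group = group.sort_values("distance")
    cumulative = group["migrants_all_sexes"].cumsum()
    total = group["migrants_all_sexes"].sum()
    median_distance = group["distance"][cumulative >= total / 2].iloc[0]
    return median_distance, total

# Toy data: 100 migrants travel 500 km, 10 travel 8000 km.
toy = pd.DataFrame({"distance": [8000.0, 500.0], "migrants_all_sexes": [10, 100]})
print(weighted_median_distance(toy))  # (500.0, 110): the few long journeys barely move the median

Because the median is weighted by migrant counts rather than by country pairs, a handful of long-distance corridors cannot dominate the statistic.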
diff --git a/etl/steps/data/garden/moatsos/2023-10-09/moatsos_historical_poverty.py b/etl/steps/data/garden/moatsos/2023-10-09/moatsos_historical_poverty.py index 485cd7981fa..05d4e3557e5 100644 --- a/etl/steps/data/garden/moatsos/2023-10-09/moatsos_historical_poverty.py +++ b/etl/steps/data/garden/moatsos/2023-10-09/moatsos_historical_poverty.py @@ -35,7 +35,7 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset("moatsos_historical_poverty") # Read table from meadow dataset. - tb = ds_meadow["moatsos_historical_poverty"].reset_index() + tb = ds_meadow.read("moatsos_historical_poverty", safe_types=False) # # Process data. @@ -113,10 +113,12 @@ def smooth_estimates(tb: Table, poverty_lines: list) -> Table: # Select decadal years, FIRST_YEAR and LAST_YEAR (HISTORICAL AND FULL) tb_smooth = tb_smooth[ - (tb_smooth["year"] % 10 == 0) | (tb_smooth["year"] == FIRST_YEAR) | (tb_smooth["year"] == LAST_YEAR_HISTORICAL) + (tb_smooth["year"].astype(int) % 10 == 0) + | (tb_smooth["year"] == FIRST_YEAR) + | (tb_smooth["year"] == LAST_YEAR_HISTORICAL) ].reset_index(drop=True) tb_smooth_cbn = tb_smooth_cbn[ - (tb_smooth_cbn["year"] % 10 == 0) + (tb_smooth_cbn["year"].astype(int) % 10 == 0) | (tb_smooth_cbn["year"] == FIRST_YEAR) | (tb_smooth_cbn["year"] == LAST_YEAR_FULL) ].reset_index(drop=True) diff --git a/etl/steps/data/garden/oecd/2024-04-10/income_distribution_database.py b/etl/steps/data/garden/oecd/2024-04-10/income_distribution_database.py index 1fc4b9d7e34..78d88be085e 100644 --- a/etl/steps/data/garden/oecd/2024-04-10/income_distribution_database.py +++ b/etl/steps/data/garden/oecd/2024-04-10/income_distribution_database.py @@ -72,7 +72,10 @@ def run(dest_dir: str) -> None: # # Create a new garden dataset with the same metadata as the meadow dataset. ds_garden = create_dataset( - dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + dest_dir, + tables=[tb], + check_variables_metadata=True, + default_metadata=ds_meadow.metadata, ) # Save changes in the new garden dataset. 
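The affordable-housing change in the next file is a pattern worth noting: casting an object column with astype(str) turns missing values into the literal string "nan", which then has to be matched by value, whereas the nullable "string" dtype keeps a real <NA> that fillna can handle directly. A small sketch of the difference, on a toy column rather than the actual step data:

import numpy as np
import pandas as pd

s = pd.Series(["National strategy", np.nan])

# Old pattern: NaN becomes the literal string "nan" and is replaced by value.
old = s.astype(str)
old[old == "nan"] = "Not applicable"

# New pattern: the nullable string dtype keeps a real <NA>, so fillna works.
new = s.astype("string").fillna("Not applicable")

assert old.tolist() == new.tolist() == ["National strategy", "Not applicable"]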
diff --git a/etl/steps/data/garden/oecd/2024-04-30/affordable_housing_database.py b/etl/steps/data/garden/oecd/2024-04-30/affordable_housing_database.py index 307a0206eb0..5af627a1883 100644 --- a/etl/steps/data/garden/oecd/2024-04-30/affordable_housing_database.py +++ b/etl/steps/data/garden/oecd/2024-04-30/affordable_housing_database.py @@ -164,8 +164,7 @@ def run(dest_dir: str) -> None: tb = pr.merge(tb, tb_strategy, on=["country", "year"], how="outer", short_name=paths.short_name) # Fill nan in type_of_strategy with Not applicable - tb["type_of_strategy"] = tb["type_of_strategy"].astype(str) - tb.loc[tb["type_of_strategy"] == "nan", "type_of_strategy"] = "Not applicable" + tb["type_of_strategy"] = tb["type_of_strategy"].astype("string").fillna("Not applicable") tb = tb.format(["country", "year"]) diff --git a/etl/steps/data/garden/ophi/2024-10-28/multidimensional_poverty_index.countries.json b/etl/steps/data/garden/ophi/2024-10-28/multidimensional_poverty_index.countries.json new file mode 100644 index 00000000000..602314eb086 --- /dev/null +++ b/etl/steps/data/garden/ophi/2024-10-28/multidimensional_poverty_index.countries.json @@ -0,0 +1,114 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Armenia": "Armenia", + "Bangladesh": "Bangladesh", + "Belize": "Belize", + "Benin": "Benin", + "Bolivia": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cote d'Ivoire": "Cote d'Ivoire", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Ghana": "Ghana", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "India": "India", + "Indonesia": "Indonesia", + "Iraq": "Iraq", + "Jamaica": "Jamaica", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kyrgyzstan": "Kyrgyzstan", + "Lao PDR": "Laos", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Mali": "Mali", + "Mauritania": "Mauritania", + "Mexico": "Mexico", + "Moldova": "Moldova", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Namibia": "Namibia", + "Nepal": "Nepal", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "North Macedonia": "North Macedonia", + "Pakistan": "Pakistan", + "Palestine, State of": "Palestine", + "Peru": "Peru", + "Philippines": "Philippines", + "Rwanda": "Rwanda", + "Sao Tome and Principe": "Sao Tome and Principe", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Sierra Leone": "Sierra Leone", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkmenistan": "Turkmenistan", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "Viet Nam": "Vietnam", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Congo, Democratic Republic of the": "Democratic Republic of Congo", + "Angola": "Angola", + "Argentina": "Argentina", + 
"Barbados": "Barbados", + "Bhutan": "Bhutan", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Costa Rica": "Costa Rica", + "Cuba": "Cuba", + "El Salvador": "El Salvador", + "Fiji": "Fiji", + "Georgia": "Georgia", + "Guatemala": "Guatemala", + "Kiribati": "Kiribati", + "Libya": "Libya", + "Maldives": "Maldives", + "Myanmar": "Myanmar", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Saint Lucia": "Saint Lucia", + "Samoa": "Samoa", + "Seychelles": "Seychelles", + "South Africa": "South Africa", + "Sri Lanka": "Sri Lanka", + "Tonga": "Tonga", + "Tuvalu": "Tuvalu", + "Uzbekistan": "Uzbekistan" +} \ No newline at end of file diff --git a/etl/steps/data/garden/ophi/2024-10-28/multidimensional_poverty_index.meta.yml b/etl/steps/data/garden/ophi/2024-10-28/multidimensional_poverty_index.meta.yml new file mode 100644 index 00000000000..bc50164ae0d --- /dev/null +++ b/etl/steps/data/garden/ophi/2024-10-28/multidimensional_poverty_index.meta.yml @@ -0,0 +1,332 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + processing_level: minor + display: &common_display + tolerance: 12 + presentation: + topic_tags: + - Poverty + faqs: + - fragment_id: mpi-definition + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: mpi-sources + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: mpi-indicators-unavailable + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: mpi-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: mpi-other-sources + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + + # Description from producer + description_from_producer: >- + The global MPI is a measure of acute poverty covering over 100 countries in the developing + regions of the world. This measure is based on the dual-cutoff counting approach to poverty + developed by Alkire and Foster (2011). The global MPI was developed in 2010 by Alkire and + Santos (2014, 2010) in collaboration with the UNDP’s Human Development Report Office + (HDRO). Since its inception, the global MPI has used information from 10 indicators, which are + grouped into three equally weighted dimensions: health, education, and living standards. + These dimensions are the same as those used in the UNDP’s Human Development Index. + + + In 2018, the first major revision of the global MPI was undertaken, considering improvements in + survey microdata and better align to the 2030 development agenda insofar as possible (Alkire and + Jahan, 2018; OPHI, 2018). The revision consisted of adjustments in the definition of five out of + the ten indicators, namely child mortality, nutrition, years of schooling, housing and assets. Alkire, + Kanagaratnam, Nogales and Suppa (2022) provide a comprehensive analysis of the consequences + of the 2018 revision. The normative and empirical decisions that underlie the revision of the global + MPI, and adjustments related to the child mortality, nutrition, years of schooling and housing + indicators are discussed in Alkire and Kanagaratnam (2021). The revision of assets indicator is + detailed in Vollmer and Alkire (2022). + + + The global MPI begins by establishing a deprivation profile for each person, showing which of the + 10 indicators they are deprived in. Each person is identified as deprived or non-deprived in each + indicator based on a deprivation cutoff. 
In the case of health and education, each + household member may be identified as deprived or not deprived according to available + information for other household members. For example, if any household member for whom data + exist is undernourished, each person in that household is considered deprived in nutrition. Taking + this approach – which was required by the data – does not reveal intrahousehold disparities, but + is intuitive and assumes shared positive (or negative) effects of achieving (or not achieving) certain + outcomes. Next, looking across indicators, each person’s deprivation score is constructed by + adding up the weights of the indicators in which they are deprived. The indicators use a nested + weight structure: equal weights across dimensions and an equal weight for each indicator within a + dimension. The normalised indicator weight structure of the global MPI means that the living + standard indicators receive lower weight than health and education related indicators because from + a policy perspective, each of the three dimensions is of roughly equal normative importance. + + + In the global MPI, a person is identified as multidimensionally poor or MPI poor if they are + deprived in at least one-third of the weighted MPI indicators. In other words, a person is MPI + poor if the person’s deprivation score is equal to or higher than the poverty cutoff of 33.33 percent. + After the poverty identification step, we aggregate across individuals to obtain the incidence of + poverty or headcount ratio (H) which represents the percentage of poor people in the population. + We then compute the intensity of poverty (A), representing the average percentage of weighted + deprivations experienced by the poor. We then compute the adjusted poverty headcount ratio (M0) + or MPI by combining H and A in a multiplicative form (MPI = H x A). + + + Both the incidence and the intensity of these deprivations are highly relevant pieces of information + for poverty measurement. The incidence of poverty is intuitive and understandable by anyone. + People always want to know how many poor people there are in a society as a proportion of the + whole population. Media tend to pick up on the incidence of poverty easily. Yet, the proportion + of poor people as the headline figure is not enough (Alkire, Oldiges and Kanagaratnam, 2021). + + + A headcount ratio is also estimated using two other poverty cutoffs. The global MPI identifies + individuals as vulnerable to poverty if they are close to the one-third threshold, that is, if they are + deprived in 20 to 33.32 percent of weighted indicators. The tables also apply a higher poverty + cutoff to identify those in severe poverty, meaning those deprived in 50 percent or more of the + dimensions. + + + The AF methodology has a property that makes the global MPI even more useful—dimensional + breakdown. This property makes it possible to consistently compute the percentage of the + population who are multidimensionally poor and simultaneously deprived in each indicator. This + is known as the censored headcount ratio of an indicator. The weighted sum of censored + headcount ratios of all MPI indicators is equal to the MPI value. + + + The censored headcount ratio shows the extent of deprivations among the poor but does not + reflect the weights or relative values of the indicators. 
+    Two indicators may have the same censored headcount ratios but different contributions to overall
+    poverty, because the contribution depends both on the censored headcount ratio and on the weight
+    assigned to each indicator. As such, a complementary analysis to the censored headcount ratio is
+    the percentage contribution of each indicator to overall multidimensional poverty.
+
+  # For description_short
+  description_short_multidimensional_poverty: Multidimensional poverty is defined as being deprived in a range of health, education and living standards indicators.
+  description_short_mpi: The Multidimensional Poverty Index (MPI) is a measure that combines the prevalence and the intensity of multidimensional poverty on a scale from 0 to 1. Higher values indicate higher poverty.
+  description_short_headcount_ratio: This is the share of the population that is multidimensionally poor.
+  description_short_intensity: The intensity is the share of indicators in which people in multidimensional poverty are deprived on average.
+  description_short_vulnerable: This is the share of the population that is close to being in multidimensional poverty.
+  description_short_severe: This is the share of the population that is in severe multidimensional poverty.
+  description_short_censored_headcount_ratio: This is the share of the multidimensionally poor population deprived in the indicator _<<indicator>>_.
+  description_short_uncensored_headcount_ratio: This is the share of the population deprived in the indicator _<<indicator>>_.
+  description_short_area: |-
+    <% if area == "Urban" %>
+    This indicator is calculated for urban areas.
+    <% elif area == "Rural" %>
+    This indicator is calculated for rural areas.
+    <% elif area == "Camp" %>
+    This indicator is calculated for populations within refugee camps (available only for Palestine).
+    <%- endif -%>
+
+  # For description_key
+  description_key_multidimensional_poverty_complement: |-
+    grouped into three dimensions of well-being: **health** (using two indicators: nutrition, child mortality), **education** (using two indicators: years of schooling, school attendance), and **living standards** (using six indicators: cooking fuel, sanitation, drinking water, electricity, housing, assets).
+  description_key_multidimensional_poverty: |-
+    Being in multidimensional poverty means that a person lives in a household deprived in a third or more of ten indicators, {definitions.description_key_multidimensional_poverty_complement}
+  description_key_vulnerable: |-
+    Being _vulnerable_ to multidimensional poverty means that a person lives in a household deprived in 20-33.3% of ten indicators, {definitions.description_key_multidimensional_poverty_complement}
+  description_key_severe: |-
+    Being in _severe_ multidimensional poverty means that a person lives in a household deprived in 50% or more of ten indicators, {definitions.description_key_multidimensional_poverty_complement}
+  description_key_assessments: |-
+    Each household is assessed against specific thresholds for these indicators. For example, a household is considered deprived in the _electricity_ indicator if it does not have access to it. [This article](https://ourworldindata.org/multidimensional-poverty-index) discusses specific thresholds in more detail.
+  description_key_weights: |-
+    Each indicator contributes to one of the three dimensions of well-being. Health and education indicators are weighted more (1/6 each) than living standards indicators (1/18 each) so that all three dimensions contribute equally to the overall measure.
+  description_key_mpi: |-
+    The Multidimensional Poverty Index (MPI) is calculated by multiplying two values: the [share of people who are multidimensionally poor](https://ourworldindata.org/grapher/share-multi-poverty) and the [intensity of their poverty](https://ourworldindata.org/grapher/intensity-of-multidimensional-poverty-national).
+  description_key_intensity: |-
+    The intensity of multidimensional poverty is calculated as the average share of indicators in which those counted as MPI poor are deprived.
+  description_key_flavor_cme: |-
+    This indicator is a current margin estimate (CME), meaning that it relies on the most recent survey data available for each country.
+  description_key_flavor_hot: |-
+    This indicator is a harmonized over time (HOT) estimate. This harmonization seeks to make two or more MPI estimates comparable by aligning the indicator definitions in each survey.
+  description_key_flavor: |-
+    <% if flavor == "Current margin estimate" %>
+    {definitions.description_key_flavor_cme}{definitions.description_key_flavor_link}
+    <% elif flavor == "Harmonized over time" %>
+    {definitions.description_key_flavor_hot}{definitions.description_key_flavor_link}
+    <%- endif -%>
+  description_key_flavor_link: |-
+    <% if flavor == "Current margin estimate" and area == "National" %>
+    Look for the [harmonized over time (HOT) estimate](<<hot_link>>) to see trends over time.
+    <% elif flavor == "Harmonized over time" and area == "National" %>
+    Look for the [current margin estimate (CME)](<<cme_link>>) to see the most recent survey data.
+    <%- endif -%>
+  description_key_flavor_no_links_for_national: |-
+    <% if flavor == "Current margin estimate" %>
+    {definitions.description_key_flavor_cme}
+    <% elif flavor == "Harmonized over time" %>
+    {definitions.description_key_flavor_hot}
+    <%- endif -%>
+  description_key_indicator_start: |-
+    A person in a household is deprived in the indicator _<<indicator>>_ if
+  description_key_indicator_end: |-
+    <% if indicator == "Nutrition" %>
+    any person under 70 years of age for whom there is nutritional information is undernourished. This indicator is part of the _health_ dimension.
+    <% elif indicator == "Child mortality" %>
+    a child under 18 has died in the household in the five-year period preceding the survey. This indicator is part of the _health_ dimension.
+    <% elif indicator == "Years of schooling" %>
+    no eligible household member has completed six years of schooling. This indicator is part of the _education_ dimension.
+    <% elif indicator == "School attendance" %>
+    any school-aged child is not attending school up to the age at which he/she would complete class 8. This indicator is part of the _education_ dimension.
+    <% elif indicator == "Cooking fuel" %>
+    a household cooks using solid fuel, such as dung, agricultural crop, shrubs, wood, charcoal, or coal. This indicator is part of the _living standards_ dimension.
+    <% elif indicator == "Sanitation" %>
+    the household has unimproved or no sanitation facility or it is improved but shared with other households. This indicator is part of the _living standards_ dimension.
+    <% elif indicator == "Drinking water" %>
+    the household’s source of drinking water is not safe or safe drinking water is a 30-minute or longer walk from home, roundtrip. This indicator is part of the _living standards_ dimension.
+    <% elif indicator == "Electricity" %>
+    the household has no electricity. This indicator is part of the _living standards_ dimension.
+    <% elif indicator == "Housing" %>
+    the household has inadequate housing materials in any of the three components: floor, roof, or walls. This indicator is part of the _living standards_ dimension.
+    <% elif indicator == "Assets" %>
+    the household does not own more than one of these assets: radio, TV, telephone, computer, animal cart, bicycle, motorbike, or refrigerator, and does not own a car or truck. This indicator is part of the _living standards_ dimension.
+    <%- endif -%>
+  description_key_indicator: |-
+    {definitions.description_key_indicator_start} {definitions.description_key_indicator_end}
+
+  # For title_variant
+  title_variant_area: |-
+    <% if area == "Urban" %>
+    , Urban area
+    <% elif area == "Rural" %>
+    , Rural area
+    <% elif area == "Camp" %>
+    , Refugee camps
+    <%- endif -%>
+
+  title_variant_flavor: |-
+    <% if flavor == "Current margin estimate" %>
+    Most recent year
+    <% elif flavor == "Harmonized over time" %>
+    Harmonized over time
+    <%- endif -%>
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+dataset:
+  update_period_days: 365
+
+tables:
+  multidimensional_poverty_index:
+    variables:
+      mpi:
+        title: Multidimensional Poverty Index (MPI) (<<flavor>>) - <<area>>
+        unit: ""
+        short_unit: ""
+        description_short: "{definitions.description_short_multidimensional_poverty} {definitions.description_short_mpi}{definitions.description_short_area}"
+        description_key:
+          - "{definitions.description_key_mpi}"
+          - "{definitions.description_key_multidimensional_poverty}"
+          - "{definitions.description_key_assessments}"
+          - "{definitions.description_key_weights}"
+          - "{definitions.description_key_intensity}"
+          - <% set cme_link = "https://ourworldindata.org/grapher/multidimensional-poverty-index-mpi" %> <% set hot_link = "https://ourworldindata.org/grapher/multidimensional-poverty-index-mpi-hot" %>{definitions.description_key_flavor}
+        presentation:
+          title_public: Multidimensional Poverty Index (MPI)
+          title_variant: "{definitions.title_variant_flavor}{definitions.title_variant_area}"
+        display:
+          name: Multidimensional Poverty Index (MPI)
+          numDecimalPlaces: 3
+          <<: *common_display
+
+      headcount_ratio:
+        title: Share of population living in multidimensional poverty (<<flavor>>) - <<area>>
+        unit: "%"
+        short_unit: "%"
+        description_short: "{definitions.description_short_multidimensional_poverty} {definitions.description_short_headcount_ratio}{definitions.description_short_area}"
+        description_key:
+          - "{definitions.description_key_multidimensional_poverty}"
+          - "{definitions.description_key_assessments}"
+          - "{definitions.description_key_weights}"
+          - <% set cme_link = "https://ourworldindata.org/grapher/share-multi-poverty" %> <% set hot_link = "https://ourworldindata.org/grapher/share-of-population-multidimensionally-poor-hot" %>{definitions.description_key_flavor}
+        presentation:
+          title_public: Share of population living in multidimensional poverty
+          title_variant: "{definitions.title_variant_flavor}{definitions.title_variant_area}"
+        display:
+          name: Share of population living in multidimensional poverty
+          numDecimalPlaces: 1
+          <<: *common_display
+
+      intensity:
+        title: Intensity of multidimensional poverty (<<flavor>>) - <<area>>
+        unit: "%"
+        short_unit: "%"
+        description_short: "{definitions.description_short_multidimensional_poverty} {definitions.description_short_intensity}{definitions.description_short_area}"
+        description_key:
+          - "{definitions.description_key_multidimensional_poverty}"
+          - "{definitions.description_key_assessments}"
+          - "{definitions.description_key_intensity}"
+          - "{definitions.description_key_weights}"
+          - <% set cme_link = "https://ourworldindata.org/grapher/intensity-of-multidimensional-poverty" %> <% set hot_link = "https://ourworldindata.org/grapher/intensity-of-multidimensional-poverty-hot" %>{definitions.description_key_flavor}
+        presentation:
+          title_public: Intensity of multidimensional poverty
+          title_variant: "{definitions.title_variant_flavor}{definitions.title_variant_area}"
+        display:
+          name: Intensity of multidimensional poverty
+          numDecimalPlaces: 1
+          <<: *common_display
+
+      vulnerable:
+        title: Share of population vulnerable to multidimensional poverty (<<flavor>>) - <<area>>
+        unit: "%"
+        short_unit: "%"
+        description_short: "{definitions.description_short_multidimensional_poverty} {definitions.description_short_vulnerable}{definitions.description_short_area}"
+        description_key:
+          - "{definitions.description_key_vulnerable}"
+          - "{definitions.description_key_assessments}"
+          - "{definitions.description_key_weights}"
+          - "{definitions.description_key_flavor_no_links_for_national}"
+        presentation:
+          title_public: Share of population vulnerable to multidimensional poverty
+          title_variant: "{definitions.title_variant_flavor}{definitions.title_variant_area}"
+        display:
+          name: Share of population vulnerable to multidimensional poverty
+          numDecimalPlaces: 1
+          <<: *common_display
+
+      severe:
+        title: Share of population living in severe multidimensional poverty (<<flavor>>) - <<area>>
+        unit: "%"
+        short_unit: "%"
+        description_short: "{definitions.description_short_multidimensional_poverty} {definitions.description_short_severe}{definitions.description_short_area}"
+        description_key:
+          - "{definitions.description_key_severe}"
+          - "{definitions.description_key_assessments}"
+          - "{definitions.description_key_weights}"
+          - "{definitions.description_key_flavor_no_links_for_national}"
+        presentation:
+          title_public: Share of population living in severe multidimensional poverty
+          title_variant: "{definitions.title_variant_flavor}{definitions.title_variant_area}"
+        display:
+          name: Share of population living in severe multidimensional poverty
+          numDecimalPlaces: 1
+          <<: *common_display
+
+      censored_headcount_ratio:
+        title: Share of population in multidimensional poverty deprived in the indicator <<indicator>> (<<flavor>>) - <<area>>
+        unit: "%"
+        short_unit: "%"
+        description_short: "{definitions.description_short_multidimensional_poverty} {definitions.description_short_censored_headcount_ratio}{definitions.description_short_area}"
+        description_key:
+          - "{definitions.description_key_multidimensional_poverty}"
+          - "{definitions.description_key_indicator}"
+          - "{definitions.description_key_flavor_no_links_for_national}"
+        presentation:
+          title_public: Share of population in multidimensional poverty deprived in the indicator <<indicator>>
+          title_variant: "{definitions.title_variant_flavor}{definitions.title_variant_area}"
+        display:
+          name: Share of population in multidimensional poverty deprived in the indicator <<indicator>>
+          numDecimalPlaces: 1
+          <<: *common_display
+
+      uncensored_headcount_ratio:
+        title: Share of population deprived in the indicator <<indicator>> (<<flavor>>) - <<area>>
+        unit: "%"
+        short_unit: "%"
+        description_short: "{definitions.description_short_multidimensional_poverty} {definitions.description_short_uncensored_headcount_ratio}{definitions.description_short_area}"
+        description_key:
+          - "{definitions.description_key_multidimensional_poverty}"
+          - "{definitions.description_key_indicator}"
+          - "{definitions.description_key_flavor_no_links_for_national}"
+        presentation:
+          title_public: Share of population deprived in the indicator <<indicator>>
+          title_variant: "{definitions.title_variant_flavor}{definitions.title_variant_area}"
+        display:
+          name: Share of population deprived in the indicator <<indicator>>
+          numDecimalPlaces: 1
+          <<: *common_display
diff --git a/etl/steps/data/garden/ophi/2024-10-28/multidimensional_poverty_index.py b/etl/steps/data/garden/ophi/2024-10-28/multidimensional_poverty_index.py
new file mode 100644
index 00000000000..10016506bed
--- /dev/null
+++ b/etl/steps/data/garden/ophi/2024-10-28/multidimensional_poverty_index.py
@@ -0,0 +1,149 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+import owid.catalog.processing as pr
+from owid.catalog import Table
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+MEASURE_NAMES = {
+    "A": "intensity",
+    "H": "headcount_ratio",
+    "M0": "mpi",
+    "hd": "uncensored_headcount_ratio",
+    "hdk": "censored_headcount_ratio",
+    "sev": "severe",
+    "vuln": "vulnerable",
+}
+
+# Define categories to keep in each column
+CATEGORIES_TO_KEEP = {
+    "loa": ["area", "nat"],
+    "measure": list(MEASURE_NAMES.keys()),
+}
+
+# Define indicator categories
+INDICATOR_NAMES = [
+    "Assets",
+    "Child mortality",
+    "Cooking fuel",
+    "Drinking water",
+    "Electricity",
+    "Housing",
+    "Nutrition",
+    "Sanitation",
+    "School attendance",
+    "Years of schooling",
+]
+
+# Define index columns for the final table
+INDEX_COLS = ["country", "year", "indicator", "area", "flavor"]
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("multidimensional_poverty_index")
+
+    # Read table from meadow dataset.
+    tb_hot = ds_meadow["hot"].reset_index()
+    tb_cme = ds_meadow["cme"].reset_index()
+
+    #
+    # Process data.
+    #
+    tb_hot = geo.harmonize_countries(
+        df=tb_hot,
+        countries_file=paths.country_mapping_path,
+        warn_on_unused_countries=False,
+    )
+    tb_cme = geo.harmonize_countries(
+        df=tb_cme,
+        countries_file=paths.country_mapping_path,
+        warn_on_unused_countries=False,
+    )
+
+    tb = make_tables_wide_and_merge(tb_cme=tb_cme, tb_hot=tb_hot)
+
+    tb = tb.format(keys=INDEX_COLS, short_name="multidimensional_poverty_index")
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
+
+
+def make_tables_wide_and_merge(tb_cme: Table, tb_hot: Table) -> Table:
+    """
+    Make tables wide to separate indicators, rename categories, and merge the HOT and CME tables.
+    """
+
+    tb_cme = select_categories_and_rename(tb_cme)
+    tb_hot = select_categories_and_rename(tb_hot)
+
+    # Make tables wide
+    tb_hot = tb_hot.pivot(
+        index=["country", "year", "indicator", "area"],
+        columns=["measure"],
+        values="b",
+        join_column_levels_with="_",
+    ).reset_index(drop=True)
+
+    tb_cme = tb_cme.pivot(
+        index=["country", "year", "indicator", "area"],
+        columns=["measure"],
+        values="b",
+        join_column_levels_with="_",
+    ).reset_index(drop=True)
+
+    # Add a flavor column to each table
+    tb_cme["flavor"] = "Current margin estimate"
+    tb_hot["flavor"] = "Harmonized over time"
+
+    # Concatenate the two tables
+    tb = pr.concat([tb_cme, tb_hot], ignore_index=True)
+
+    return tb
+
+
+def select_categories_and_rename(tb: Table) -> Table:
+    """
+    Select the categories to keep in each column and rename them.
+    """
+
+    for col, categories in CATEGORIES_TO_KEEP.items():
+        # Assert that all expected categories are present in the column
+        assert set(categories).issubset(
+            set(tb[col].unique())
+        ), f"Categories {set(categories) - set(tb[col].unique())} not in column {col}"
+
+        # Filter categories
+        tb = tb[tb[col].isin(categories)].reset_index(drop=True)
+
+    # Rename measure categories
+    tb["measure"] = tb["measure"].cat.rename_categories(MEASURE_NAMES)
+
+    # Check that the column ind_lab contains exactly the expected INDICATOR_NAMES
+    indicators_excluding_nan = tb[tb["ind_lab"].notna()]["ind_lab"].unique()
+    assert (
+        set(indicators_excluding_nan) == set(INDICATOR_NAMES)
+    ), f"Column ind_lab is not identical to the expected list. These are the differences: {set(INDICATOR_NAMES) - set(indicators_excluding_nan)}"
+
+    # Remove the original indicator code column (its labels are kept in ind_lab)
+    tb = tb.drop(columns=["indicator"])
+
+    # Rename ind_lab as indicator and area_lab as area
+    tb = tb.rename(columns={"ind_lab": "indicator", "area_lab": "area"})
+
+    return tb
diff --git a/etl/steps/data/garden/owid/latest/ig_countries.meta.yml b/etl/steps/data/garden/owid/latest/ig_countries.meta.yml
new file mode 100644
index 00000000000..499ee6580e1
--- /dev/null
+++ b/etl/steps/data/garden/owid/latest/ig_countries.meta.yml
@@ -0,0 +1,39 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  common:
+    presentation:
+      topic_tags:
+        - Uncategorized
+
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+dataset:
+  update_period_days: 365
+
+
+tables:
+  ig_countries:
+    variables:
+      count:
+        title: Country in post (flag)
+        unit: ""
+      share_post:
+        title: Country in post (share of post)
+        unit: "%"
+        short_unit: "%"
+        description_short: If there are 4 countries in a post, each country has a share of 25%.
+      counts_cum:
+        title: Total count of posts with country
+        unit: "posts"
+  summary:
+    variables:
+      proportion:
+        title: Share of posts mentioning a country
+        unit: "%"
+        short_unit: "%"
+        description_short: The proportion of posts that mention the country.
+      proportion_weighed:
+        title: Average mention of a country in a post
+        unit: ""
+        description_short: If there are two posts, and the first mentions country A along with one other country while the second does not mention it, this equals (50% + 0%) / 2 = 25%.
diff --git a/etl/steps/data/garden/owid/latest/ig_countries.py b/etl/steps/data/garden/owid/latest/ig_countries.py
new file mode 100644
index 00000000000..a3f6c86abdc
--- /dev/null
+++ b/etl/steps/data/garden/owid/latest/ig_countries.py
@@ -0,0 +1,83 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+from owid.catalog import Table
+
+from etl.data_helpers.misc import expand_time_column
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("ig_countries")
+
+    # Read table from meadow dataset.
+    tb = ds_meadow["ig_countries"].reset_index()
+
+    #
+    # Process data.
+    #
+    # Enforce column dtypes.
+    tb = tb.astype(
+        {
+            "country": "string",
+            "date": "datetime64[ns]",
+        }
+    )
+
+    tb = tb.sort_values("date")
+
+    # Compute each country's share of its post (1 / number of countries mentioned on that date).
+    x = tb.groupby("date").country.transform("nunique")
+    tb["share_post"] = 1 / x
+
+    ##################
+    # SUMMARY TABLE
+    # Share of posts mentioning each country (value_counts with normalize=True returns proportions).
+    tb_summary_a = tb["country"].value_counts(normalize=True).to_frame().reset_index()
+
+    num_posts = tb["date"].nunique()
+    tb_summary_b = tb.groupby("country", as_index=False)["share_post"].sum()
+    tb_summary_b["proportion_weighed"] = tb_summary_b["share_post"] / num_posts
+    tb_summary_b = tb_summary_b.drop(columns=["share_post"])
+
+    tb_summary = tb_summary_a.merge(tb_summary_b, on="country", how="outer")
+    tb_summary["year"] = 2024
+
+    tb_summary = Table(tb_summary)
+
+    # Expand time column
+    tb = expand_time_column(tb, time_col="date", dimension_col="country", method="observed", fillna_method="zero")
+
+    # Cumulative number of posts mentioning each country; `count` becomes a has-been-mentioned flag.
+    tb["counts_cum"] = tb.groupby("country")["count"].cumsum()
+    tb["count"] = (tb["counts_cum"] > 0).astype(int)
+
+    # Propagate metadata; convert summary shares to percentages.
+    tb["counts_cum"] = tb["counts_cum"].copy_metadata(tb["date"])
+    tb["count"] = tb["count"].copy_metadata(tb["date"])
+    tb_summary["proportion"] = 100 * tb_summary["proportion"].copy_metadata(tb["date"])
+    tb_summary["proportion_weighed"] = 100 * tb_summary["proportion_weighed"].copy_metadata(tb["date"])
+
+    tables = [
+        tb.format(["country", "date"]),
+        tb_summary.format(["country"], short_name="summary"),
+    ]
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_meadow.metadata
+    )
+
+    # Save changes in the new garden dataset.
+ ds_garden.save() diff --git a/etl/steps/data/garden/papers/2023-10-20/anthromes.py b/etl/steps/data/garden/papers/2023-10-20/anthromes.py index 94173117116..923d4a9dcad 100644 --- a/etl/steps/data/garden/papers/2023-10-20/anthromes.py +++ b/etl/steps/data/garden/papers/2023-10-20/anthromes.py @@ -259,6 +259,6 @@ def assign_land_use_types(tb: Table) -> Table: 70: "No land", } - tb["value"] = tb["value"].replace(land_use_dict) + tb["value"] = tb["value"].astype(object).replace(land_use_dict) return tb diff --git a/etl/steps/data/garden/regions/2023-01-01/regions.yml b/etl/steps/data/garden/regions/2023-01-01/regions.yml index d8f67a8f727..5d8d59f6d09 100644 --- a/etl/steps/data/garden/regions/2023-01-01/regions.yml +++ b/etl/steps/data/garden/regions/2023-01-01/regions.yml @@ -283,6 +283,7 @@ - "Bosnia Herzegovina" - "Bsnia.&Hrzgvna." - "Bosnia & Herzegovina" + short_name: "Bosnia and Herz." - code: "BWA" name: "Botswana" @@ -408,6 +409,7 @@ - "C\u00f4te d'Ivoire" - "C\u00f4te d\u2019Ivoire" - "Ivory Coast" + - "Cte dIvoire" - code: "HRV" name: "Croatia" @@ -1528,6 +1530,7 @@ - "Turks & Caicos" - "Turks and Caicos Islands (the)" - "Turk Island" + short_name: "Turks and Caicos" - code: "TUV" name: "Tuvalu" @@ -2417,3 +2420,361 @@ - "VUT" - "WLF" - "WSM" +# Subregions from UN geoscheme (defined by the UNSD) +# In this Taiwan and Kosovo are not recognized and part of China and Serbia respectively +- code: UNSD_NAF + name: "Northern Africa (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "DZA" + - "EGY" + - "LBY" + - "MAR" + - "SDN" + - "TUN" + - "ESH" +- code: UNSD_EAF + name: "Eastern Africa (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "IOT" + - "BDI" + - "COM" + - "DJI" + - "ERI" + - "ETH" + - "ATF" + - "KEN" + - "MDG" + - "MWI" + - "MUS" + - "MYT" + - "MOZ" + - "REU" + - "RWA" + - "SYC" + - "SOM" + - "SSD" + - "UGA" + - "TZA" + - "ZMB" + - "ZWE" +- code: UNSD_MAF + name: "Middle Africa (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "AGO" + - "CMR" + - "CAF" + - "TCD" + - "COG" + - "COD" + - "GNQ" + - "GAB" + - "STP" +- code: UNSD_SAF + name: "Southern Africa (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "BWA" + - "LSO" + - "NAM" + - "ZAF" + - "SWZ" +- code: UNSD_WAF + name: "Western Africa (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "BEN" + - "BFA" + - "CPV" + - "CIV" + - "GMB" + - "GHA" + - "GIN" + - "GNB" + - "LBR" + - "MLI" + - "MRT" + - "NER" + - "NGA" + - "SHN" + - "SEN" + - "SLE" + - "TGO" +- code: UNSD_CAR + name: "Caribbean (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "AIA" + - "ATG" + - "ABW" + - "BHS" + - "BRB" + - "BES" + - "CYM" + - "CUB" + - "CUW" + - "DMA" + - "DOM" + - "GRD" + - "GLP" + - "HTI" + - "JAM" + - "MTQ" + - "MSR" + - "PRI" + - "BLM" + - "KNA" + - "LCA" + - "MAF" + - "VCT" + - "SXM" + - "TTO" + - "TCA" + - "VGB" + - "VIR" +- code: UNSD_CAM + name: "Central America (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "BLZ" + - "CRI" + - "SLV" + - "GTM" + - "HND" + - "MEX" + - "NIC" + - "PAN" +- code: UNSD_SAM + name: "South America (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "ARG" + - "BOL" + - "BVT" + - "BRA" + - "CHL" + - "COL" + - "ECU" + - "FLK" + - "GUF" + - "GUY" + - "PRY" + - "PER" + - "SGS" + - "SUR" + - "URY" + - "VEN" +- code: UNSD_NAM + name: "Northern America (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "BMU" + - "CAN" + - "GRL" + - "SPM" + - "USA" +- 
code: UNSD_CAS + name: "Central Asia (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "KAZ" + - "KGZ" + - "TJK" + - "TKM" + - "UZB" +- code: UNSD_EAS + name: "Eastern Asia (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "CHN" + - "HKG" + - "JPN" + - "MAC" + - "MNG" + - "PRK" + - "KOR" +- code: UNSD_SEA + name: "South-eastern Asia (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "BRN" + - "KHM" + - "TLS" + - "IDN" + - "LAO" + - "MYS" + - "MMR" + - "PHL" + - "SGP" + - "THA" + - "VNM" +- code: UNSD_SAS + name: "Southern Asia (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "AFG" + - "BGD" + - "BTN" + - "IND" + - "IRN" + - "MDV" + - "NPL" + - "PAK" + - "LKA" +- code: UNSD_WAS + name: "Western Asia (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "ARM" + - "AZE" + - "BHR" + - "CYP" + - "GEO" + - "IRQ" + - "ISR" + - "JOR" + - "KWT" + - "LBN" + - "OMN" + - "QAT" + - "SAU" + - "SYR" + - "TUR" + - "ARE" + - "YEM" +- code: UNSD_EEU + name: "Eastern Europe (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "BLR" + - "BGR" + - "CZE" + - "HUN" + - "MDA" + - "POL" + - "ROU" + - "RUS" + - "SVK" + - "UKR" +- code: UNSD_NEU + name: "Northern Europe (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "ALA" + - "DNK" + - "EST" + - "FRO" + - "FIN" + - "GGY" + - "ISL" + - "IRL" + - "IMN" + - "JEY" + - "LVA" + - "LTU" + - "NOR" + - "SJM" + - "SWE" + - "GBR" +- code: UNSD_SEU + name: "Southern Europe (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "ALB" + - "AND" + - "BIH" + - "HRV" + - "GIB" + - "GRC" + - "ITA" + - "MLT" + - "MNE" + - "PRT" + - "SMR" + - "SRB" + - "SVN" + - "ESP" + - "MKD" + - "VAT" +- code: UNSD_WEU + name: "Western Europe (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "AUT" + - "BEL" + - "FRA" + - "DEU" + - "LIE" + - "LUX" + - "MCO" + - "NLD" + - "CHE" +- code: UNSD_AUS + name: "Australia and New Zealand (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "AUS" + - "CXR" + - "CCK" + - "HMD" + - "NZL" + - "NFK" +- code: UNSD_MEL + name: "Melanesia (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "FJI" + - "NCL" + - "PNG" + - "SLB" + - "VUT" +- code: UNSD_MIC + name: "Micronesia (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "GUM" + - "KIR" + - "MHL" + - "FSM" + - "NRU" + - "MNP" + - "PLW" + - "UMI" +- code: UNSD_POL + name: "Polynesia (UNSD)" + region_type: "aggregate" + defined_by: unsd + members: + - "ASM" + - "COK" + - "PYF" + - "NIU" + - "PCN" + - "WSM" + - "TKL" + - "TON" + - "TUV" + - "WLF" diff --git a/etl/steps/data/garden/state_capacity/2023-10-19/state_capacity_dataset.py b/etl/steps/data/garden/state_capacity/2023-10-19/state_capacity_dataset.py index b22e514fbf2..63782846d2e 100644 --- a/etl/steps/data/garden/state_capacity/2023-10-19/state_capacity_dataset.py +++ b/etl/steps/data/garden/state_capacity/2023-10-19/state_capacity_dataset.py @@ -19,7 +19,7 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset("state_capacity_dataset") # Read table from meadow dataset. - tb = ds_meadow["state_capacity_dataset"].reset_index() + tb = ds_meadow.read("state_capacity_dataset") # # Process data. @@ -64,7 +64,7 @@ def run(dest_dir: str) -> None: def regional_aggregations(tb: Table) -> Table: # Load population data. 
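The UNSD aggregates above are plain member lists keyed by ISO alpha-3 codes, so building an aggregate from country-level data reduces to a filter-and-sum. A minimal sketch (the population numbers are illustrative, not real figures):

```python
import pandas as pd

# Hypothetical country-level data keyed by ISO alpha-3 code.
df = pd.DataFrame(
    {
        "code": ["KAZ", "KGZ", "TJK", "TKM", "UZB"],
        "year": [2020] * 5,
        "population": [19, 7, 10, 6, 34],  # millions, illustrative only
    }
)

# Members of "Central Asia (UNSD)" exactly as listed in regions.yml above.
unsd_cas = ["KAZ", "KGZ", "TJK", "TKM", "UZB"]

# Aggregate = filter to members, then sum per year.
agg = df[df["code"].isin(unsd_cas)].groupby("year", as_index=False)["population"].sum()
agg["code"] = "UNSD_CAS"
print(agg)
```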
tb_pop = paths.load_dataset("population") - tb_pop = tb_pop["population"].reset_index() + tb_pop = tb_pop.read("population") tb_regions = tb.copy() diff --git a/etl/steps/data/garden/tourism/2024-08-17/unwto_gdp.py b/etl/steps/data/garden/tourism/2024-08-17/unwto_gdp.py index ad105917962..b8b3c7acd54 100644 --- a/etl/steps/data/garden/tourism/2024-08-17/unwto_gdp.py +++ b/etl/steps/data/garden/tourism/2024-08-17/unwto_gdp.py @@ -15,7 +15,7 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset("unwto_gdp") # Read table from meadow dataset. - tb = ds_meadow.read_table("unwto_gdp") + tb = ds_meadow.read("unwto_gdp") # # Process data. # diff --git a/etl/steps/data/garden/tuberculosis/2023-11-27/budget.py b/etl/steps/data/garden/tuberculosis/2023-11-27/budget.py index e659e4aa281..4d59c645e43 100644 --- a/etl/steps/data/garden/tuberculosis/2023-11-27/budget.py +++ b/etl/steps/data/garden/tuberculosis/2023-11-27/budget.py @@ -37,7 +37,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("data_dictionary.csv") # Read table from meadow dataset. tb = ds_meadow["budget"].reset_index() - dd = snap.read() + dd = snap.read(safe_types=False) # Process data. # tb = add_variable_description_from_producer(tb, dd) diff --git a/etl/steps/data/garden/tuberculosis/2023-11-27/burden_estimates.py b/etl/steps/data/garden/tuberculosis/2023-11-27/burden_estimates.py index ce16712b73e..bfa59809c17 100644 --- a/etl/steps/data/garden/tuberculosis/2023-11-27/burden_estimates.py +++ b/etl/steps/data/garden/tuberculosis/2023-11-27/burden_estimates.py @@ -41,7 +41,7 @@ def run(dest_dir: str) -> None: ds_population = paths.load_dataset("population") # Load data dictionary from snapshot. - dd = snap.read() + dd = snap.read(safe_types=False) # Read table from meadow dataset. tb = ds_meadow["burden_estimates"].reset_index() tb = tb.drop(columns=["iso2", "iso3", "iso_numeric", "g_whoregion"]) diff --git a/etl/steps/data/garden/tuberculosis/2023-11-27/drug_resistance_surveillance.py b/etl/steps/data/garden/tuberculosis/2023-11-27/drug_resistance_surveillance.py index 224734858be..8a50d450ee0 100644 --- a/etl/steps/data/garden/tuberculosis/2023-11-27/drug_resistance_surveillance.py +++ b/etl/steps/data/garden/tuberculosis/2023-11-27/drug_resistance_surveillance.py @@ -32,7 +32,7 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset("drug_resistance_surveillance") snap = paths.load_snapshot("data_dictionary.csv") # Load data dictionary from snapshot. - dd = snap.read() + dd = snap.read(safe_types=False) # Load regions dataset. ds_regions = paths.load_dataset("regions") # Load income groups dataset. diff --git a/etl/steps/data/garden/tuberculosis/2023-11-27/expenditure.py b/etl/steps/data/garden/tuberculosis/2023-11-27/expenditure.py index 00a68e1cae1..1ff0c4fdc68 100644 --- a/etl/steps/data/garden/tuberculosis/2023-11-27/expenditure.py +++ b/etl/steps/data/garden/tuberculosis/2023-11-27/expenditure.py @@ -20,7 +20,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("data_dictionary.csv") # Read table from meadow dataset. tb = ds_meadow["expenditure"].reset_index() - dd = snap.read() + dd = snap.read(safe_types=False) # # Process data. 
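Several hunks above switch snapshot and table reads to `read(..., safe_types=False)`. Judging by the name alone (an assumption; this is not taken from the owid.catalog docs), `safe_types` controls whether columns are converted to pandas nullable dtypes on read, and disabling it is presumably a performance or compatibility escape hatch. The dtype difference such a flag would toggle looks like this in plain pandas:

```python
import pandas as pd

df = pd.DataFrame({"year": [2000, 2001, None], "cases": [10.0, 12.0, None]})

# Plain numpy dtypes: a missing value forces integers into float64 with NaN.
print(df.dtypes)                   # year: float64, cases: float64

# Nullable ("safe") dtypes keep integer columns as Int64 with pd.NA instead.
print(df.convert_dtypes().dtypes)  # year: Int64, cases: Int64
```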
# diff --git a/etl/steps/data/garden/tuberculosis/2023-11-27/laboratories.py b/etl/steps/data/garden/tuberculosis/2023-11-27/laboratories.py index 727eb40070a..5a7f6f3037d 100644 --- a/etl/steps/data/garden/tuberculosis/2023-11-27/laboratories.py +++ b/etl/steps/data/garden/tuberculosis/2023-11-27/laboratories.py @@ -38,7 +38,7 @@ def run(dest_dir: str) -> None: ds_pop = ds_un_wpp["population"].reset_index() # Load data dictionary from snapshot. - dd = snap.read() + dd = snap.read(safe_types=False) # Read table from meadow dataset. tb = ds_meadow["laboratories"].reset_index() diff --git a/etl/steps/data/garden/tuberculosis/2023-11-27/notifications.py b/etl/steps/data/garden/tuberculosis/2023-11-27/notifications.py index 9305c0da09e..cb01a28182b 100644 --- a/etl/steps/data/garden/tuberculosis/2023-11-27/notifications.py +++ b/etl/steps/data/garden/tuberculosis/2023-11-27/notifications.py @@ -36,7 +36,7 @@ def run(dest_dir: str) -> None: # Load income groups dataset. ds_income_groups = paths.load_dataset("income_groups") # Load data dictionary from snapshot. - dd = snap.read() + dd = snap.read(safe_types=False) # Read table from meadow dataset. tb = ds_meadow["notifications"].reset_index() # diff --git a/etl/steps/data/garden/tuberculosis/2023-11-27/outcomes.py b/etl/steps/data/garden/tuberculosis/2023-11-27/outcomes.py index 6e3c8e1a679..5c4ec33298a 100644 --- a/etl/steps/data/garden/tuberculosis/2023-11-27/outcomes.py +++ b/etl/steps/data/garden/tuberculosis/2023-11-27/outcomes.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset("outcomes") snap = paths.load_snapshot("data_dictionary.csv") # Load data dictionary from snapshot. - dd = snap.read() + dd = snap.read(safe_types=False) # Read table from meadow dataset. tb = ds_meadow["outcomes"].reset_index() diff --git a/etl/steps/data/garden/tuberculosis/2023-11-27/unhlm_commitments.py b/etl/steps/data/garden/tuberculosis/2023-11-27/unhlm_commitments.py index ae7a4784f82..020b033f783 100644 --- a/etl/steps/data/garden/tuberculosis/2023-11-27/unhlm_commitments.py +++ b/etl/steps/data/garden/tuberculosis/2023-11-27/unhlm_commitments.py @@ -17,7 +17,7 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset("unhlm_commitments") snap = paths.load_snapshot("data_dictionary.csv") # Load data dictionary from snapshot. - dd = snap.read() + dd = snap.read(safe_types=False) # Read table from meadow dataset. tb = ds_meadow["unhlm_commitments"].reset_index() diff --git a/etl/steps/data/garden/un/2023-10-02/un_wpp_lt.py b/etl/steps/data/garden/un/2023-10-02/un_wpp_lt.py index ea755555498..a739669d0aa 100644 --- a/etl/steps/data/garden/un/2023-10-02/un_wpp_lt.py +++ b/etl/steps/data/garden/un/2023-10-02/un_wpp_lt.py @@ -61,8 +61,15 @@ def run(dest_dir: str) -> None: # Rename columns, select columns tb = tb.rename(columns=COLUMNS_RENAME) + # DTypes + tb = tb.astype( + { + "age": str, + } + ) + # Change 100 -> 100+ - tb.loc[tb["age"] == 100, "age"] = "100+" + tb.loc[tb["age"] == "100", "age"] = "100+" # Scale central death rates paths.log.info("scale indicators to make them more.") @@ -78,13 +85,6 @@ def run(dest_dir: str) -> None: tb["sex"] = tb["sex"].map({"Total": "both", "Male": "male", "Female": "female"}) assert tb["sex"].notna().all(), "NaNs detected after mapping sex values!" 
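The un_wpp_lt hunk above moves the `astype({"age": str})` cast ahead of the `100 -> "100+"` relabeling and changes the comparison from the integer `100` to the string `"100"`. The order matters because an elementwise comparison against the wrong dtype silently matches nothing:

```python
import pandas as pd

tb = pd.DataFrame({"age": [99, 100]})

# Comparing an integer column against the string "100" matches no rows.
tb.loc[tb["age"] == "100", "age"] = "100+"
print(tb["age"].tolist())  # [99, 100] -- unchanged, and no error is raised

# Cast first; then the string comparison (and the relabeling) works.
tb["age"] = tb["age"].astype(str)
tb.loc[tb["age"] == "100", "age"] = "100+"
print(tb["age"].tolist())  # ['99', '100+']
```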
- # DTypes - tb = tb.astype( - { - "age": str, - } - ) - # Set index tb = tb.set_index(COLUMNS_INDEX, verify_integrity=True)[COLUMNS_INDICATORS] diff --git a/etl/steps/data/garden/un/2023-10-30/un_members.py b/etl/steps/data/garden/un/2023-10-30/un_members.py index 11c10de312e..315dc4f8e8f 100644 --- a/etl/steps/data/garden/un/2023-10-30/un_members.py +++ b/etl/steps/data/garden/un/2023-10-30/un_members.py @@ -24,7 +24,7 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset("un_members") # Read table from meadow dataset. - tb = ds_meadow["un_members"].reset_index() + tb = ds_meadow.read("un_members") # # Process data. @@ -66,7 +66,7 @@ def data_processing(tb: Table) -> Table: # Create membership_status column, which is "Member" when year is greater or equal to admission year, and "Not a member" otherwise. # I copy the admission column first to keep metadata - tb["membership_status"] = tb["admission"].copy() + tb["membership_status"] = tb["admission"].copy().astype(object) tb.loc[tb["year"] < tb["admission"], "membership_status"] = "Not a member" tb.loc[tb["year"] >= tb["admission"], "membership_status"] = "Member" diff --git a/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_300k.py b/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_300k.py index 92d6d4886a3..1b93e5e9c5a 100644 --- a/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_300k.py +++ b/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_300k.py @@ -50,15 +50,15 @@ def run(dest_dir: str) -> None: } ) - # Create two new dataframes to separate data into estimates and projections (pre-2019 and post-2019) - past_estimates = tb_average[tb_average["year"] < 2019].copy() - future_projections = tb_average[tb_average["year"] >= 2019].copy() + # Create two new dataframes to separate data into estimates and projections + past_estimates = tb_average[tb_average["year"] <= 2015].copy() + future_projections = tb_average[tb_average["year"] >= 2015].copy() # Now, for each column in the original dataframe, split it into two for col in tb_average.columns: if col not in ["country", "year"]: - past_estimates[f"{col}_estimates"] = tb_average.loc[tb_average["year"] < 2019, col] - future_projections[f"{col}_projections"] = tb_average.loc[tb_average["year"] >= 2019, col] + past_estimates[f"{col}_estimates"] = tb_average.loc[tb_average["year"] <= 2015, col] + future_projections[f"{col}_projections"] = tb_average.loc[tb_average["year"] >= 2015, col] past_estimates = past_estimates.drop(columns=[col]) future_projections = future_projections.drop(columns=[col]) diff --git a/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_size_class.py b/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_size_class.py index 6aa4daf3d05..e44531c7a8a 100644 --- a/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_size_class.py +++ b/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_size_class.py @@ -38,15 +38,16 @@ def run(dest_dir: str) -> None: tb_pivot[col] = tb_pivot[col] * 1000 tb_pivot = tb_pivot.reset_index() - # Create two new dataframes to separate data into estimates and projections (pre-2019 and post-2019) - past_estimates = tb_pivot[tb_pivot["year"] < 2019].copy() - future_projections = tb_pivot[tb_pivot["year"] >= 2019].copy() + # Create two new dataframes to separate data into estimates and projections + + past_estimates = tb_pivot[tb_pivot["year"] <= 2015].copy() + future_projections = tb_pivot[tb_pivot["year"] >= 2015].copy() # Now, for each column in the original dataframe, split it into two 
(projections and estimates) for col in tb_pivot.columns: if col not in ["country", "year"]: - past_estimates[f"{col}_estimates"] = tb_pivot.loc[tb_pivot["year"] < 2019, col] - future_projections[f"{col}_projections"] = tb_pivot.loc[tb_pivot["year"] >= 2019, col] + past_estimates[f"{col}_estimates"] = tb_pivot.loc[tb_pivot["year"] <= 2015, col] + future_projections[f"{col}_projections"] = tb_pivot.loc[tb_pivot["year"] >= 2015, col] past_estimates = past_estimates.drop(columns=[col]) future_projections = future_projections.drop(columns=[col]) diff --git a/etl/steps/data/garden/un/2024-01-17/urbanization_urban_rural.py b/etl/steps/data/garden/un/2024-01-17/urbanization_urban_rural.py index c22b5c9aef1..9189fee2081 100644 --- a/etl/steps/data/garden/un/2024-01-17/urbanization_urban_rural.py +++ b/etl/steps/data/garden/un/2024-01-17/urbanization_urban_rural.py @@ -48,15 +48,15 @@ def run(dest_dir: str) -> None: # Remove 'thousands' from column name tb.rename(columns={col: col.replace("__thousands", "")}, inplace=True) - # Create two new dataframes to separate data into estimates and projections (pre-2019 and post-2019) - past_estimates = tb[tb["year"] < 2019].copy() - future_projections = tb[tb["year"] >= 2019].copy() + # Create two new dataframes to separate data into estimates and projections (pre-2015 and post-2015) + past_estimates = tb[tb["year"] <= 2015].copy() + future_projections = tb[tb["year"] >= 2015].copy() # Now, for each column in the original dataframe, split it into two (projections and estimates) for col in tb.columns: if col not in ["country", "year"]: - past_estimates[f"{col}_estimates"] = tb.loc[tb["year"] < 2019, col] - future_projections[f"{col}_projections"] = tb.loc[tb["year"] >= 2019, col] + past_estimates[f"{col}_estimates"] = tb.loc[tb["year"] <= 2015, col] + future_projections[f"{col}_projections"] = tb.loc[tb["year"] >= 2015, col] past_estimates = past_estimates.drop(columns=[col]) future_projections = future_projections.drop(columns=[col]) diff --git a/etl/steps/data/garden/un/2024-03-14/un_wpp_most.meta.yml b/etl/steps/data/garden/un/2024-03-14/un_wpp_most.meta.yml index c1e97dd6da2..c7793d9cbb2 100644 --- a/etl/steps/data/garden/un/2024-03-14/un_wpp_most.meta.yml +++ b/etl/steps/data/garden/un/2024-03-14/un_wpp_most.meta.yml @@ -1,13 +1,21 @@ +definitions: + common: + presentation: + grapher_config: + subtitle: "" + originUrl: "https://ourworldindata.org/population-growth" + note: "" + tables: population_5_year_age_groups: variables: - age: + age_group_five: title: Five year age-group with the highest population unit: "" description_short: |- Five-year age group with the highest population. type: ordinal - sort: + sort: # May need additional groups when data is updated - 0-4 - 5-9 - 10-14 @@ -23,22 +31,18 @@ tables: - 60-64 - 65-69 - 70-74 - value: - title: Population of the most populous five-year age group - unit: "people" - description_short: |- - Population of the most populous five-year age-group. - display: - numDecimalPlaces: 0 + - 75-79 + presentation: + title_public: Five year age-group with the highest population population_10_year_age_groups: variables: - age_group: + age_group_ten: title: Ten year age-group with the highest population unit: "" description_short: |- Ten-year age group with the highest population. 
type: ordinal - sort: + sort: # May need additional groups when data is updated - 0-9 - 10-19 - 20-29 - 30-39 - 40-49 - 50-59 - 60-69 - value: - title: Population of the most populous ten year age group - unit: "people" - description_short: |- - Population of the most populous ten-year age-group. - display: - numDecimalPlaces: 0 + - 70-79 + presentation: + title_public: Ten year age-group with the highest population diff --git a/etl/steps/data/garden/un/2024-03-14/un_wpp_most.py b/etl/steps/data/garden/un/2024-03-14/un_wpp_most.py index c067d01d25e..0b0220c589f 100644 --- a/etl/steps/data/garden/un/2024-03-14/un_wpp_most.py +++ b/etl/steps/data/garden/un/2024-03-14/un_wpp_most.py @@ -1,10 +1,14 @@ +from typing import Any + from owid.catalog import Table from owid.catalog import processing as pr +from structlog import get_logger from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. paths = PathFinder(__file__) +log = get_logger() def run(dest_dir: str) -> None: @@ -12,27 +16,31 @@ def run(dest_dir: str) -> None: # # Load inputs. ds_garden = paths.load_dataset("un_wpp") tb_pop = ds_garden["population"].reset_index() - + origins = tb_pop["population"].metadata.origins[0] age_group_size = [5, 10] tb_list = [] tb_pop_filter = Table() for age_group in age_group_size: + log.info(f"Creating population table for {age_group} year age groups") # filter data for just sex = all, metrics = population, variant = estimates if age_group == 5: - tb_pop_filter = create_five_year_age_groups(tb_pop) + tb_pop_filter = create_five_year_age_groups(tb_pop, origins) if age_group == 10: - tb_pop_filter = create_ten_year_age_groups(tb_pop) + tb_pop_filter = create_ten_year_age_groups(tb_pop, origins) # Group by country and year, and apply the custom function - tb_pop_filter = tb_pop_filter.groupby(["location", "year"], observed=False).apply(get_largest_age_group) - # The function above creates NAs for some locations that don't appear to be in the table e.g. Vatican, Melanesia, so dropping here - tb_pop_filter = tb_pop_filter.dropna() - tb_pop_filter = tb_pop_filter.reset_index(drop=True) - tb_pop_filter = tb_pop_filter.set_index(["location", "year"], verify_integrity=True) - tb_pop_filter = tb_pop_filter.copy_metadata(tb_pop) + tb_pop_filter = ( + tb_pop_filter.groupby(["country", "year"], group_keys=False) + .apply(get_largest_age_group) + .reset_index(drop=True) # Reset index to have a clean DataFrame + ) + # The function above creates NAs for some countries that don't appear to be in the table e.g. Vatican, Melanesia, so dropping here + + tb_pop_filter = tb_pop_filter.drop(columns=["population"]) + tb_pop_filter = tb_pop_filter.set_index(["country", "year"], verify_integrity=True) tb_pop_filter.metadata.short_name = f"population_{age_group}_year_age_groups" tb_list.append(tb_pop_filter) # Save outputs. - # + # Create a new garden dataset with the same metadata as the meadow dataset.
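For reference, the `groupby(...).apply(get_largest_age_group)` pattern used above boils down to an `idxmax` row-selection per (country, year) group. A self-contained sketch with toy data:

```python
import pandas as pd

tb = pd.DataFrame(
    {
        "country": ["France"] * 4,
        "year": [2020, 2020, 2021, 2021],
        "age_group_five": ["0-4", "5-9", "0-4", "5-9"],
        "population": [3.5, 3.9, 3.4, 4.0],  # millions, illustrative only
    }
)

def get_largest_age_group(group):
    # Keep the row whose population is largest within the group.
    return group.loc[group["population"].idxmax()]

largest = tb.groupby(["country", "year"], group_keys=False).apply(get_largest_age_group)
print(largest[["year", "age_group_five"]])  # 2020 -> 5-9, 2021 -> 5-9
```

An equivalent vectorized form is `tb.loc[tb.groupby(["country", "year"])["population"].idxmax()]`, which avoids the Python-level apply.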
ds_garden = create_dataset(dest_dir, tables=tb_list, default_metadata=ds_garden.metadata) @@ -40,7 +48,7 @@ def run(dest_dir: str) -> None: ds_garden.save() -def create_ten_year_age_groups(tb: Table) -> Table: +def create_ten_year_age_groups(tb: Table, origins: Any) -> Table: # Initialize an empty list to hold the age bands age_bands = [] # Loop through a range with a step of 5, stopping before 100 @@ -49,28 +57,29 @@ def create_ten_year_age_groups(tb: Table) -> Table: # Add the "100+" group at the end and 0-4 and 5-9 as 0-9 is not a group in the dataset age_bands = age_bands + ["100+", "0-4", "5-9", "10-14", "15-19"] # Filter the table to only include the age bands we want - tb = tb[(tb.sex == "all") & (tb.metric == "population") & (tb.variant == "estimates") & (tb.age.isin(age_bands))] + tb = tb[(tb.sex == "all") & (tb.variant == "estimates") & (tb.age.isin(age_bands))] assert tb["age"].nunique() == len(age_bands), "Age groups are not as expected" - tb = tb.drop(columns=["metric", "sex", "variant"]) + tb = tb.drop(columns=["sex", "variant", "population_change", "population_density"]) # Create the 0-9 and 10-19 age groups tb_0_9 = tb[(tb.age == "0-4") | (tb.age == "5-9")] - tb_0_9 = tb_0_9.groupby(["location", "year"], observed=False)["value"].sum().reset_index() + tb_0_9 = tb_0_9.groupby(["country", "year"], observed=False)["population"].sum().reset_index() tb_0_9["age"] = "0-9" tb_10_19 = tb[(tb.age == "10-14") | (tb.age == "15-19")] - tb_10_19 = tb_10_19.groupby(["location", "year"], observed=False)["value"].sum().reset_index() + tb_10_19 = tb_10_19.groupby(["country", "year"], observed=False)["population"].sum().reset_index() tb_10_19["age"] = "10-19" # Drop the 0-4, 5-9, 10-14 and 15-19 age groups tb = tb[(tb.age != "0-4") & (tb.age != "5-9") & (tb.age != "10-14") & (tb.age != "15-19")] # Concatenate the 0-9 and 10-19 age groups with the original table tb = pr.concat([tb, tb_0_9, tb_10_19]) - tb = tb.rename(columns={"age": "age_group"}) + tb = tb.rename(columns={"age": "age_group_ten"}) + tb["age_group_ten"].metadata.origins = [origins] tb = tb.reset_index(drop=True) return tb -def create_five_year_age_groups(tb: Table) -> Table: +def create_five_year_age_groups(tb: Table, origins: Any) -> Table: # Initialize an empty list to hold the age bands age_bands = [] # Loop through a range with a step of 5, stopping before 100 @@ -79,13 +88,15 @@ def create_five_year_age_groups(tb: Table) -> Table: # Add the "100+" group at the end age_bands.append("100+") # Filter the table to only include the age bands we want - tb = tb[(tb.sex == "all") & (tb.metric == "population") & (tb.variant == "estimates") & (tb.age.isin(age_bands))] + tb = tb[(tb.sex == "all") & (tb.variant == "estimates") & (tb.age.isin(age_bands))] assert tb["age"].nunique() == len(age_bands), "Age groups are not as expected" - tb = tb.drop(columns=["metric", "sex", "variant"]) + tb = tb.drop(columns=["sex", "variant", "population_change", "population_density"]) + tb = tb.rename(columns={"age": "age_group_five"}) + tb["age_group_five"].metadata.origins = [origins] tb = tb.reset_index(drop=True) return tb # Function to apply to each group to find the age group with the largest population def get_largest_age_group(group): - return group.loc[group["value"].idxmax()] + return group.loc[group["population"].idxmax()] diff --git a/etl/steps/data/garden/un/2024-04-09/undp_hdr.meta.yml b/etl/steps/data/garden/un/2024-04-09/undp_hdr.meta.yml index f602b7cf225..cf2ee4f0edf 100644 --- a/etl/steps/data/garden/un/2024-04-09/undp_hdr.meta.yml 
+++ b/etl/steps/data/garden/un/2024-04-09/undp_hdr.meta.yml @@ -1,8 +1,6 @@ definitions: common: processing_level: minor - description_processing: |- - - We calculated averages over continents and income groups by taking the population-weighted average of the countries in each group. If less than 80% of countries in an area report data for a given year, we do not calculate the average for that area. presentation: topic_tags: - Human Development Index (HDI) diff --git a/etl/steps/data/garden/un/2024-04-09/undp_hdr.py b/etl/steps/data/garden/un/2024-04-09/undp_hdr.py index a3c0c0aaf38..d4a2fff7b1d 100644 --- a/etl/steps/data/garden/un/2024-04-09/undp_hdr.py +++ b/etl/steps/data/garden/un/2024-04-09/undp_hdr.py @@ -73,17 +73,26 @@ def region_avg(tb, ds_regions, ds_income_groups): """Calculate regional averages for the table, this includes continents and WB income groups""" # remove columns where regional average does not make sense tb_cols = tb.columns - ind_wo_avg = ["country", "year", "gii_rank", "hdi_rank", "loss", "rankdiff_hdi_phdi"] + ind_wo_avg = ["country", "year", "gii_rank", "hdi_rank", "loss", "rankdiff_hdi_phdi", "gdi_group"] rel_cols = [col for col in tb.columns if col not in ind_wo_avg] # calculate population weighted columns (helper columns) + rel_cols_pop = [] for col in rel_cols: tb[col + "_pop"] = tb[col] * tb["pop_total"] + rel_cols_pop.append(col + "_pop") + + # Define aggregations only for the columns I need + aggregations = dict.fromkeys( + rel_cols + rel_cols_pop, + "sum", + ) tb = geo.add_regions_to_table( tb, ds_regions=ds_regions, ds_income_groups=ds_income_groups, + aggregations=aggregations, frac_allowed_nans_per_year=0.2, ) @@ -91,4 +100,10 @@ def region_avg(tb, ds_regions, ds_income_groups): for col in rel_cols: tb[col] = tb[col + "_pop"] / tb["pop_total"] + # Add description_processing only to rel_cols + for col in rel_cols: + tb[ + col + ].m.description_processing = "We calculated averages over continents and income groups by taking the population-weighted average of the countries in each group. If less than 80% of countries in an area report data for a given year, we do not calculate the average for that area." + return tb[tb_cols] diff --git a/etl/steps/data/garden/un/2024-07-08/maternal_mortality.meta.yml b/etl/steps/data/garden/un/2024-07-08/maternal_mortality.meta.yml index aa821713ec2..dae05bc1a66 100644 --- a/etl/steps/data/garden/un/2024-07-08/maternal_mortality.meta.yml +++ b/etl/steps/data/garden/un/2024-07-08/maternal_mortality.meta.yml @@ -60,8 +60,10 @@ tables: short_unit: "" display: numDecimalPlaces: 1 + entityAnnotationsMap: 'United States: Values from 2003–2017 affected by measurement change' description_from_producer: |- {definitions.description_methodology} + mm_rate: title: Estimated maternal mortality rate description_short: |- @@ -70,6 +72,8 @@ tables: short_unit: "" display: numDecimalPlaces: 1 + entityAnnotationsMap: 'United States: Values from 2003–2017 affected by measurement change' + description_from_producer: |- {definitions.description_methodology} description_processing: The data is originally given in deaths per person-years for women of reproductive age. To make the figures comparable with other sources, we multiply it by 100,000 to get deaths per 100,000 person-years (corresponding roughly to 100,000 women of reproductive age). 
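Stepping back to the `region_avg` change above: it computes population-weighted regional averages through helper `*_pop` columns that are summed by `add_regions_to_table` and then divided back out. The arithmetic, reduced to toy numbers:

```python
import pandas as pd

tb = pd.DataFrame(
    {
        "country": ["A", "B"],
        "year": [2022, 2022],
        "hdi": [0.90, 0.60],
        "pop_total": [10, 40],  # illustrative populations
    }
)

# Helper column: indicator value weighted by population...
tb["hdi_pop"] = tb["hdi"] * tb["pop_total"]

# ...so that summing members and dividing by total population gives the weighted mean.
region = tb.groupby("year", as_index=False)[["hdi_pop", "pop_total"]].sum()
region["hdi"] = region["hdi_pop"] / region["pop_total"]
print(region["hdi"].iloc[0])  # (0.9*10 + 0.6*40) / 50 = 0.66
```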
@@ -143,6 +147,7 @@ tables: short_unit: "" display: numDecimalPlaces: 0 + entityAnnotationsMap: 'United States: Values from 2003–2017 affected by measurement change' description_from_producer: |- {definitions.description_methodology} diff --git a/etl/steps/data/garden/un/2024-07-11/un_wpp.meta.yml b/etl/steps/data/garden/un/2024-07-11/un_wpp.meta.yml index 8097420beb4..1344ff48672 100644 --- a/etl/steps/data/garden/un/2024-07-11/un_wpp.meta.yml +++ b/etl/steps/data/garden/un/2024-07-11/un_wpp.meta.yml @@ -1,7 +1,7 @@ definitions: global: projections: - <%- if variant != 'estimates' -%> + <%- if (variant is defined) and (variant != 'estimates') -%> Projections from 2024 onwards are based on the UN's << variant >> scenario. <%- endif -%> dimensions: @@ -39,6 +39,7 @@ definitions: {definitions.global.projections} description_short_births: |- + <%- if sex is defined -%> <%- if not (sex == 'all' and age == 'all') -%> This only includes <%- if sex != 'all' -%> @@ -48,6 +49,7 @@ definitions: mothers aged << age >> <%- endif -%>. <%- endif -%> + <%- endif -%> {definitions.global.projections} diff --git a/etl/steps/data/garden/un/2024-07-12/un_wpp.meta.yml b/etl/steps/data/garden/un/2024-07-12/un_wpp.meta.yml index 0300e8bf523..3d59ea4fcb8 100644 --- a/etl/steps/data/garden/un/2024-07-12/un_wpp.meta.yml +++ b/etl/steps/data/garden/un/2024-07-12/un_wpp.meta.yml @@ -1,7 +1,7 @@ definitions: global: projections: - <%- if variant != 'estimates' -%> + <%- if (variant is defined) and (variant != 'estimates') -%> Projections from 2024 onwards are based on the UN's << variant >> scenario. <%- endif -%> dimensions: @@ -43,6 +43,7 @@ definitions: {definitions.global.projections} description_short_births: |- + <%- if sex is defined -%> <%-if not (sex == 'all' and age == 'all') -%> This only includes <%- if sex != 'all' -%> @@ -51,7 +52,8 @@ definitions: <%- if age != 'all' -%> mothers aged << age >> <%- endif -%>. - <%-endif -%> + <%- endif -%> + <%- endif -%> {definitions.global.projections} @@ -199,7 +201,7 @@ tables: <%-endif -%> {definitions.global.projections} net_migration_rate: - title: Net migration rate + title: Annual net migration rate unit: migrants per 1,000 people description_short: |- Net number of immigrants minus the number of emigrants, divided by the population of the receiving country over a year. {definitions.global.dimensions.description_short} diff --git a/etl/steps/data/garden/un/2024-07-16/migrant_stock.py b/etl/steps/data/garden/un/2024-07-16/migrant_stock.py index 38f54c7183a..c8d7f07f4d2 100644 --- a/etl/steps/data/garden/un/2024-07-16/migrant_stock.py +++ b/etl/steps/data/garden/un/2024-07-16/migrant_stock.py @@ -130,17 +130,17 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset("migrant_stock") # Read tables from meadow dataset. 
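The `(variant is defined)` guards added to the un_wpp metadata above make the Jinja templates safe to render when a dimension is absent. A minimal reproduction with jinja2, using OWID's `<% %>` / `<< >>` delimiters and assuming strict undefined handling (which is what the added guards suggest the renderer uses):

```python
from jinja2 import StrictUndefined, Template

src = (
    "<%- if (variant is defined) and (variant != 'estimates') -%>"
    "Projections from 2024 onwards are based on the UN's << variant >> scenario."
    "<%- endif -%>"
)
t = Template(
    src,
    block_start_string="<%",
    block_end_string="%>",
    variable_start_string="<<",
    variable_end_string=">>",
    undefined=StrictUndefined,
)

print(repr(t.render()))            # '' -- the `is defined` guard short-circuits safely
print(t.render(variant="medium"))  # ...based on the UN's medium scenario.
```

Without the guard, evaluating `variant != 'estimates'` would raise `UndefinedError` whenever `variant` is not passed in.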
# destination and origin table - tb_do = ds_meadow.read_table("migrant_stock_dest_origin") + tb_do = ds_meadow.read("migrant_stock_dest_origin") # destination table, total numbers and shares - tb_d_total = ds_meadow.read_table("migrant_stock_dest_total") - tb_d_share = ds_meadow.read_table("migrant_stock_dest_share") + tb_d_total = ds_meadow.read("migrant_stock_dest_total") + tb_d_share = ds_meadow.read("migrant_stock_dest_share") # origin table - tb_o = ds_meadow.read_table("migrant_stock_origin") + tb_o = ds_meadow.read("migrant_stock_origin") # table for data by sex and age - tb_sa_total = ds_meadow.read_table("migrant_stock_sex_age_total") - tb_sa_share = ds_meadow.read_table("migrant_stock_sex_age_share") + tb_sa_total = ds_meadow.read("migrant_stock_sex_age_total") + tb_sa_share = ds_meadow.read("migrant_stock_sex_age_share") # population data - tb_pop = ds_meadow.read_table("total_population") + tb_pop = ds_meadow.read("total_population") ## data on destination and origin # Remove aggregated regions from the dataset. diff --git a/etl/steps/data/garden/un/2024-07-25/refugee_data.py b/etl/steps/data/garden/un/2024-07-25/refugee_data.py index 49e9d272144..24853be2bee 100644 --- a/etl/steps/data/garden/un/2024-07-25/refugee_data.py +++ b/etl/steps/data/garden/un/2024-07-25/refugee_data.py @@ -21,8 +21,8 @@ def run(dest_dir: str) -> None: ds_resettlement = paths.load_dataset("resettlement") # Read table from meadow dataset. - tb = ds_meadow.read_table("refugee_data") - tb_resettlement = ds_resettlement.read_table("resettlement") + tb = ds_meadow.read("refugee_data") + tb_resettlement = ds_resettlement.read("resettlement") # filter out data before data availability starts (s. https://www.unhcr.org/refugee-statistics/methodology/, "Data publication timeline") tb["asylum_seekers"] = tb.apply(lambda x: x["asylum_seekers"] if x["year"] > 1999 else pd.NA, axis=1) diff --git a/etl/steps/data/garden/un/2024-07-25/resettlement.py b/etl/steps/data/garden/un/2024-07-25/resettlement.py index 9419083797b..0e9ce903186 100644 --- a/etl/steps/data/garden/un/2024-07-25/resettlement.py +++ b/etl/steps/data/garden/un/2024-07-25/resettlement.py @@ -17,7 +17,7 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset("resettlement") # Read table from meadow dataset. - tb = ds_meadow.read_table("resettlement") + tb = ds_meadow.read("resettlement") # filter out data before data availability starts (s. https://www.unhcr.org/refugee-statistics/methodology/, "Data publication timeline") tb["resettlement_arrivals"] = tb.apply(lambda x: x["resettlement_arrivals"] if x["year"] > 1958 else pd.NA, axis=1) diff --git a/etl/steps/data/garden/un/2024-08-27/un_sdg.py b/etl/steps/data/garden/un/2024-08-27/un_sdg.py index b75acb69554..22608710c21 100644 --- a/etl/steps/data/garden/un/2024-08-27/un_sdg.py +++ b/etl/steps/data/garden/un/2024-08-27/un_sdg.py @@ -27,7 +27,7 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset("un_sdg") # Read table from meadow dataset. 
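The year-availability filters in refugee_data and resettlement above use a row-wise `apply`, which works but is slow on long tables. A vectorized equivalent with `Series.where` produces the same missing-value masking:

```python
import pandas as pd

tb = pd.DataFrame({"year": [1998, 2000], "asylum_seekers": [5, 7]})

# Row-wise version, as in the steps above:
slow = tb.apply(lambda x: x["asylum_seekers"] if x["year"] > 1999 else pd.NA, axis=1)

# Vectorized equivalent: keep values where the condition holds, missing elsewhere.
fast = tb["asylum_seekers"].where(tb["year"] > 1999)

print(slow.tolist())  # [<NA>, 7]
print(fast.tolist())  # [nan, 7.0] -- NaN rather than pd.NA unless the column is nullable
```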
- tb_meadow = ds_meadow.read_table("un_sdg") + tb_meadow = ds_meadow.read("un_sdg", safe_types=False) # Create long and short units columns tb = create_units(tb_meadow) diff --git a/etl/steps/data/garden/un/2024-09-11/igme.countries.json b/etl/steps/data/garden/un/2024-09-11/igme.countries.json index ab60bdd1b43..f49051ee07f 100644 --- a/etl/steps/data/garden/un/2024-09-11/igme.countries.json +++ b/etl/steps/data/garden/un/2024-09-11/igme.countries.json @@ -131,7 +131,7 @@ "Niue": "Niue", "North Macedonia": "North Macedonia", "Norway": "Norway", - "Oceania (SDG)": "Oceania", + "Oceania (SDG)": "Oceania (SDG)", "Oman": "Oman", "Pakistan": "Pakistan", "Palau": "Palau", diff --git a/etl/steps/data/garden/un/2024-09-11/igme.meta.yml b/etl/steps/data/garden/un/2024-09-11/igme.meta.yml index c7f5d826968..b2c70afefec 100644 --- a/etl/steps/data/garden/un/2024-09-11/igme.meta.yml +++ b/etl/steps/data/garden/un/2024-09-11/igme.meta.yml @@ -32,6 +32,7 @@ definitions: <%- endif -%> indicator: display_name: |- + <% set indicator = indicator | default('') %> <% if indicator == "Neonatal mortality rate" %> The estimated number of deaths of {definitions.sex} children aged less than 28 days, per 100 live births, in {definitions.wealth_quintile} <% elif indicator == "Mortality rate 1-59 months" %> @@ -102,39 +103,46 @@ tables: title: Observation value unit: |- {definitions.unit_of_measure.unit} - short_unit: <% if 'rate' in indicator %>%<% else %><%- endif -%> + short_unit: <% if 'rate' in indicator | default('') %>%<% else %><%- endif -%> display: name: |- {definitions.indicator.display_name} - numDecimalPlaces: <% if 'rate' in indicator %>1<% else %>0<%- endif -%> + numDecimalPlaces: <% if 'rate' in indicator | default('') %>1<% else %>0<%- endif -%> + presentation: + attribution: United Nations Inter-agency Group for Child Mortality Estimation (2024) lower_bound: title: Lower bound unit: |- {definitions.unit_of_measure.unit} - short_unit: <% if 'rate' in indicator %>%<% else %><%- endif -%> + short_unit: <% if 'rate' in indicator | default('') %>%<% else %><%- endif -%> display: name: |- {definitions.indicator.display_name} - numDecimalPlaces: <% if 'rate' in indicator %>1<% else %>0<%- endif -%> + numDecimalPlaces: <% if 'rate' in indicator | default('') %>1<% else %>0<%- endif -%> + presentation: + attribution: United Nations Inter-agency Group for Child Mortality Estimation (2024) upper_bound: title: Upper bound unit: |- {definitions.unit_of_measure.unit} - short_unit: <% if 'rate' in indicator %>%<% else %><%- endif -%> + short_unit: <% if 'rate' in indicator | default('') %>%<% else %><%- endif -%> display: name: |- {definitions.indicator.display_name} - numDecimalPlaces: <% if 'rate' in indicator %>1<% else %>0<%- endif -%> + numDecimalPlaces: <% if 'rate' in indicator | default('') %>1<% else %>0<%- endif -%> + presentation: + attribution: United Nations Inter-agency Group for Child Mortality Estimation (2024) igme_under_fifteen_mortality: variables: obs_value: title: <> + description_short: <% if 'rate' in indicator | default('') %>Estimated share of children that die before reaching the age of fifteen.<% else %>Estimated number of children that die before reaching the age of fifteen.<%- endif -%> description_processing: This indicator is processed by OWID based on the original data source. It is a combination of the under-five mortality rate and the 5-14 mortality rate.
unit: <> - short_unit: "%" + short_unit: <% if 'rate' in indicator | default('') %>%<% else %><%- endif -%> display: name: <> - numDecimalPlaces: <% if 'rate' in indicator %>1<% else %>0<%- endif -%> + numDecimalPlaces: <% if 'rate' in indicator %>2<% else %>0<%- endif -%> sources: - name: United Nations Inter-agency Group for Child Mortality Estimation (2018; 2024) presentation: diff --git a/etl/steps/data/garden/un/2024-09-11/igme.py b/etl/steps/data/garden/un/2024-09-11/igme.py index 6266b62dad9..b74d410f6d2 100644 --- a/etl/steps/data/garden/un/2024-09-11/igme.py +++ b/etl/steps/data/garden/un/2024-09-11/igme.py @@ -3,7 +3,7 @@ from typing import List import pandas as pd -from owid.catalog import Table +from owid.catalog import Dataset, Table from owid.catalog import processing as pr from etl.data_helpers import geo @@ -11,12 +11,16 @@ # Get paths and naming conventions for current step. paths = PathFinder(__file__) +REGIONS = geo.REGIONS def run(dest_dir: str) -> None: # # Load inputs. - # + # Load countries-regions dataset (required to get ISO codes). + ds_regions = paths.load_dataset("regions") + # Load the population dataset + ds_population = paths.load_dataset("population") # Load meadow dataset. ds_meadow = paths.load_dataset("igme", version=paths.version) # Load vintage dataset which has older data needed for youth mortality @@ -42,6 +46,11 @@ def run(dest_dir: str) -> None: # Filter out just the bits of the data we want tb = filter_data(tb) tb = round_down_year(tb) + # add regional data for count variables + tb = add_regional_totals_for_counts(tb, ds_regions) + # add regional population weighted averages for rate variables + tb = add_population_weighted_regional_averages_for_rates(tb, ds_population, ds_regions) + # Removing commas from the unit of measure tb["unit_of_measure"] = tb["unit_of_measure"].str.replace(",", "", regex=False) tb["source"] = "igme (current)" @@ -55,12 +64,11 @@ def run(dest_dir: str) -> None: ) ] - # Combine datasets with a preference for the current data when there is a conflict. + # Combine datasets with a preference for the current data when there is a conflict - this is needed to calculate the youth mortality rate. 
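`combine_datasets(..., preferred_source=...)` above merges the current and vintage IGME series, keeping the current value wherever both report one. The repo's helper surely does more bookkeeping, but the core preference logic amounts to a `combine_first` (a sketch of the idea, not the actual implementation):

```python
import pandas as pd

idx = ["country", "year"]
current = pd.DataFrame({"country": ["A", "B"], "year": [2000, 2000], "deaths": [10, None]}).set_index(idx)
vintage = pd.DataFrame({"country": ["A", "B"], "year": [2000, 2000], "deaths": [11, 8]}).set_index(idx)

# Prefer the current series; fall back to vintage where current is missing.
combined = current["deaths"].combine_first(vintage["deaths"])
print(combined.tolist())  # [10.0, 8.0]
```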
tb_com = combine_datasets( tb_a=tb_under_fifteen, tb_b=tb_vintage, table_name="igme_combined", preferred_source="igme (current)" ) - tb_com = calculate_under_fifteen_deaths(tb_com) tb_com = calculate_under_fifteen_mortality_rates(tb_com) tb_com = tb_com.format(["country", "year", "indicator", "sex", "wealth_quintile", "unit_of_measure"]) @@ -84,6 +92,67 @@ def run(dest_dir: str) -> None: ds_garden.save() +def add_regional_totals_for_counts(tb: Table, ds_regions: Dataset) -> Table: + """ + Adding regional sums for variables that are counts + """ + tb_counts = tb[tb["unit_of_measure"] == "Number of deaths"] + + tb_all_regions = Table() + for region in REGIONS: + regions = geo.list_members_of_region(region=region, ds_regions=ds_regions) + tb_region = tb_counts[tb_counts["country"].isin(regions)] + tb_region = ( + tb_region.groupby(["year", "indicator", "sex", "wealth_quintile"])[ + ["obs_value", "lower_bound", "upper_bound"] + ] + .sum() + .reset_index() + ) + tb_region["country"] = region + tb_region["unit_of_measure"] = "Number of deaths" + tb_all_regions = pr.concat([tb_all_regions, tb_region]) + + tb = pr.concat([tb, tb_all_regions]) + + return tb + + +def add_population_weighted_regional_averages_for_rates( + tb: Table, ds_population: Dataset, ds_regions: Dataset +) -> Table: + """ + Adding population-weighted averages for the death rates + """ + tb_rates = tb[tb["unit_of_measure"] != "Number of deaths"] + tb_rates = geo.add_population_to_table(tb_rates, ds_population) + msk = tb_rates["population"].isna() + # Dropping out regions that don't have a population + tb_rates = tb_rates[~msk] + + tb_rates["obs_value_pop"] = tb_rates["obs_value"] * tb_rates["population"] + tb_rates = tb_rates.drop(columns=["lower_bound", "upper_bound"]) + tb_all_regions = Table() + for region in REGIONS: + regions = geo.list_members_of_region(region=region, ds_regions=ds_regions) + tb_region = tb_rates[tb_rates["country"].isin(regions)] + tb_region = ( + tb_region.groupby(["year", "indicator", "sex", "wealth_quintile", "unit_of_measure"])[ + ["obs_value_pop", "population"] + ] + .sum() + .reset_index() + ) + tb_region["country"] = region + tb_region["obs_value"] = tb_region["obs_value_pop"] / tb_region["population"] + tb_region = tb_region.drop(columns=["obs_value_pop", "population"]) + tb_all_regions = pr.concat([tb_all_regions, tb_region]) + + tb = pr.concat([tb, tb_all_regions]) + + return tb + + def convert_to_percentage(tb: Table) -> Table: """ Convert the units which are given as 'per 1,000...' into percentages. diff --git a/etl/steps/data/garden/un/2024-09-16/long_run_child_mortality.meta.yml b/etl/steps/data/garden/un/2024-09-16/long_run_child_mortality.meta.yml index 865d317d5f0..8d371f7210d 100644 --- a/etl/steps/data/garden/un/2024-09-16/long_run_child_mortality.meta.yml +++ b/etl/steps/data/garden/un/2024-09-16/long_run_child_mortality.meta.yml @@ -26,8 +26,60 @@ tables: long_run_child_mortality_selected: variables: under_five_mortality: - title: Under-five mortality rate (selected) + title: Under-five mortality rate unit: deaths per 100 live births short_unit: "%" display: numDecimalPlaces: 1 + description_short: The long-run estimated share of newborns who die before reaching the age of five. + description_key: + - This long-run indicator is a combination of two data sources, Gapminder and the UN Inter-agency Group for Child Mortality Estimation (UN IGME). 
+ - The historical data is compiled by Gapminder, the full range of sources used can be found in the [Gapminder documentation](https://www.gapminder.org/data/documentation/gd005/). + description_processing: >- + This indicator is a combination of data from two sources: + - The UN Inter-agency Group for Child Mortality Estimation (UN IGME) provides estimates of child mortality rates, which is available for some countries from 1932. + - Gapminder provides estimates of child mortality rates for the years 1800 to 2015. + + We combine the two datasets, for years where both are available, we have a preference for the UN IGME data. + presentation: + title_public: Under-five mortality rate + title_variant: Long-run data + attribution_short: UN IGME; Gapminder + grapher_config: + title: Child mortality rate + subtitle: The estimated share of newborns who die before reaching the age of five. + variantName: Long-run data; Gapminder & UN IGME + sourceDesc: UN IGME (2023); Gapminder (2015) + originUrl: https://ourworldindata.org/child-mortality + hasMapTab: true + yAxis: + max: 0 + min: 0 + minTime: 1800 + map: + time: latest + colorScale: + baseColorScheme: YlOrRd + binningStrategy: manual + customNumericColors: + - null + - null + - null + customNumericValues: + - 0.3 + - 0.5 + - 1 + - 3 + - 5 + - 10 + - 30 + - 50 + customNumericMinValue: 0 + timeTolerance: 0 + selectedEntityNames: + - United States + - United Kingdom + - Sweden + - France + - Brazil + - India diff --git a/etl/steps/data/garden/un/2024-10-21/census_dates.countries.json b/etl/steps/data/garden/un/2024-10-21/census_dates.countries.json new file mode 100644 index 00000000000..cf91fe99d02 --- /dev/null +++ b/etl/steps/data/garden/un/2024-10-21/census_dates.countries.json @@ -0,0 +1,239 @@ +{ + "Albania": "Albania", + "Algeria": "Algeria", + "American Samoa": "American Samoa", + "Andorra": "Andorra", + "Angola": "Angola", + "Anguilla": "Anguilla", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belize": "Belize", + "Benin": "Benin", + "Bermuda": "Bermuda", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bonaire, Sint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "British Virgin Islands": "British Virgin Islands", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Cayman Islands": "Cayman Islands", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cura\u00e7ao": "Curacao", + "Cyprus": "Cyprus", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eswatini": "Eswatini", + "Ethiopia": 
"Ethiopia", + "Falkland Islands (Malvinas)": "Falkland Islands", + "Faroe Islands": "Faroe Islands", + "Fiji": "Fiji", + "France": "France", + "French Guiana": "French Guiana", + "French Polynesia": "French Polynesia", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Ghana": "Ghana", + "Gibraltar": "Gibraltar", + "Greece": "Greece", + "Grenada": "Grenada", + "Guadeloupe": "Guadeloupe", + "Guam": "Guam", + "Guatemala": "Guatemala", + "Guernsey": "Guernsey", + "Guinea": "Guinea", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Holy See": "Vatican", + "Honduras": "Honduras", + "Hungary": "Hungary", + "India": "India", + "Indonesia": "Indonesia", + "Iran, Islamic Republic of": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Isle of Man": "Isle of Man", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jersey": "Jersey", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Korea, Republic of": "South Korea", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Martinique": "Martinique", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mayotte": "Mayotte", + "Mexico": "Mexico", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Montserrat": "Montserrat", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands Antilles": "Netherlands Antilles", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "Norfolk Island": "Norfolk Island", + "North Macedonia": "North Macedonia", + "Northern Mariana Islands": "Northern Mariana Islands", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Palestine, State of": "Palestine", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Republic of Moldova": "Moldova", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "R\u00e9union": "Reunion", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Sri Lanka": "Sri Lanka", + "St. Kitts and Nevis": "Saint Kitts and Nevis", + "St. Lucia": "Saint Lucia", + "St. 
Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tokelau": "Tokelau", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkmenistan": "Turkmenistan", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Republic of Tanzania": "Tanzania", + "United States Virgin Islands": "United States Virgin Islands", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela": "Venezuela", + "Viet Nam": "Vietnam", + "Wallis and Futuna Islands": "Wallis and Futuna", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "- England and Wales": "England and Wales", + "- Northern Ireland": "Northern Ireland", + "- Scotland": "Scotland", + "Bahrain": "Bahrain", + "Belgium": "Belgium", + "China - Hong Kong SAR": "Hong Kong", + "China - Macao SAR": "Macao", + "Czechia": "Czechia", + "Denmark": "Denmark", + "Estonia": "Estonia", + "Finland": "Finland", + "Germany (8)": "Germany", + "Greenland": "Greenland", + "Guinea Bissau": "Guinea-Bissau", + "Iceland": "Iceland", + "Israel": "Israel", + "Italy": "Italy", + "Korea, Democratic People's Republic of": "North Korea", + "Latvia": "Latvia", + "Libya Arab Jamahiriya": "Libya", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Micronesia, Federated States of": "Micronesia (country)", + "Monaco": "Monaco", + "Netherlands": "Netherlands", + "Norway": "Norway", + "Pitcairn": "Pitcairn", + "Poland": "Poland", + "Saint Helena ex. dep.": "Saint Helena", + "Saint-Barth\u00e9lemy": "Saint Barthelemy", + "Saint-Martin": "Saint Martin (French part)", + "Sao Tom\u00e9 and Princip\u00e9": "Sao Tome and Principe", + "Singapore": "Singapore", + "Sint Maarten": "Sint Maarten (Dutch part)", + "Spain": "Spain", + "St. Pierre and Miquelon": "Saint Pierre and Miquelon", + "Svalbard and Jan Mayen Islands": "Svalbard and Jan Mayen", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Turkey": "Turkey" +} \ No newline at end of file diff --git a/etl/steps/data/garden/un/2024-10-21/census_dates.py b/etl/steps/data/garden/un/2024-10-21/census_dates.py new file mode 100644 index 00000000000..a173ccc2abe --- /dev/null +++ b/etl/steps/data/garden/un/2024-10-21/census_dates.py @@ -0,0 +1,201 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import numpy as np +import pandas as pd +from owid.catalog import Table +from owid.catalog import processing as pr + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + +COMMENTS_DICT = { + "(P)": "Population census only.", + "(H)": "Housing census only.", + "(1)": "Urban areas only.", + "(2)": "Enumeration of settled population was in November 1986 and of nomads in February 1987.", + "(3)": "Population figures compiled from administrative registers.", + "(4)": "Population figures compiled from administrative registers in combination with other sources of data, such as sample surveys.", + "(5)": "The population by-censuses for 1986 and 1996 were based on one-in-seven sample of the population, while that for 2006 was based on one-in-ten sample of the population.", + "(6)": "Enumeration of former Yemen Arab Republic.", + "(7)": "Enumeration of former Democratic Yemen.", + "(8)": "Through accession of the German Democratic Republic to the Federal Republic of Germany with effect from 3 October 1990, the two German States have united to form one sovereign State. As from the date of unification, the Federal Republic of Germany acts in the United Nations under the designation 'Germany'.", + "(9)": "Enumeration of former Federal Republic of Germany.", + "(10)": "Combined with agricultural census.", + "(11)": "No formal census conducted. A count of numbers of each family group by name, sex, age and whether permanent or expatriate resident is made on 30 or 31 December each year.", + "(12)": "A register-based test census was conducted on 5 December 2001 on a sample of 1.2% of the population.", + "(13)": "Due to the circumstances, the census was conducted again in 2004.", + "(14)": "Census not carried out on the territory of Kosovo and Metohia.", + "(15)": "Rolling Census based on continuous sample survey.", + "(16)": "Census was planned to be conducted using staggered enumerations province by province. At the end of 2014, only 6 of the 34 provinces had been enumerated.", + "(17)": "Traditional decennial census with full field enumeration, and a continuous sample survey.", + "(18)": "Population figures compiled from administrative registers and sample surveys while data on housing characteristics are collected through full field enumeration.", + "(19)": "Cancelled.", +} + +MONTHS_DICT = { + "Jan.": "January", + "Feb.": "February", + "Mar.": "March", + "Apr.": "April", + "Jun.": "June", + "Jul.": "July", + "Aug.": "August", + "Sep.": "September", + "Oct.": "October", + "Nov.": "November", + "Dec.": "December", +} + +MIN_YEAR = 1985 +CURR_YEAR = 2024 + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("census_dates") + + # Read table from meadow dataset. + tb = ds_meadow["census_dates"].reset_index() + + # remove timeframes from date + tb["date"] = tb["date"].apply(lambda x: x.split("-")[1] if "-" in x else x) + + # add comments + tb["comment"] = tb.apply(lambda x: get_comment(x["date"], x["country"]), axis=1) + + # clean date and country columns + tb["date"] = tb["date"].apply(clean_date) + tb["date"] = tb["date"].replace(MONTHS_DICT) + tb["country"] = tb["country"].apply(clean_country) + + # convert date to datetime + tb["date_as_year"] = tb["date"].apply(date_as_year) + + # + # Process data. 
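The footnote handling that follows (see `get_comment`, `clean_date` and `date_as_year` below) splits a raw entry into a comment, a clean date and a year. A worked example, under the assumption that raw values look like `"(P) 12 October 2010"` (the exact raw format is inferred, not documented here):

```python
comments = {"(P)": "Population census only."}  # subset of COMMENTS_DICT above

raw = "(P) 12 October 2010"

comment = comments[raw.split(" ")[0]] if raw.startswith("(") else ""
clean = " ".join(raw.split(" ")[1:]) if raw.startswith("(") else raw
year = int(clean.split(" ")[-1])

print(comment, "|", clean, "|", year)  # Population census only. | 12 October 2010 | 2010
```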
+ # + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + tb_census = years_since_last_census(tb) + tb_census = add_uk(tb_census) + tb_census = tb_census.format(["country", "year"]) + + # create indicator that shows if a census was conducted in the last 10 years + tb_census["recent_census"] = tb_census["years_since_last_census"].apply( + lambda x: np.nan if np.isnan(x) else 1 if x <= 10 else 0 + ) + + tb = tb.format(["country", "date"]) + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb, tb_census], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def get_comment(date: str, country: str) -> str: + """Get comments from footnotes in date and country column.""" + comment = "" + if date.startswith("("): + note = date.split(" ")[0] + comment += COMMENTS_DICT[note] + if country.endswith(")"): + if country.split(" ")[-1] not in ["(Malvinas)"]: + note = country.split(" ")[-1] + if len(comment) == 0: + comment += COMMENTS_DICT[note] + else: + comment += ", " + COMMENTS_DICT[note] + return comment + + +def clean_date(date: str) -> str: + """Clean date column.""" + if date.startswith("("): + return " ".join(date.split(" ")[1:]) + else: + return date + + +def clean_country(country: str) -> str: + """Clean country column.""" + if (country.endswith(")")) and (country.split(" ")[-1] not in ["(Malvinas)"]): + return " ".join(country.split(" ")[:-1]) + else: + return country + + +def date_as_year(date: str) -> int: + """Extract year from date.""" + if " " in date and date.split(" ")[-1].isdigit(): + return int(date.split(" ")[-1]) + elif "." 
in date: + return int(date.split(".")[-1]) + elif date.startswith("[") and date.endswith("]"): + return int(date[1:-1]) + else: + return int(date) + + +def years_since_last_census(tb: Table) -> Table: + countries = tb["country"].unique() + years = [int(x) for x in range(1985, 2024)] + rows = [] + for country in countries: + country_df = tb[tb["country"] == country].sort_values("date_as_year", ascending=True) + census_years = country_df["date_as_year"].tolist() + for year in years: + prev_census = [x for x in census_years if x <= year] + if prev_census: + last_census = max([x for x in census_years if x <= year]) + years_since_last_census = year - last_census + else: + last_census = None + years_since_last_census = None + rows.append( + { + "country": country, + "year": year, + "last_census": last_census, + "years_since_last_census": years_since_last_census, + } + ) + tb_census = Table(pd.DataFrame(rows)).copy_metadata(tb) + tb_census.m.short_name = "years_since_last_census" + + for col in tb_census.columns: + tb_census[col].metadata = tb["date"].m + tb_census[col].metadata.origins = tb["date"].m.origins + return tb_census + + +def add_uk(tb_census): + years = [int(x) for x in range(MIN_YEAR, CURR_YEAR)] + uk_rows = [] + uk_countries = ["England and Wales", "Scotland", "Northern Ireland"] + for year in years: + uk_tb = tb_census[tb_census["country"].isin(uk_countries) & (tb_census["year"] == year)] + uk_last_census = uk_tb["last_census"].max() + uk_years_since_last_census = year - uk_last_census + uk_rows.append( + { + "country": "United Kingdom", + "year": year, + "last_census": uk_last_census, + "years_since_last_census": uk_years_since_last_census, + } + ) + uk_tb_census = Table(pd.DataFrame(uk_rows)).copy_metadata(tb_census) + tb_census = pr.concat([tb_census, uk_tb_census], axis=0) + return tb_census diff --git a/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.countries.json b/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.countries.json new file mode 100644 index 00000000000..2eccd289241 --- /dev/null +++ b/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.countries.json @@ -0,0 +1,254 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "American Samoa": "American Samoa", + "Andorra": "Andorra", + "Angola": "Angola", + "Anguilla": "Anguilla", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bermuda": "Bermuda", + "Bhutan": "Bhutan", + "Bolivia (Plurinational State of)": "Bolivia", + "Bonaire, Sint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "British Virgin Islands": "British Virgin Islands", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cayman Islands": "Cayman Islands", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "China, Hong Kong SAR": "Hong Kong", + "China, Macao SAR": "Macao", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + 
"Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cura\u00e7ao": "Curacao", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Falkland Islands (Malvinas)": "Falkland Islands", + "Faroe Islands": "Faroe Islands", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "French Guiana": "French Guiana", + "French Polynesia": "French Polynesia", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Gibraltar": "Gibraltar", + "Greece": "Greece", + "Greenland": "Greenland", + "Grenada": "Grenada", + "Guadeloupe": "Guadeloupe", + "Guam": "Guam", + "Guatemala": "Guatemala", + "Guernsey": "Guernsey", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Isle of Man": "Isle of Man", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jersey": "Jersey", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Martinique": "Martinique", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mayotte": "Mayotte", + "Melanesia": "Melanesia", + "Mexico": "Mexico", + "Micronesia": "Micronesia (country)", + "Micronesia (Fed. 
States of)": "Micronesia (country)", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Montserrat": "Montserrat", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "North Macedonia": "North Macedonia", + "Northern Mariana Islands": "Northern Mariana Islands", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Polynesia": "Polynesia", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "R\u00e9union": "Reunion", + "Saint Barth\u00e9lemy": "Saint Barthelemy", + "Saint Helena": "Saint Helena", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Martin (French part)": "Saint Martin (French part)", + "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South America": "South America", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "State of Palestine": "Palestine", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tokelau": "Tokelau", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkmenistan": "Turkmenistan", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States Virgin Islands": "United States Virgin Islands", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "Viet Nam": "Vietnam", + "Wallis and Futuna Islands": "Wallis and Futuna", + "Western Sahara": "Western Sahara", + "World": "World", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "China, Taiwan Province of China": "Taiwan", + "Dem. People's Republic of Korea": "North Korea", + "Kosovo (under UNSC res. 
1244)": "Kosovo", + "T\u00fcrkiye": "Turkey", + "High-income countries": "High-income countries", + "Low-income countries": "Low-income countries", + "Lower-middle-income countries": "Lower-middle-income countries", + "Upper-middle-income countries": "Upper-middle-income countries", + "Latin America and the Caribbean": "Latin America and the Caribbean (UN)", + "Africa": "Africa (UN)", + "Asia": "Asia (UN)", + "Europe": "Europe (UN)", + "Northern America": "Northern America (UN)", + "Oceania": "Oceania (UN)", + "Holy See": "Vatican" +} diff --git a/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.meta.yml b/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.meta.yml new file mode 100644 index 00000000000..c7b3d363bb8 --- /dev/null +++ b/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.meta.yml @@ -0,0 +1,104 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + life_expectancy: + description_short: |- + <%- if type == "period" -%> + Period life expectancy is the average number of years a person is expected to live, based on mortality rates seen across all age groups in a given interval. + <%- elif type == "cohort" -%> + Cohort life expectancy is the average number of years that individuals born in a given interval actually lived, based on tracking that birth cohort over time. + <%- endif -%> + point_1: |- + <%- if type == "period" -%> + This is based on a synthetic cohort created using mortality rates across age groups in a given year. + <%- elif type == "cohort" -%> + Rather than waiting for the entire cohort to have died before calculating the cohort life expectancy, researchers may use data from birth cohorts that are 'almost extinct'. + <%- endif -%> + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 365 + description: |- + Abridged life tables up to age 100 by sex and both sexes combined providing a set of values showing the mortality experience of a hypothetical group of infants born at the same time and subject throughout their lifetime to the specific mortality rates of a given year (period life tables), from 1950 to 2023. + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + un_wpp_lt: + variables: + central_death_rate: + title: Central death rate + description_short: |- + The death rate, calculated as the number of deaths divided by the average number of people alive during the interval. + description_key: + - |- + The death rate is measured using the number of person-years lived during the interval. + - |- + Person-years refers to the combined total time that a group of people has lived. For example, if 10 people each live for 2 years, they collectively contribute 20 person-years. + - |- + The death rate is slightly different from the 'probability of death' during the interval, because the 'probability of death' metric uses a different denominator: the number of people alive at that age at the start of the interval, while this indicator uses the average number of people alive during the interval. + unit: deaths per 1,000 people + processing_level: minor + description_processing: |- + The original metric is given as a fraction between 0 and 1 (i.e. per-capita). We multiply this by 1,000 to get a per-1,000 people rate. 
+      probability_of_death:
+        title: Probability of death
+        unit: "%"
+        description_short: |-
+          The probability of dying in a given interval, among people who survived to the start of that interval.
+        description_key:
+          - |-
+            For example, the probability of death for a 50-year-old in a given year is found by dividing the number of deaths among 50-year-olds that year by the number of people alive at age 50 at the start of the year.
+        processing_level: minor
+        description_processing: |-
+          The original metric is given as a fraction between 0 and 1 (i.e. per-capita). We multiply this by 100 to get a percentage.
+      probability_of_survival:
+        title: Probability of survival
+        unit: "%"
+        description_short: The probability that a person who survived until the start of a given interval will still be alive at the end of the interval.
+        processing_level: minor
+        description_processing: |-
+          The original metric is given as a fraction between 0 and 1 (i.e. per-capita). We multiply this by 100 to get a percentage.
+      average_survival_length:
+        title: Average survival length
+        short_unit: years
+        unit: years
+        description_short: Average length of survival between ages x and x+n for persons dying in the interval.
+      number_survivors:
+        title: Number of survivors
+        unit: survivors
+        description_short: Number of survivors at exact age x, assuming an initial cohort of 100,000 at age 0.
+      number_deaths:
+        title: Number of deaths
+        short_unit: deaths
+        unit: deaths
+        description_short: Number of deaths between ages x and x+n.
+      number_person_years_lived:
+        title: Number of person-years lived
+        unit: person-years
+        description_short: Number of person-years lived between ages x and x+n.
+      survivorship_ratio:
+        title: Survivorship ratio
+        unit: ""
+        description_short: The survivorship ratio (nSx), i.e. the proportion of the life-table population in age group (x, x+n) who are still alive n years later.
+      number_person_years_remaining:
+        title: Number of person-years remaining
+        unit: person-years
+        description_short: Number of person-years remaining after exact age x.
+      life_expectancy:
+        title: Life expectancy
+        short_unit: years
+        unit: years
+        description_short: |-
+          <%- if type == "period" -%>
+          The average number of years a person is expected to live, based on mortality rates seen across all age groups in a given interval.
+          <%- elif type == "cohort" -%>
+          The average number of years that individuals born in a given interval actually lived, based on tracking that birth cohort over time.
+          <%- endif -%>
+        description_key:
+          - |-
+            <%- if type == "period" -%>
+            Life expectancy at age 0 refers to life expectancy at birth.
+            <%- else -%>
+            It refers to the remaining life expectancy for people who have already survived to the given age.
+            <%- endif -%>
diff --git a/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.py b/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.py
new file mode 100644
index 00000000000..21b1a4d92ba
--- /dev/null
+++ b/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.py
@@ -0,0 +1,100 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+import owid.catalog.processing as pr
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+# Column rename and selection.
+COLUMNS_RENAME = {
+    "time": "year",
+    "agegrpstart": "age",
+}
+COLUMNS_INDEX = [
+    "location",
+    "year",
+    "sex",
+    "age",
+]
+COLUMNS_INDICATORS = [
+    "central_death_rate",
+    "probability_of_death",
+    "probability_of_survival",
+    "number_survivors",
+    "number_deaths",
+    "number_person_years_lived",
+    "survivorship_ratio",
+    "number_person_years_remaining",
+    "life_expectancy",
+    "average_survival_length",
+]
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("un_wpp_lt")
+
+    # Read tables from the meadow dataset and concatenate them.
+    paths.log.info("load tables, concatenate.")
+    tb = pr.concat(
+        [
+            ds_meadow["un_wpp_lt_all"].reset_index(),
+            ds_meadow["un_wpp_lt_f"].reset_index(),
+            ds_meadow["un_wpp_lt_m"].reset_index(),
+        ],
+        short_name=paths.short_name,
+    ).reset_index(drop=True)
+
+    #
+    # Process data.
+    #
+    # Sanity check.
+    assert (tb["agegrpspan"].isin([1, -1])).all() and (
+        tb.loc[tb["agegrpspan"] == -1, "agegrpstart"] == 100
+    ).all(), "Age group span should always be 1, except for 100+ (-1)"
+
+    # Rename columns, select columns.
+    tb = tb.rename(columns=COLUMNS_RENAME)
+
+    # Set dtypes.
+    tb = tb.astype(
+        {
+            "age": str,
+        }
+    )
+
+    # Change 100 -> 100+
+    tb.loc[tb["age"] == "100", "age"] = "100+"
+
+    # Scale indicators to more convenient units.
+    paths.log.info("scale indicators.")
+    tb["central_death_rate"] = tb["central_death_rate"] * 1000
+    tb["probability_of_death"] = tb["probability_of_death"] * 100
+    tb["probability_of_survival"] = tb["probability_of_survival"] * 100
+
+    # Harmonize country names.
+    paths.log.info("harmonise country names.")
+    tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path, country_col="location")
+
+    # Harmonize sex values.
+    tb["sex"] = tb["sex"].map({"Total": "total", "Male": "male", "Female": "female"})
+    assert tb["sex"].notna().all(), "NaNs detected after mapping sex values!"
+
+    # Set index.
+    tb = tb.set_index(COLUMNS_INDEX, verify_integrity=True)[COLUMNS_INDICATORS]
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+    )
+
+    # Save changes in the new garden dataset.
+ ds_garden.save() diff --git a/etl/steps/data/garden/unesco/2024-11-21/enrolment_rates.countries.json b/etl/steps/data/garden/unesco/2024-11-21/enrolment_rates.countries.json new file mode 100644 index 00000000000..aadc257f762 --- /dev/null +++ b/etl/steps/data/garden/unesco/2024-11-21/enrolment_rates.countries.json @@ -0,0 +1,234 @@ +{ "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Anguilla": "Anguilla", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bermuda": "Bermuda", + "Bhutan": "Bhutan", + "Bolivia (Plurinational State of)": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "British Virgin Islands": "British Virgin Islands", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cayman Islands": "Cayman Islands", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cura\u00e7ao": "Curacao", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic People's Republic of Korea": "North Korea", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Falkland Islands (Malvinas)": "Falkland Islands", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "French Guiana": "French Guiana", + "French Polynesia": "French Polynesia", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Gibraltar": "Gibraltar", + "Greece": "Greece", + "Grenada": "Grenada", + "Guadeloupe": "Guadeloupe", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": 
"Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Martinique": "Martinique", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Montserrat": "Montserrat", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Palestine": "Palestine", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "R\u00e9union": "Reunion", + "Saint Helena": "Saint Helena", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tokelau": "Tokelau", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkmenistan": "Turkmenistan", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "Viet Nam": "Vietnam", + "World": "World", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Arab States": "Arab States", + "Central Asia": "Central Asia (UIS)", + "Central and Eastern Europe": "Central and Eastern Europe (UIS)", + "China, Hong Kong Special Administrative Region": "Hong Kong", + "China, Macao Special Administrative Region": "Macao", + "East Asia and the Pacific": "East Asia and the Pacific (UIS)", + "High income countries": "High-income countries", + "Latin America and the Caribbean": "Latin America and the Caribbean 
(UIS)", + "Low income countries": "Low-income countries", + "Lower middle income countries": "Lower-middle-income countries", + "Middle income countries": "Middle-income countries", + "North America and Western Europe": "North America and Western Europe (UIS)", + "Small Island Developing States": "Small Island Developing States (UIS)", + "South and West Asia": "South and West Asia (UIS)", + "Sub-Saharan Africa": "Sub-Saharan Africa (UIS)", + "T\u00fcrkiye": "Turkey", + "Upper middle income countries": "Upper-middle-income countries" +} \ No newline at end of file diff --git a/etl/steps/data/garden/unesco/2024-11-21/enrolment_rates.meta.yml b/etl/steps/data/garden/unesco/2024-11-21/enrolment_rates.meta.yml new file mode 100644 index 00000000000..1bba9a9ba56 --- /dev/null +++ b/etl/steps/data/garden/unesco/2024-11-21/enrolment_rates.meta.yml @@ -0,0 +1,12 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Global Education + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 diff --git a/etl/steps/data/garden/unesco/2024-11-21/enrolment_rates.py b/etl/steps/data/garden/unesco/2024-11-21/enrolment_rates.py new file mode 100644 index 00000000000..5f82b0fa6eb --- /dev/null +++ b/etl/steps/data/garden/unesco/2024-11-21/enrolment_rates.py @@ -0,0 +1,75 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("enrolment_rates") + + # Read table from meadow dataset. + tb = ds_meadow["enrolment_rates"].reset_index() + + # Retrieve snapshot with the metadata provided via World Bank. + + snap_wb = paths.load_snapshot("edstats_metadata.xls") + tb_wb = snap_wb.read() + + # + # Process data. + # + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + # Add the long description from the World Bank metadata + long_definition = {} + for indicator in tb["indicator"].unique(): + definition = tb_wb[tb_wb["Indicator Name"] == indicator]["Long definition"].values + if len(definition) > 0: + long_definition[indicator] = definition[0] + else: + long_definition[indicator] = "" + + tb["long_description"] = tb["indicator"].map(long_definition) + + # Pivot the table to have the indicators as columns to add descriptions from producer + tb_pivoted = tb.pivot(index=["country", "year"], columns="indicator", values="value") + for column in tb_pivoted.columns: + meta = tb_pivoted[column].metadata + long_definition = tb["long_description"].loc[tb["indicator"] == column].iloc[0] + meta.description_from_producer = long_definition + meta.title = column + meta.display = {} + + meta.display["numDecimalPlaces"] = 1 + meta.unit = "%" + meta.short_unit = "%" + + tb_pivoted = tb_pivoted.reset_index() + tb_pivoted = tb_pivoted.format(["country", "year"]) + + # Drop columns that are not needed + tb_pivoted = tb_pivoted.drop( + columns=[ + "total_net_enrolment_rate__lower_secondary__adjusted_gender_parity_index__gpia", + "total_net_enrolment_rate__primary__adjusted_gender_parity_index__gpia", + "total_net_enrolment_rate__upper_secondary__adjusted_gender_parity_index__gpia", + ] + ) + + # + # Save outputs. 
+ # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb_pivoted], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/garden/unicef/2024-07-30/child_migration.py b/etl/steps/data/garden/unicef/2024-07-30/child_migration.py index d8208fbce67..39a86cf8d8f 100644 --- a/etl/steps/data/garden/unicef/2024-07-30/child_migration.py +++ b/etl/steps/data/garden/unicef/2024-07-30/child_migration.py @@ -79,7 +79,7 @@ def run(dest_dir: str) -> None: ds_population = paths.load_dataset("population") # Read table from meadow dataset. - tb = ds_meadow.read_table("child_migration") + tb = ds_meadow.read("child_migration") # combine indicator, statistical population and unit columns (to get one indicator per combination) diff --git a/etl/steps/data/garden/unu_wider/2023-11-01/government_revenue_dataset.py b/etl/steps/data/garden/unu_wider/2023-11-01/government_revenue_dataset.py index acc32e890b5..3d281c14dde 100644 --- a/etl/steps/data/garden/unu_wider/2023-11-01/government_revenue_dataset.py +++ b/etl/steps/data/garden/unu_wider/2023-11-01/government_revenue_dataset.py @@ -1,7 +1,15 @@ -"""Load a meadow dataset and create a garden dataset.""" +""" +Load a meadow dataset and create a garden dataset. + +NOTE: To extract the log of the process (to review sanity checks, for example), follow these steps: + 1. Define LONG_FORMAT as True. + 2. Run the following command in the terminal: + nohup uv run etl run government_revenue_dataset > government_revenue_dataset.log 2>&1 & +""" from owid.catalog import Table from structlog import get_logger +from tabulate import tabulate from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset @@ -11,6 +19,12 @@ # Get paths and naming conventions for current step. paths = PathFinder(__file__) +# Set table format when printing +TABLEFMT = "pretty" + +# Define if I show the full table or just the first 5 rows for assertions +LONG_FORMAT = False + def run(dest_dir: str) -> None: # @@ -27,13 +41,12 @@ def run(dest_dir: str) -> None: # # Process data. - tb = drop_flagged_rows_and_unnecessary_columns(tb) - - # tb = geo.harmonize_countries( df=tb, countries_file=paths.country_mapping_path, ) + tb = drop_flagged_rows_and_unnecessary_columns(tb) + tb = tb.set_index(["country", "year"], verify_integrity=True) # @@ -92,11 +105,74 @@ def drop_flagged_rows_and_unnecessary_columns(tb: Table) -> Table: "cautionnotes", "resourcerevenuenotes", "socialcontributionsnotes", + ] + ) + + tb = sanity_checks(tb) + + # Remove all caution columns + tb = tb.drop( + columns=[ + "caution1accuracyqualityorco", "caution2resourcerevenuestax", "caution3unexcludedresourcere", "caution4inconsistencieswiths", ] - + caution_variables ) return tb + + +def sanity_checks(tb: Table) -> None: + """ + Perform sanity checks on the data. 
+ """ + + tb = tb.copy() + + tb = check_negative_values(tb) + + return tb + + +def check_negative_values(tb: Table): + """ + Check if there are negative values in the variables + """ + + tb = tb.copy() + + # Define columns as all the columns minus country and year + variables = [ + col + for col in tb.columns + if col + not in ["country", "year"] + + [ + "caution1accuracyqualityorco", + "caution2resourcerevenuestax", + "caution3unexcludedresourcere", + "caution4inconsistencieswiths", + ] + ] + + for v in variables: + # Create a mask to check if any value is negative + mask = tb[v] < 0 + any_error = mask.any() + + if any_error: + tb_error = tb[mask].reset_index(drop=True).copy() + paths.log.warning( + f"""{len(tb_error)} observations for {v} are negative: + {_tabulate(tb_error[['country', 'year', 'caution1accuracyqualityorco', 'caution2resourcerevenuestax','caution3unexcludedresourcere','caution4inconsistencieswiths',v]], long_format=LONG_FORMAT)}""" + ) + + return tb + + +def _tabulate(tb: Table, long_format: bool, headers="keys", tablefmt=TABLEFMT, **kwargs): + if long_format: + return tabulate(tb, headers=headers, tablefmt=tablefmt, **kwargs) + else: + return tabulate(tb.head(5), headers=headers, tablefmt=tablefmt, **kwargs) diff --git a/etl/steps/data/garden/urbanization/2024-10-14/ghsl_degree_of_urbanisation.meta.yml b/etl/steps/data/garden/urbanization/2024-10-14/ghsl_degree_of_urbanisation.meta.yml index b12d8afc8f2..7d7c77d2f91 100644 --- a/etl/steps/data/garden/urbanization/2024-10-14/ghsl_degree_of_urbanisation.meta.yml +++ b/etl/steps/data/garden/urbanization/2024-10-14/ghsl_degree_of_urbanisation.meta.yml @@ -9,13 +9,13 @@ definitions: - |- **The Degree of Urbanisation (DEGURBA)** is a method for capturing the urban-rural divide, designed for international comparisons. Developed by six organizations and endorsed by the UN, it uses a two-level classification. - The first level divides areas into cities, towns and semi-dense areas, and rural areas, distinguishing between urban (cities, towns, suburbs) and rural regions. The second level adds detail, splitting towns and rural areas further. + The first level divides areas into cities, towns, and villages, distinguishing between urban (cities, towns, suburbs) and rural regions. The second level adds detail, splitting towns and villages further. This classification is based on 1 km² grid cells, grouped into urban centers, urban clusters, and rural cells. These grids are then used to classify smaller areas, typically using residential population grids from censuses or registers. If detailed data isn't available, a disaggregation grid estimates population distribution. To predict future urbanization (2025 and 2030), both static (land features) and dynamic (past satellite images) components are used to project growth. DEGURBA defines cities by population, not administrative borders, aligning with UN guidelines, though fixed thresholds may not always capture local differences. - description_short: The European Commission combines satellite imagery with national census data to identify [cities](#dod:cities-degurba), [towns and semi-dense areas](#dod:towns-suburbs-degurba), and [rural areas](#dod:rural-areas-degurba) and estimate their respective populations. + description_short: The European Commission combines satellite imagery with national census data to identify [cities](#dod:cities-degurba), [towns](#dod:towns-degurba), and [villages](#dod:villages-degurba) and estimate their respective populations. 
# Learn more about the available fields: @@ -30,13 +30,13 @@ tables: value: title: |- <% if location_type == "rural_total" and attribute == 'area' and type == 'estimates' %> - Land covered by rural areas + Land covered by villages <% elif location_type == "rural_total" and attribute == 'population' and type == 'estimates' %> - Population living in rural areas + Population living in villages <% elif location_type == "rural_total" and attribute == 'share' and type == 'estimates' %> - Share of land covered by rural areas + Share of land covered by villages <% elif location_type == "rural_total" and attribute == 'popshare' and type == 'estimates' %> - Share of population living in rural areas + Share of population living in villages <% elif location_type == "urban_centre" and attribute == 'area' and type == 'estimates' %> Land covered by cities @@ -48,13 +48,13 @@ tables: Share of population living in cities <% elif location_type == "urban_cluster" and attribute == 'area' and type == 'estimates' %> - Land covered by towns and semi-dense areas + Land covered by towns <% elif location_type == "urban_cluster" and attribute == 'population' and type == 'estimates' %> - Population living in towns and semi-dense areas + Population living in towns <% elif location_type == "urban_cluster" and attribute == 'share' and type == 'estimates' %> - Share of land covered by towns and semi-dense areas + Share of land covered by towns <% elif location_type == "urban_cluster" and attribute == 'popshare' and type == 'estimates' %> - Share of population living in towns and semi-dense areas + Share of population living in towns <% elif location_type == "urban_total" and attribute == 'area' and type == 'estimates' %> Land covered by urban areas @@ -66,13 +66,13 @@ tables: Share of population living in urban areas <% elif location_type == "rural_total" and attribute == 'area' and type == 'projections' %> - Projected land covered by rural areas + Projected land covered by villages <% elif location_type == "rural_total" and attribute == 'population' and type == 'projections' %> - Projected population living in rural areas + Projected population living in villages <% elif location_type == "rural_total" and attribute == 'share' and type == 'projections' %> - Projected share of land covered by rural areas + Projected share of land covered by villages <% elif location_type == "rural_total" and attribute == 'popshare' and type == 'projections' %> - Projected share of population living in rural areas + Projected share of population living in villages <% elif location_type == "urban_centre" and attribute == 'area' and type == 'projections' %> Projected land covered by cities @@ -84,13 +84,13 @@ tables: Projected share of population living in cities <% elif location_type == "urban_cluster" and attribute == 'area' and type == 'projections' %> - Projected land covered by towns and semi-dense areas + Projected land covered by towns <% elif location_type == "urban_cluster" and attribute == 'population' and type == 'projections' %> - Projected population living in towns and semi-dense areas + Projected population living in towns <% elif location_type == "urban_cluster" and attribute == 'share' and type == 'projections' %> - Projected share of land covered by towns and semi-dense areas + Projected share of land covered by towns <% elif location_type == "urban_cluster" and attribute == 'popshare' and type == 'projections' %> - Projected share of population living in towns and semi-dense areas + Projected share of population living in 
towns <% elif location_type == "urban_total" and attribute == 'area' and type == 'projections' %> Projected land covered by urban areas @@ -107,7 +107,7 @@ tables: <% elif location_type == "semi_dense" and attribute == 'number' and type == 'estimates' %> Number of semi-dense areas <% elif location_type == "rural_total" and attribute == 'number' and type == 'estimates' %> - Number of rural areas + Number of villages <% elif location_type == "urban_centre" and attribute == 'number' and type == 'estimates' %> Number of cities @@ -117,7 +117,7 @@ tables: <% elif location_type == "semi_dense" and attribute == 'number' and type == 'projections' %> Projected number of semi-dense areas <% elif location_type == "rural_total" and attribute == 'number' and type == 'projections' %> - Projected number of rural areas + Projected number of villages <% elif location_type == "urban_centre" and attribute == 'number' and type == 'projections' %> Projected number of cities @@ -127,14 +127,14 @@ tables: Projected population density in cities <% elif location_type == "urban_cluster" and attribute == 'density' and type == 'estimates' %> - Population density in towns and semi-dense areas + Population density in towns <% elif location_type == "urban_cluster" and attribute == 'density' and type == 'projections' %> - Projected population density in towns and semi-dense areas + Projected population density in towns <% elif location_type == "rural_total" and attribute == 'density' and type == 'estimates' %> - Population density in rural areas + Population density in villages <% elif location_type == "rural_total" and attribute == 'density' and type == 'projections' %> - Projected population density in rural areas + Projected population density in villages <% endif %> unit: @@ -160,7 +160,7 @@ tables: <%- endif -%> description_processing: <% if attribute == 'share' or attribute == 'popshare' %> - The share of total area or population for each urbanization level was calculated by dividing the area or population of each level (cities, towns and semi-dense areas, rural areas) by the overall total, providing a percentage representation for each category. + The share of total area or population for each urbanization level was calculated by dividing the area or population of each level (cities, towns, villages) by the overall total, providing a percentage representation for each category. <% elif attribute == 'density' %> Population density was calculated by dividing the population of cities by the total area it covers, providing a measure of the number of people living in each km². 
<%- endif -%> diff --git a/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.countries.json b/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.countries.json new file mode 100644 index 00000000000..bdb892107bf --- /dev/null +++ b/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.countries.json @@ -0,0 +1,185 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Angola": "Angola", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Benin": "Benin", + "Bolivia": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cura\u00e7ao": "Curacao", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "French Guiana": "French Guiana", + "French Polynesia": "French Polynesia", + "Gabon": "Gabon", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Guatemala": "Guatemala", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jersey": "Jersey", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Laos": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mayotte": "Mayotte", + "Moldova": "Moldova", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "North Korea": "North Korea", + "North Macedonia": "North Macedonia", + "Northern Cyprus": "Northern Cyprus", + "Norway": "Norway", + "Oman": "Oman", + 
"Pakistan": "Pakistan", + "Palestine": "Palestine", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Romania": "Romania", + "Russia": "Russia", + "Rwanda": "Rwanda", + "R\u00e9union": "Reunion", + "Samoa": "Samoa", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Korea": "South Korea", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syria": "Syria", + "Taiwan": "Taiwan", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United States": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela": "Venezuela", + "Vietnam": "Vietnam", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "M\u00e9xico": "Mexico", + "Republic of the Congo": "Congo", + "S\u00e3o Tom\u00e9 and Pr\u00edncipe": "Sao Tome and Principe" +} \ No newline at end of file diff --git a/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.meta.yml b/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.meta.yml new file mode 100644 index 00000000000..a8d7bc66382 --- /dev/null +++ b/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.meta.yml @@ -0,0 +1,290 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Urbanization + display: + numDecimalPlaces: + 0 + + processing_level: minor + description_key: + - |- + The European Commission integrates satellite imagery with national census data to delineate the boundaries of capital cities and estimate their populations. + To predict future urbanization (2025 and 2030), both static (land features) and dynamic (past satellite images) components are used to project growth. DEGURBA defines cities by population, not administrative borders, aligning with UN guidelines, though fixed thresholds may not always capture local differences. + + desc_short_density_capital: &desc_short_density_capital |- + The number of people per km² of land area for the country's capital city. + desc_short_density_top_100: &desc_short_density_top_100 |- + The number of people per km² of land area for cities ranked among the top 100 most populous in 2020. + + desc_short_pop_capital: &desc_short_pop_capital |- + The total number of people living in the country's capital city. + desc_short_pop_top_100: &desc_short_pop_top_100 |- + The total number of people living in cities ranked among the top 100 most populous in 2020. + desc_short_pop_1mil: &desc_short_pop_1mil |- + The total number of people living in cities with more than 1 million inhabitants. 
+ + desc_short_1m_total : &desc_short_1m_total |- + The percentage of the total population living in cities with more than 1 million inhabitants. + desc_short_1m_urb: &desc_short_1m_urb |- + The percentage of the urban population living in cities with more than 1 million inhabitants. + + desc_processing_density: &desc_processing_density |- + Population density was calculated by dividing the population of the city by the total area it covers, providing a measure of the number of people living in each km². + + entityAnnotationsMapCapitals: &entityAnnotationsMapCapitals |- + Afghanistan: Kabul + Albania: Tirana + Algeria: Algiers + Angola: Luanda + Argentina: Buenos Aires + Armenia: Yerevan + Aruba: Oranjestad + Australia: North Canberra [Canberra] + Austria: Vienna + Azerbaijan: Baku + Bahamas: Nassau + Bahrain: Manama + Bangladesh: Dhaka + Barbados: Bridgetown + Belarus: Minsk + Belgium: Brussels + Benin: Porto-Novo + Bolivia: La Paz + Bosnia and Herzegovina: Sarajevo + Botswana: Gaborone + Brazil: Brasilia + Brunei: Bandar Seri Begawan + Bulgaria: Sofia + Burkina Faso: Ouagadougou + Burundi: Gitega + Cambodia: Phnom Penh + Cameroon: Yaoundé + Canada: Ottawa + Central African Republic: Bangui + Chad: N'Djamena + Chile: Santiago + China: Beijing + Colombia: Bogota + Comoros: Moroni + Costa Rica: San José + Croatia: Zagreb + Cuba: Havana + Curacao: Willemstad + Cyprus: Strovolos [Nicosia] + Czechia: Prague + Cote d'Ivoire: Yamoussoukro + Democratic Republic of Congo: Kinshasa + Denmark: Copenhagen + Djibouti: Djibouti + Dominican Republic: Santo Domingo + Ecuador: Quito + Egypt: Cairo + El Salvador: San Salvador + Equatorial Guinea: Malabo + Eritrea: Asmara + Estonia: Tallinn + Ethiopia: Addis Ababa + Fiji: Suva + Finland: Helsinki + France: Paris + French Guiana: Cayenne + French Polynesia: Papeete + Gabon: Libreville + Georgia: Tbilisi + Germany: Berlin + Ghana: Accra + Greece: Athens + Guatemala: Guatemala City + Guinea-Bissau: Bissau + Guyana: Georgetown + Haiti: Port-au-Prince + Honduras: Tegucigalpa + Hungary: Budapest + Iceland: Reykjavik + India: New Delhi + Indonesia: Jakarta + Iran: Tehran + Iraq: Baghdad + Ireland: Dublin + Israel: Jerusalem + Italy: Rome + Jamaica: Kingston + Japan: Tokyo + Jersey: St. 
Helier + Jordan: Amman + Kazakhstan: Astana + Kenya: Nairobi + Kosovo: Pristina + Kuwait: Kuwait City + Kyrgyzstan: Bishkek + Laos: Vientiane + Latvia: Riga + Lebanon: Beirut + Lesotho: Maseru + Liberia: Monrovia + Libya: Tripoli + Lithuania: Vilnius + Luxembourg: Luxembourg + Madagascar: Antananarivo + Malawi: Lilongwe + Malaysia: Kuala Lumpur + Maldives: Malé + Mali: Bamako + Malta: Valletta + Mauritania: Nouakchott + Mauritius: Port Louis + Mayotte: Mamoudzou + Moldova: Chișinău + Mongolia: Ulaanbaatar + Montenegro: Podgorica + Morocco: Rabat + Mozambique: Maputo + Myanmar: Pyinmana [Nay Pyi Taw] + Mexico: Mexico City + Namibia: Windhoek + Nepal: Kathmandu + Netherlands: Amsterdam + New Caledonia: Nouméa + New Zealand: Wellington + Nicaragua: Managua + Niger: Niamey + Nigeria: Abuja + North Korea: P'yŏngyang + North Macedonia: Skopje + Northern Cyprus: Nicosia + Norway: Oslo + Oman: Muscat + Pakistan: Islamabad + Palestine: Ramallah + Panama: Panama City + Papua New Guinea: Port Moresby + Paraguay: Asuncion + Peru: Lima + Philippines: Manila + Poland: Warsaw + Portugal: Lisbon + Puerto Rico: Bayamón [San Juan] + Qatar: Doha + Congo: Brazzaville + Romania: Bucharest + Russia: Moscow + Rwanda: Kigali + Reunion: Saint-Denis + Samoa: Apia + Saudi Arabia: Riyadh + Senegal: Dakar + Serbia: Belgrade + Sierra Leone: Freetown + Singapore: Singapore + Slovakia: Bratislava + Slovenia: Ljubljana + Solomon Islands: Honiara + Somalia: Mogadishu + South Africa: Cape Town + South Korea: Seoul + South Sudan: Juba + Spain: Madrid + Sri Lanka: Colombo [Sri Jayawardenepura Kotte] + Sudan: Khartoum + Suriname: Paramaribo + Sweden: Stockholm + Switzerland: Bern + Syria: Damascus + Sao Tome and Principe: São Tomé + Taiwan: Taipei + Tajikistan: Dushanbe + Tanzania: Dodoma + Thailand: Bangkok + East Timor: Dili + Togo: Lomé + Tonga: Nuku'alofa + Trinidad and Tobago: Port of Spain + Tunisia: Tunis + Turkey: Ankara + Turkmenistan: Ashgabat + Uganda: Kampala + Ukraine: Kyiv + United Arab Emirates: Abu Dhabi + United Kingdom: London + United States: Washington + Uruguay: Montevideo + Uzbekistan: Tashkent + Vanuatu: Port Vila + Venezuela: Caracas + Vietnam: Hanoi + Yemen: Sana'a + Zambia: Lusaka + Zimbabwe: Harare + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + + +tables: + ghsl_urban_centers: + variables: + + urban_pop_projections: + title: Population of the capital city (projected) + unit: 'people' + description_short: *desc_short_pop_capital + display: + entityAnnotationsMap: *entityAnnotationsMapCapitals + isProjection: true + + urban_density_projections: + title: Population density of the capital city (projected) + unit: 'people/km²' + description_short: *desc_short_density_capital + description_processing: *desc_processing_density + display: + entityAnnotationsMap: *entityAnnotationsMapCapitals + isProjection: true + + urban_pop_estimates: + title: Population of the capital city + unit: 'people' + description_short: *desc_short_pop_capital + display: + entityAnnotationsMap: *entityAnnotationsMapCapitals + + urban_density_estimates: + title: Population density of the capital city + unit: 'people/km²' + description_short: *desc_short_density_capital + description_processing: *desc_processing_density + display: + entityAnnotationsMap: *entityAnnotationsMapCapitals + + urban_density_top_100_projections: + title: Population density of the top 100 most populous cities (projected) + unit: 'people/km²' + 
description_short: *desc_short_density_top_100 + description_processing: *desc_processing_density + display: + isProjection: true + + urban_density_top_100_estimates: + title: Population density of the top 100 most populous cities + unit: 'people/km²' + description_short: *desc_short_density_top_100 + description_processing: *desc_processing_density + + urban_pop_top_100_estimates: + title: Population of the top 100 most populous cities + unit: 'people' + description_short: *desc_short_pop_top_100 + + urban_pop_top_100_projections: + title: Population of the top 100 most populous cities (projected) + unit: 'people' + description_short: *desc_short_pop_top_100 + display: + isProjection: true diff --git a/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.py b/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.py new file mode 100644 index 00000000000..7107170111a --- /dev/null +++ b/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.py @@ -0,0 +1,86 @@ +"""Load a meadow dataset and create a garden dataset.""" +import owid.catalog.processing as pr + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +START_OF_PROJECTIONS = 2025 + +# Regions for which aggregates will be created. +REGIONS = [ + "North America", + "South America", + "Europe", + "Africa", + "Asia", + "Oceania", + "Low-income countries", + "Upper-middle-income countries", + "Lower-middle-income countries", + "High-income countries", +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("ghsl_urban_centers") + # Read table from meadow dataset. + tb = ds_meadow.read("ghsl_urban_centers") + + # Load regions dataset. + ds_regions = paths.load_dataset("regions") + + # Load income groups dataset. + ds_income_groups = paths.load_dataset("income_groups") + + # + # Process data. + # + + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + tb = tb.drop(columns=["urban_center_name", "urban_area"]) + + # Add region aggregates. + tb = geo.add_regions_to_table( + tb, + aggregations={"urban_pop": "sum"}, + regions=REGIONS, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + min_num_values_per_year=1, + ) + + # Split data into estimates and projections. + past_estimates = tb[tb["year"] < START_OF_PROJECTIONS].copy() + future_projections = tb[tb["year"] >= START_OF_PROJECTIONS - 5].copy() + + # Now, for each column, split it into two (projections and estimates). + for col in ["urban_pop", "urban_density", "urban_density_top_100", "urban_pop_top_100"]: + if col not in ["country", "year"]: + past_estimates[f"{col}_estimates"] = tb.loc[tb["year"] < START_OF_PROJECTIONS, col] + future_projections[f"{col}_projections"] = tb.loc[tb["year"] >= START_OF_PROJECTIONS - 5, col] + past_estimates = past_estimates.drop(columns=[col]) + future_projections = future_projections.drop(columns=[col]) + + # Merge past estimates and future projections + tb = pr.merge(past_estimates, future_projections, on=["country", "year"], how="outer") + + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() diff --git a/etl/steps/data/garden/usgs/2024-07-15/historical_statistics_for_mineral_and_material_commodities.py b/etl/steps/data/garden/usgs/2024-07-15/historical_statistics_for_mineral_and_material_commodities.py index 24a3f4080bc..b5fe034233a 100644 --- a/etl/steps/data/garden/usgs/2024-07-15/historical_statistics_for_mineral_and_material_commodities.py +++ b/etl/steps/data/garden/usgs/2024-07-15/historical_statistics_for_mineral_and_material_commodities.py @@ -86,10 +86,13 @@ # NOTE: The following could be mapped to ("Clays", "Mine, fuller's earth"). We decided to remove "Clays". ("Fuller's earth", "Total"): None, ("Gallium", "Total"): ("Gallium", "Refinery"), + ("Gemstones", "Total"): ("Gemstones", "Mine"), ("Germanium", "Total"): ("Germanium", "Refinery"), ("Gold", "Total"): ("Gold", "Mine"), ("Graphite", "Total"): ("Graphite", "Mine"), ("Gypsum", "Total"): ("Gypsum", "Mine"), + # In USGS current data, "Hafnium" is not reported, only "Zirconium and Hafnium". + ("Hafnium", "Total"): None, ("Helium", "Total"): ("Helium", "Mine"), ("Indium", "Total"): ("Indium", "Refinery"), # NOTE: Industrial diamond production includes natural and synthetic diamonds. @@ -97,13 +100,18 @@ ("Industrial diamond", "Total"): ("Diamond", "Mine and synthetic, industrial"), ("Industrial garnet", "Total"): ("Garnet", "Mine"), ("Industrial sand and gravel", "Total"): ("Sand and gravel", "Mine, industrial"), + ("Iodine", "Total"): ("Iodine", "Mine"), ("Iron Oxide Pigments", "Total"): None, + ("Iron and Steel Scrap", "Total"): None, ("Iron and Steel Slag", "Total"): None, ("Iron ore", "Total"): ("Iron ore", "Mine, crude ore"), # NOTE: The following could be mapped to ("Clays", "Mine, kaolin"). We decided to remove "Clays". ("Kaolin", "Total"): None, + ("Kyanite", "Total"): None, ("Lead", "Total"): ("Lead", "Mine"), ("Lime", "Total"): ("Lime", "Processing"), + ("Lithium statistics", "Total"): ("Lithium", "Mine"), + ("Lumber", "Total"): None, ("Magnesium compounds", "Total"): ("Magnesium compounds", "Mine"), ("Magnesium metal", "Total"): ("Magnesium metal", "Smelter"), ("Manganese", "Total"): ("Manganese", "Mine"), @@ -114,23 +122,34 @@ # NOTE: The following could be mapped to ("Clays", "Mine, miscellaneous"). We decided to remove "Clays". ("Miscellaneous clay", "Total"): None, ("Molybdenum", "Total"): ("Molybdenum", "Mine"), + ("Natural & Synthetic Rutile", "Total"): None, ("Nickel", "Total"): ("Nickel", "Mine"), ("Niobium", "Total"): ("Niobium", "Mine"), # NOTE: Extracted from "world_mine_production". ("Niobium", "Mine"): ("Niobium", "Mine"), ("Nitrogen (Fixed)-Ammonia", "Total"): ("Nitrogen", "Fixed ammonia"), + ("Other industrial wood products", "Total"): None, + ("Paper and board", "Total"): None, # NOTE: The following could be mapped to ("Peat", "Mine"). We decided to remove "Peat". ("Peat", "Total"): None, ("Perlite", "Total"): ("Perlite", "Mine"), ("Phosphate rock", "Total"): ("Phosphate rock", "Mine"), ("Pig Iron", "Total"): ("Iron", "Smelter, pig iron"), + # In USGS current data, PGM are broken down into palladium and platinum. 
+ ("Platinum-group metals", "Total"): None, + ("Plywood and veneer", "Total"): None, + ("Potash", "Total"): ("Potash", "Mine"), ("Pumice and Pumicite", "Total"): ("Pumice and pumicite", "Mine"), + ("Quartz crystal", "Total"): None, + ("Rare earths", "Total"): ("Rare earths", "Mine"), + ("Rhenium", "Total"): ("Rhenium", "Mine"), ("Salt", "Total"): ("Salt", "Mine"), ("Selenium", "Total"): ("Selenium", "Refinery"), ("Silicon", "Total"): ("Silicon", "Processing"), ("Silicon carbide", "Total"): None, ("Silver", "Total"): ("Silver", "Mine"), ("Soda ash", "Total"): ("Soda ash", "Natural and synthetic"), + ("Sodium sulfate", "Total"): None, ("Steel", "Total"): ("Steel", "Processing, crude"), ("Strontium", "Total"): ("Strontium", "Mine"), ("Sulfur", "Total"): ("Sulfur", "Processing"), @@ -138,6 +157,8 @@ ("Tantalum", "Total"): ("Tantalum", "Mine"), # NOTE: The following could be mapped to ("Tellurium", "Refinery"). However, we decided to discard Tellurium. ("Tellurium", "Total"): None, + ("Thallium", "Total"): None, + ("Thorium", "Total"): None, ("Tin", "Total"): ("Tin", "Mine"), # NOTE: For titanium there is no global data. ("Titanium dioxide", "Total"): None, @@ -147,9 +168,15 @@ # To avoid confusion, remove this total. ("Total clay", "Total"): None, ("Total manufactured abrasives ", "Total"): None, + ("Total forestry", "Total"): None, ("Tungsten", "Total"): ("Tungsten", "Mine"), ("Vanadium", "Total"): ("Vanadium", "Mine"), + ("Vermiculite", "Total"): None, + ("Wollastonite", "Total"): None, + ("Wood panel products", "Total"): None, ("Zinc", "Total"): ("Zinc", "Mine"), + # In USGS current data, "Hafnium" is not reported, only "Zirconium and Hafnium". + ("Zirconium", "Total"): None, } # Units can either be "metric tonnes" or "metric tonnes of gross weight". @@ -199,6 +226,8 @@ # NOTE: Bismuth is in gross weight for the US, but metal content for the World. # However, data for the US contains only nans and zeros, and will be removed later on. "Bismuth", + "Gemstones", + "Iodine", ] # Footnotes (that will appear in the footer of charts) to add to the flattened tables (production and unit value). @@ -213,6 +242,8 @@ # "production|Chromium|Mine|tonnes": "Values are reported as tonnes of contained chromium.", "production|Cobalt|Refinery|tonnes": "Values are reported as tonnes of cobalt content.", "production|Bismuth|Mine|tonnes": "Values are reported as tonnes of metal content.", + "production|Lithium|Mine|tonnes": "Values are reported as tonnes of lithium content.", + "production|Gemstones|Mine|tonnes": "Values are reported as tonnes of gemstone-quality diamonds.", } FOOTNOTES_UNIT_VALUE = { "unit_value|Silicon|Processing|constant 1998 US$ per tonne": "Values refer to constant 1998 US$ per tonne of silicon content in ferrosilicon or silicon metal.", @@ -325,12 +356,13 @@ def prepare_us_production(tb: Table, tb_metadata: Table) -> Table: # For now, we'll only keep "production". tb_us_production = tb[["commodity", "year", "production", "unit"]].assign(**{"country": "United States"}) # Remove spurious footnotes like "W". + # Also, zirconiummineral_concentrates contains rows of "< 100,000". It's unclear which value should be assigned here, so we will remove these (two) rows. 
tb_us_production["production"] = map_series( tb_us_production["production"], - mapping={"W": None}, + mapping={"W": None, "< 100,000": None}, warn_on_missing_mappings=False, warn_on_unused_mappings=True, - ).astype({"production": float}) + ).astype({"production": "float64[pyarrow]"}) # Add notes to the table, using the extracted metadata. for column in ["production"]: mask = tb_metadata[column].notnull() @@ -490,10 +522,8 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset("historical_statistics_for_mineral_and_material_commodities") # Read tables of data and extracted metadata from meadow dataset. - tb = ds_meadow.read_table("historical_statistics_for_mineral_and_material_commodities") - tb_metadata = ds_meadow.read_table("historical_statistics_for_mineral_and_material_commodities_metadata").astype( - "string" - ) + tb = ds_meadow.read("historical_statistics_for_mineral_and_material_commodities") + tb_metadata = ds_meadow.read("historical_statistics_for_mineral_and_material_commodities_metadata").astype("string") # # Process data. @@ -642,6 +672,40 @@ def run(dest_dir: str) -> None: (tb_flat["country"].isin(["World"])) & (tb_flat["year"].isin([1974, 1999])), "production|Magnesium metal|Smelter|tonnes", ] = None + + # Lithium production in the US is only given until 1954. From then on, the data is "W" (which means withheld to avoid disclosing company proprietary data). To avoid confusion, simply remove all this data. + error = "Expected lithium US production data to end in 1954. Remove this part of the code." + assert ( + tb_flat.loc[ + (tb_flat["country"] == "United States") & (tb_flat["production|Lithium|Mine|tonnes"].notnull()), "year" + ].max() + == 1954 + ), error + tb_flat.loc[(tb_flat["country"] == "United States"), "production|Lithium|Mine|tonnes"] = None + # Similarly, unit value from 1900 to 1951 is only informed in 1936 (and in terms of production, it's only informed in terms of gross weight, not lithium content). This creates a significant decline in unit value in line charts (between 1936 and 1952) which is unclear if it's real or not. To avoid confusion, ignore that data point and start the series in 1952. + error = ( + "Expected lithium unit value data to only be informed in 1936 (prior to 1952). Remove this part of the code." + ) + assert tb_flat[ + (tb_flat["unit_value|Lithium|Mine|constant 1998 US$ per tonne"].notnull()) & (tb_flat["year"] < 1952) + ]["year"].tolist() == [1936], error + tb_flat.loc[(tb_flat["year"] < 1952), "unit_value|Lithium|Mine|constant 1998 US$ per tonne"] = None + + # Gemstones unit values is zero in a range of years. + # The documentation says "Unit value data for 1922–28 were estimated by interpolation of imports value data, and rounded to two significant figures". + # In practice, the unit values for those years are exactly zero, which are probably spurious. + # Remove those zeros. + _years_with_zero_value = [1922, 1923, 1924, 1925, 1926, 1927, 1928] + error = "Expected gemstones unit value to be zero in a range of years. Remove this part of the code." + assert set( + tb_flat.loc[ + (tb_flat["year"].isin(_years_with_zero_value)) & (tb_flat["country"] == "World"), + "unit_value|Gemstones|Mine|constant 1998 US$ per tonne", + ] + ) == {0.0}, error + tb_flat.loc[ + (tb_flat["year"].isin(_years_with_zero_value)), "unit_value|Gemstones|Mine|constant 1998 US$ per tonne" + ] = None #################################################################################################################### # Format tables conveniently. 
diff --git a/etl/steps/data/garden/war/2023-09-21/brecke.meta.yml b/etl/steps/data/garden/war/2023-09-21/brecke.meta.yml index c386e8d7420..dc18bda67ad 100644 --- a/etl/steps/data/garden/war/2023-09-21/brecke.meta.yml +++ b/etl/steps/data/garden/war/2023-09-21/brecke.meta.yml @@ -34,6 +34,7 @@ definitions: number_deaths: description_short: |- + <% set per_capita = "" %> <%- if conflict_type == "all" -%> The estimated number of deaths<< per_capita >> in ongoing armed conflicts that year. diff --git a/etl/steps/data/garden/war/2023-09-21/cow.meta.yml b/etl/steps/data/garden/war/2023-09-21/cow.meta.yml index c0831948820..4fe167ba54f 100644 --- a/etl/steps/data/garden/war/2023-09-21/cow.meta.yml +++ b/etl/steps/data/garden/war/2023-09-21/cow.meta.yml @@ -84,6 +84,7 @@ definitions: number_deaths: description_short: |- + <% set per_capita = "" %> <%- if conflict_type == "all" -%> Included are deaths of combatants due to fighting, disease, and starvation in wars that were ongoing that year<< per_capita >>. diff --git a/etl/steps/data/garden/war/2023-09-21/cow.py b/etl/steps/data/garden/war/2023-09-21/cow.py index e44109e2016..540f00d271d 100644 --- a/etl/steps/data/garden/war/2023-09-21/cow.py +++ b/etl/steps/data/garden/war/2023-09-21/cow.py @@ -681,6 +681,7 @@ def make_table_intra(tb: Table) -> Table: tb.groupby("warnum")["intnl"].nunique().max() == 1 ), "An intra-state conflict is not expected to change between international / non-international!" mask = tb["intnl"] == 1 + tb["conflict_type"] = tb["conflict_type"].astype(object) tb.loc[mask, "conflict_type"] = CTYPE_INTRA_INTL tb.loc[-mask, "conflict_type"] = CTYPE_INTRA_NINTL diff --git a/etl/steps/data/garden/war/2023-09-21/mars.meta.yml b/etl/steps/data/garden/war/2023-09-21/mars.meta.yml index 952866fb2af..56b433fa3ee 100644 --- a/etl/steps/data/garden/war/2023-09-21/mars.meta.yml +++ b/etl/steps/data/garden/war/2023-09-21/mars.meta.yml @@ -41,6 +41,7 @@ definitions: number_deaths: description_short: |- + <% set per_capita = "" %> <%- if conflict_type == "all" -%> The << estimate >> estimate of the number of deaths<< per_capita >> in all ongoing conventional wars that year. diff --git a/etl/steps/data/garden/war/2023-09-21/mie.meta.yml b/etl/steps/data/garden/war/2023-09-21/mie.meta.yml index 597067acb05..99355425c04 100644 --- a/etl/steps/data/garden/war/2023-09-21/mie.meta.yml +++ b/etl/steps/data/garden/war/2023-09-21/mie.meta.yml @@ -13,19 +13,19 @@ definitions: force_usage: &force_usage |- A threat of force can be unspecified, or entail threatening to declare war or to occupy territory; a display of force can entail shows of force or border violations; a use of force can entail attacks, clashes, or battles. 
conflict_type: - <%- if hostility == "all" -%> + <%- if hostility_level == "all" -%> Included are conflicts between states where force was threatened, displayed, used, or escalated to a war that year - <%- elif hostility == "Threat to use force" -%> + <%- elif hostility_level == "Threat to use force" -%> Included are conflicts between states where, at most, use of force was threatened that year - <%- elif hostility == "Display of force" -%> + <%- elif hostility_level == "Display of force" -%> Included are conflicts between states where, at most, force was displayed that year - <%- elif hostility == "Use of force" -%> + <%- elif hostility_level == "Use of force" -%> Included are conflicts between states where, at most, force was used that year, but there were fewer than 1,000 combatant deaths - <%- elif hostility == "War" -%> + <%- elif hostility_level == "War" -%> Included are interstate wars that were ongoing that year <%- endif -%> @@ -41,10 +41,10 @@ definitions: number_new_conflicts: description_short: |- - <%- if hostility == "Use of force" -%> + <%- if hostility_level == "Use of force" -%> Included are conflicts between states where, at most, force was used that year for the first time, but there were fewer than 1,000 combatant deaths. - <%- elif hostility == "War" -%> + <%- elif hostility_level == "War" -%> Included are interstate wars that started that year. <% else %> @@ -60,6 +60,7 @@ definitions: number_deaths: description_short: |- + <% set per_capita = "" %> The << estimate >> estimate of the number of deaths of combatants due to fighting between states that year<< per_capita >>. description_short_per_capita: <% set per_capita = ", per 100,000 people" %> diff --git a/etl/steps/data/garden/war/2023-09-21/prio_v31.meta.yml b/etl/steps/data/garden/war/2023-09-21/prio_v31.meta.yml index b434cdfee05..0659590c690 100644 --- a/etl/steps/data/garden/war/2023-09-21/prio_v31.meta.yml +++ b/etl/steps/data/garden/war/2023-09-21/prio_v31.meta.yml @@ -173,6 +173,7 @@ tables: unit: deaths description_short: |- <% set estimate = "high" %> + <% set per_capita = "" %> {definitions.number_deaths.description_short} description_key: *description_key_deaths @@ -181,6 +182,7 @@ tables: unit: deaths description_short: |- <% set estimate = "low" %> + <% set per_capita = "" %> {definitions.number_deaths.description_short} description_key: *description_key_deaths @@ -189,6 +191,7 @@ tables: unit: deaths description_short: |- <% set estimate = "best" %> + <% set per_capita = "" %> {definitions.number_deaths.description_short} description_key: *description_key_deaths @@ -197,6 +200,7 @@ tables: unit: deaths per 100,000 people description_short: |- <% set estimate = "high" %> + <% set per_capita = ", per 100,000 people" %> {definitions.number_deaths.description_short} description_key: *description_key_deaths display: @@ -207,6 +211,7 @@ tables: unit: deaths per 100,000 people description_short: |- <% set estimate = "low" %> + <% set per_capita = ", per 100,000 people" %> {definitions.number_deaths.description_short} description_key: *description_key_deaths display: @@ -217,6 +222,7 @@ tables: unit: deaths per 100,000 people description_short: |- <% set estimate = "best" %> + <% set per_capita = ", per 100,000 people" %> {definitions.number_deaths.description_short} description_key: *description_key_deaths @@ -227,6 +233,7 @@ tables: title: Number of ongoing conflicts unit: conflicts description_short: |- + <% set per_capita = "" %> {definitions.number_ongoing_conflicts.description_short}. 
description_key: *description_key_ongoing @@ -234,6 +241,7 @@ tables: title: Number of ongoing conflicts per state unit: conflicts per state description_short: |- + <% set per_capita = "" %> The number of conflicts divided by the number of all states. This accounts for the changing number of states over time. {definitions.number_ongoing_conflicts.description_short} description_key: *description_key_ongoing display: @@ -243,6 +251,7 @@ tables: title: Number of ongoing conflicts per state-pair unit: conflicts per state-pair description_short: |- + <% set per_capita = "" %> The number of conflicts divided by the number of all state-pairs. This accounts for the changing number of states over time. {definitions.number_ongoing_conflicts.description_short} description_key: *description_key_ongoing display: @@ -255,6 +264,7 @@ tables: title: Number of new conflicts unit: conflicts description_short: |- + <% set per_capita = "" %> {definitions.number_new_conflicts.description_short} description_key: *description_key_new @@ -262,6 +272,7 @@ tables: title: Number of new conflicts per state unit: conflicts per state description_short: |- + <% set per_capita = "" %> The number of conflicts divided by the number of all states. This accounts for the changing number of states over time. {definitions.number_new_conflicts.description_short} description_key: *description_key_new display: @@ -271,6 +282,7 @@ tables: title: Number of new conflicts per state-pair unit: conflicts per state-pair description_short: |- + <% set per_capita = "" %> The number of conflicts divided by the number of all state-pairs. This accounts for the changing number of states over time. {definitions.number_new_conflicts.description_short} description_key: *description_key_new display: diff --git a/etl/steps/data/garden/war/2023-09-21/ucdp.meta.yml b/etl/steps/data/garden/war/2023-09-21/ucdp.meta.yml index fa0e4ede96d..26fc92220f4 100644 --- a/etl/steps/data/garden/war/2023-09-21/ucdp.meta.yml +++ b/etl/steps/data/garden/war/2023-09-21/ucdp.meta.yml @@ -49,6 +49,7 @@ definitions: # Fields used for number of deaths indicators number_deaths: description_short: |- + <% set per_capita = "" %> <%- if conflict_type == "all" -%> The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in interstate, intrastate, extrasystemic, non-state conflicts, and one-sided violence that were ongoing that year<< per_capita >>. diff --git a/etl/steps/data/garden/war/2023-09-21/ucdp_prio.meta.yml b/etl/steps/data/garden/war/2023-09-21/ucdp_prio.meta.yml index f1686d8e68f..34960bea396 100644 --- a/etl/steps/data/garden/war/2023-09-21/ucdp_prio.meta.yml +++ b/etl/steps/data/garden/war/2023-09-21/ucdp_prio.meta.yml @@ -2,6 +2,7 @@ definitions: all: description_short: |- + <% set per_capita = "" %> <%- if conflict_type == "all" -%> The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in interstate, intrastate, extrasystemic, non-state conflicts, and one-sided violence that were ongoing that year<< per_capita >>. 
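
The `<% set per_capita = "" %>` lines added throughout these war meta files all address the same issue: the shared death-count templates interpolate `<< per_capita >>`, so the variable has to be defined even for the non-per-capita indicators, while the `description_short_per_capita` variants set it to ", per 100,000 people". A minimal sketch of the mechanism, assuming jinja2-style rendering with these delimiters (whether the ETL uses StrictUndefined is an assumption here):

    from jinja2 import Environment, StrictUndefined

    env = Environment(
        block_start_string="<%", block_end_string="%>",
        variable_start_string="<<", variable_end_string=">>",
        undefined=StrictUndefined,
    )

    text = "The estimated number of deaths<< per_capita >> in ongoing armed conflicts that year."
    print(env.from_string('<% set per_capita = "" %>' + text).render())     # renders without suffix
    print(env.from_string(text).render(per_capita=", per 100,000 people"))  # per-capita variant
    # Without either, a strict environment raises UndefinedError on << per_capita >>.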
diff --git a/etl/steps/data/garden/war/2024-01-11/nuclear_weapons_proliferation.py b/etl/steps/data/garden/war/2024-01-11/nuclear_weapons_proliferation.py index a17543b7ca3..59e4edf222b 100644 --- a/etl/steps/data/garden/war/2024-01-11/nuclear_weapons_proliferation.py +++ b/etl/steps/data/garden/war/2024-01-11/nuclear_weapons_proliferation.py @@ -52,6 +52,7 @@ def run(dest_dir: str) -> None: .groupby(["status", "year"], as_index=False) .count() .pivot(index="year", columns="status", join_column_levels_with="_") + .rename(columns={"year_": "year"}) ) # Rename columns conveniently. diff --git a/etl/steps/data/garden/war/2024-08-26/shared.py b/etl/steps/data/garden/war/2024-08-26/shared.py index e5cc8dff0ea..e72e85e5b6d 100644 --- a/etl/steps/data/garden/war/2024-08-26/shared.py +++ b/etl/steps/data/garden/war/2024-08-26/shared.py @@ -157,8 +157,8 @@ def get_number_of_countries_in_conflict_by_region(tb: Table, dimension_name: str # Complement with missing entries tb_num_participants = expand_time_column( tb_num_participants, - ["country", dimension_name], - "year", + dimension_col=["country", dimension_name], + time_col="year", method="full_range", fillna_method="zero", ) diff --git a/etl/steps/data/garden/war/2024-08-26/ucdp.meta.yml b/etl/steps/data/garden/war/2024-08-26/ucdp.meta.yml index 5a54ed362c9..f5f8701bbaf 100644 --- a/etl/steps/data/garden/war/2024-08-26/ucdp.meta.yml +++ b/etl/steps/data/garden/war/2024-08-26/ucdp.meta.yml @@ -49,6 +49,7 @@ definitions: # Fields used for number of deaths indicators number_deaths: description_short: |- + <% set per_capita = "" %> <%- if conflict_type == "all" -%> The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in interstate, intrastate, extrasystemic, non-state conflicts, and one-sided violence that were ongoing that year<< per_capita >>. @@ -78,6 +79,7 @@ definitions: number_deaths_type: description_short: |- + <% set per_capita = "" %> <%- if conflict_type == "all" -%> The best estimate of the number of deaths of << people_type >> in interstate, intrastate, extrasystemic, non-state conflicts, and one-sided violence that were ongoing that year<< per_capita >>. @@ -91,7 +93,7 @@ definitions: The best estimate of the number of deaths of << people_type >> in non-internationalized intrastate conflicts that were ongoing that year<< per_capita >>. <%- elif conflict_type == "one-sided violence" -%> - The << estimate >> estimate of the number of deaths of << people_type >> from one-sided violence that was ongoing that year<< per_capita >>. + The << estimate | default('') >> estimate of the number of deaths of << people_type >> from one-sided violence that was ongoing that year<< per_capita >>. <%- elif conflict_type == "non-state conflict" -%> The best estimate of the number of deaths of << people_type >> in non-state conflicts that were ongoing that year<< per_capita >>. diff --git a/etl/steps/data/garden/war/2024-08-26/ucdp.py b/etl/steps/data/garden/war/2024-08-26/ucdp.py index 3e9be336fbc..3cda91c4a90 100644 --- a/etl/steps/data/garden/war/2024-08-26/ucdp.py +++ b/etl/steps/data/garden/war/2024-08-26/ucdp.py @@ -1,6 +1,12 @@ """Data from UCDP. +IMPORTANT NOTE: + + Changes in this script should probably be reflected also in the script for ucdp_preview! + + At some point we should merge the tools from both scripts to avoid duplication. + Notes: - Conflict types for state-based violence is sourced from UCDP/PRIO dataset. non-state and one-sided violence is sourced from GED dataset. 
- There can be some mismatches with latest official reported data (UCDP's live dashboard). This is because UCDP uses latest data for their dashboard, which might not be available yet as bulk download. @@ -79,13 +85,13 @@ def run(dest_dir: str) -> None: # Read table from GW codes ds_gw = paths.load_dataset("gleditsch") - tb_regions = ds_gw.read_table("gleditsch_regions") + tb_regions = ds_gw.read("gleditsch_regions") tb_codes = ds_gw["gleditsch_countries"] # Load maps table short_name = "nat_earth_110" ds_maps = paths.load_dataset(short_name) - tb_maps = ds_maps.read_table(short_name) + tb_maps = ds_maps.read(short_name) # Load population ds_population = paths.load_dataset("population") @@ -98,7 +104,7 @@ def run(dest_dir: str) -> None: # Load relevant tables tb_ged = ( - ds_meadow.read_table("ucdp_ged") + ds_meadow.read("ucdp_ged") .reset_index() .astype( { @@ -113,7 +119,7 @@ def run(dest_dir: str) -> None: ) ) tb_conflict = ( - ds_meadow.read_table("ucdp_battle_related_conflict") + ds_meadow.read("ucdp_battle_related_conflict") .reset_index() .astype( { @@ -123,14 +129,14 @@ def run(dest_dir: str) -> None: } ) ) - tb_prio = ds_meadow.read_table("ucdp_prio_armed_conflict") + tb_prio = ds_meadow.read("ucdp_prio_armed_conflict") # Keep only active conflicts paths.log.info("keep active conflicts") tb_ged = tb_ged.loc[tb_ged["active_year"] == 1] # Change region named "Asia" to "Asia and Oceania" (in GED) - tb_ged["region"] = tb_ged["region"].cat.rename_categories({"Asia": "Asia and Oceania"}) + tb_ged["region"] = tb_ged["region"].replace({"Asia": "Asia and Oceania"}) # Create `conflict_type` column paths.log.info("add field `conflict_type`") @@ -158,15 +164,15 @@ def run(dest_dir: str) -> None: paths.log.info("replace missing data with zeros (where applicable)") tb_prio = expand_time_column( tb_prio, - ["region", "conflict_type"], - "year", + dimension_col=["region", "conflict_type"], + time_col="year", method="full_range", fillna_method="zero", ) tb = expand_time_column( tb, - ["region", "conflict_type"], - "year", + dimension_col=["region", "conflict_type"], + time_col="year", method="full_range", fillna_method="zero", ) @@ -350,8 +356,9 @@ def add_conflict_type(tb_ged: Table, tb_conflict: Table) -> Table: # Create `conflict_type` column as a combination of `type_of_violence` and `type_of_conflict`. 
tb_ged["conflict_type"] = ( tb_ged["type_of_conflict"] + .astype(object) .replace(TYPE_OF_CONFLICT_MAPPING) - .fillna(tb_ged["type_of_violence"].replace(TYPE_OF_VIOLENCE_MAPPING)) + .fillna(tb_ged["type_of_violence"].astype(object).replace(TYPE_OF_VIOLENCE_MAPPING)) ) # Sanity check @@ -917,7 +924,7 @@ def estimate_metrics_participants_prio(tb_prio: Table, tb_codes: Table) -> Table tb_country["participated_in_conflict"].m.origins = tb_prio["gwno_a"].m.origins # Format conflict tyep - tb_country["conflict_type"] = tb_country["type_of_conflict"].replace(TYPE_OF_CONFLICT_MAPPING) + tb_country["conflict_type"] = tb_country["type_of_conflict"].astype(object).replace(TYPE_OF_CONFLICT_MAPPING) tb_country = tb_country.drop(columns=["type_of_conflict"]) # Prepare GW table @@ -1083,8 +1090,8 @@ def estimate_metrics_locations(tb: Table, tb_maps: Table, tb_codes: Table, ds_po # Fill with zeroes tb_locations_country = expand_time_column( tb_locations_country, - ["country", "conflict_type"], - "year", + dimension_col=["country", "conflict_type"], + time_col="year", method="full_range", fillna_method="zero", ) diff --git a/etl/steps/data/garden/war/2024-08-26/ucdp_prio.meta.yml b/etl/steps/data/garden/war/2024-08-26/ucdp_prio.meta.yml index 33abbd66846..640ad6560c8 100644 --- a/etl/steps/data/garden/war/2024-08-26/ucdp_prio.meta.yml +++ b/etl/steps/data/garden/war/2024-08-26/ucdp_prio.meta.yml @@ -2,6 +2,7 @@ definitions: all: description_short: |- + <% set per_capita = "" %> <%- if conflict_type == "all" -%> The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in interstate, intrastate, extrasystemic, non-state conflicts, and one-sided violence that were ongoing that year<< per_capita >>. diff --git a/etl/steps/data/garden/war/2024-10-02/shared.py b/etl/steps/data/garden/war/2024-10-02/shared.py index e5cc8dff0ea..e72e85e5b6d 100644 --- a/etl/steps/data/garden/war/2024-10-02/shared.py +++ b/etl/steps/data/garden/war/2024-10-02/shared.py @@ -157,8 +157,8 @@ def get_number_of_countries_in_conflict_by_region(tb: Table, dimension_name: str # Complement with missing entries tb_num_participants = expand_time_column( tb_num_participants, - ["country", dimension_name], - "year", + dimension_col=["country", dimension_name], + time_col="year", method="full_range", fillna_method="zero", ) diff --git a/etl/steps/data/garden/war/2024-10-02/ucdp_monthly.meta.yml b/etl/steps/data/garden/war/2024-10-02/ucdp_monthly.meta.yml index da1b751d200..568e6fd686d 100644 --- a/etl/steps/data/garden/war/2024-10-02/ucdp_monthly.meta.yml +++ b/etl/steps/data/garden/war/2024-10-02/ucdp_monthly.meta.yml @@ -49,6 +49,7 @@ definitions: # Fields used for number of deaths indicators number_deaths: description_short: |- + <% set per_capita = "" %> <%- if conflict_type == "all" -%> The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in interstate, intrastate, extrasystemic, non-state conflicts, and one-sided violence that were ongoing that year<< per_capita >>. @@ -78,6 +79,7 @@ definitions: number_deaths_type: description_short: |- + <% set per_capita = "" %> <%- if conflict_type == "all" -%> The best estimate of the number of deaths of << people_type >> in interstate, intrastate, extrasystemic, non-state conflicts, and one-sided violence that were ongoing that year<< per_capita >>. 
diff --git a/etl/steps/data/garden/war/2024-10-02/ucdp_monthly.py b/etl/steps/data/garden/war/2024-10-02/ucdp_monthly.py index 7610cfa3ae2..2cad268e730 100644 --- a/etl/steps/data/garden/war/2024-10-02/ucdp_monthly.py +++ b/etl/steps/data/garden/war/2024-10-02/ucdp_monthly.py @@ -83,13 +83,13 @@ def run(dest_dir: str) -> None: # Read table from GW codes ds_gw = paths.load_dataset("gleditsch") - tb_regions = ds_gw.read_table("gleditsch_regions") + tb_regions = ds_gw.read("gleditsch_regions") tb_codes = ds_gw["gleditsch_countries"] # Load maps table short_name = "nat_earth_110" ds_maps = paths.load_dataset(short_name) - tb_maps = ds_maps.read_table(short_name) + tb_maps = ds_maps.read(short_name) # Load population ds_population = paths.load_dataset("population") @@ -102,7 +102,7 @@ def run(dest_dir: str) -> None: # Load relevant tables tb_ged = ( - ds_meadow.read_table("ucdp_ged") + ds_meadow.read("ucdp_ged") .reset_index() .astype( { @@ -117,7 +117,7 @@ def run(dest_dir: str) -> None: ) ) tb_ced = ( - ds_ced.read_table("ucdp_ced") + ds_ced.read("ucdp_ced") .reset_index() .astype( { @@ -132,7 +132,7 @@ def run(dest_dir: str) -> None: ) ) tb_conflict = ( - ds_meadow.read_table("ucdp_battle_related_conflict") + ds_meadow.read("ucdp_battle_related_conflict") .reset_index() .astype( { @@ -142,7 +142,7 @@ def run(dest_dir: str) -> None: } ) ) - tb_prio = ds_meadow.read_table("ucdp_prio_armed_conflict") + tb_prio = ds_meadow.read("ucdp_prio_armed_conflict") # Extend codes to have data for latest years tb_codes = extend_latest_years(tb_codes) @@ -159,7 +159,7 @@ def run(dest_dir: str) -> None: tb_ged = tb_ged.loc[tb_ged["active_year"] == 1] # Change region named "Asia" to "Asia and Oceania" (in GED) - tb_ged["region"] = tb_ged["region"].cat.rename_categories({"Asia": "Asia and Oceania"}) + tb_ged["region"] = tb_ged["region"].replace({"Asia": "Asia and Oceania"}) # Create `conflict_type` column paths.log.info("add field `conflict_type`") @@ -187,15 +187,15 @@ def run(dest_dir: str) -> None: paths.log.info("replace missing data with zeros (where applicable)") tb_prio = expand_time_column( tb_prio, - ["region", "conflict_type"], - "year", + dimension_col=["region", "conflict_type"], + time_col="year", method="full_range", fillna_method="zero", ) tb = expand_time_column( tb, - ["region", "conflict_type"], - "year", + dimension_col=["region", "conflict_type"], + time_col="year", method="full_range", fillna_method="zero", ) @@ -365,6 +365,7 @@ def add_conflict_type(tb_ged: Table, tb_conflict: Table) -> Table: ) # Fill unknown types of violence mask = tb_ged["type_of_violence"] == 1 # these are state-based conflicts + tb_ged["type_of_conflict"] = tb_ged["type_of_conflict"].astype(object) tb_ged.loc[mask, "type_of_conflict"] = tb_ged.loc[mask, "type_of_conflict"].fillna("state-based (unknown)") # Assert that `type_of_conflict` was only added for state-based events @@ -379,8 +380,9 @@ def add_conflict_type(tb_ged: Table, tb_conflict: Table) -> Table: # Create `conflict_type` column as a combination of `type_of_violence` and `type_of_conflict`. 
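+    # NOTE: cast to object before replacing/filling; a categorical column would reject
+    # values that are not among its existing categories.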
tb_ged["conflict_type"] = ( tb_ged["type_of_conflict"] + .astype(object) .replace(TYPE_OF_CONFLICT_MAPPING) - .fillna(tb_ged["type_of_violence"].replace(TYPE_OF_VIOLENCE_MAPPING)) + .fillna(tb_ged["type_of_violence"].astype(object).replace(TYPE_OF_VIOLENCE_MAPPING)) ) # Sanity check @@ -949,7 +951,7 @@ def estimate_metrics_participants_prio(tb_prio: Table, tb_codes: Table) -> Table tb_country["participated_in_conflict"].m.origins = tb_prio["gwno_a"].m.origins # Format conflict tyep - tb_country["conflict_type"] = tb_country["type_of_conflict"].replace(TYPE_OF_CONFLICT_MAPPING) + tb_country["conflict_type"] = tb_country["type_of_conflict"].astype(object).replace(TYPE_OF_CONFLICT_MAPPING) tb_country = tb_country.drop(columns=["type_of_conflict"]) # Prepare GW table @@ -1116,8 +1118,8 @@ def estimate_metrics_locations(tb: Table, tb_maps: Table, tb_codes: Table, ds_po # Fill with zeroes tb_locations_country = expand_time_column( tb_locations_country, - ["country", "conflict_type"], - "year", + dimension_col=["country", "conflict_type"], + time_col="year", method="full_range", fillna_method="zero", ) diff --git a/etl/steps/data/garden/war/2024-11-22/shared.py b/etl/steps/data/garden/war/2024-11-22/shared.py new file mode 100644 index 00000000000..e72e85e5b6d --- /dev/null +++ b/etl/steps/data/garden/war/2024-11-22/shared.py @@ -0,0 +1,190 @@ +from typing import List, Optional + +import numpy as np +import owid.catalog.processing as pr +from owid.catalog import Table + +from etl.data_helpers.misc import expand_time_column + + +def add_indicators_extra( + tb: Table, + tb_regions: Table, + columns_conflict_rate: Optional[List[str]] = None, + columns_conflict_mortality: Optional[List[str]] = None, +) -> Table: + """Scale original columns to obtain new indicators (conflict rate and conflict mortality indicators). + + CONFLICT RATE: + Scale columns `columns_conflict_rate` based on the number of countries (and country-pairs) in each region and year. + + For each indicator listed in `columns_to_scale`, two new columns are added to the table: + - `{indicator}_per_country`: the indicator value divided by the number of countries in the region and year. + - `{indicator}_per_country_pair`: the indicator value divided by the number of country-pairs in the region and year. + + CONFLICT MORTALITY: + Scale columns `columns_conflict_mortality` based on the population in each region. + + For each indicator listed in `columns_to_scale`, a new column is added to the table: + - `{indicator}_per_capita`: the indicator value divided by the number of countries in the region and year. + + + tb: Main table + tb_regions: Table with three columns: "year", "region", "num_countries". Gives the number of countries per region per year. + columns_to_scale: List with the names of the columns that need scaling. E.g. 
number_ongiong_conflicts -> number_ongiong_conflicts_per_country + """ + tb_regions_ = tb_regions.copy() + + # Sanity check 1: columns as expected in tb_regions + assert set(tb_regions_.columns) == { + "year", + "region", + "number_countries", + "population", + }, f"Invalid columns in tb_regions {tb_regions_.columns}" + # Sanity check 2: regions equivalent in both tables + regions_main = set(tb["region"]) + regions_aux = set(tb_regions_["region"]) + assert regions_main == regions_aux, f"Regions in main table and tb_regions differ: {regions_main} vs {regions_aux}" + + # Ensure full precision + tb_regions_["number_countries"] = tb_regions_["number_countries"].astype(float) + tb_regions_["population"] = tb_regions_["population"] # .astype(float) + # Get number of country-pairs + tb_regions_["number_country_pairs"] = ( + tb_regions_["number_countries"] * (tb_regions_["number_countries"] - 1) / 2 + ).astype(int) + + # Add number of countries and number of country pairs to main table + tb = tb.merge(tb_regions_, on=["year", "region"], how="left") + + if not columns_conflict_rate and not columns_conflict_mortality: + raise ValueError( + "Call to function is useless. Either provide `columns_conflict_rate` or `columns_conflict_mortality`." + ) + + # CONFLICT RATES ########### + if columns_conflict_rate: + # Add normalised indicators + for column_name in columns_conflict_rate: + # Add per country indicator + column_name_new = f"{column_name}_per_country" + tb[column_name_new] = (tb[column_name].astype(float) / tb["number_countries"].astype(float)).replace( + [np.inf, -np.inf], np.nan + ) + # Add per country-pair indicator + column_name_new = f"{column_name}_per_country_pair" + tb[column_name_new] = (tb[column_name].astype(float) / tb["number_country_pairs"].astype(float)).replace( + [np.inf, -np.inf], np.nan + ) + + # CONFLICT MORTALITY ########### + if columns_conflict_mortality: + # Add normalised indicators + for column_name in columns_conflict_mortality: + # Add per country indicator + column_name_new = f"{column_name}_per_capita" + tb[column_name_new] = ( + (100000 * tb[column_name].astype(float) / tb["population"]) + .replace([np.inf, -np.inf], np.nan) + .astype(float) + ) + + # Drop intermediate columns + tb = tb.drop(columns=["number_countries", "number_country_pairs", "population"]) + + return tb + + +def aggregate_conflict_types( + tb: Table, + parent_name: str, + children_names: Optional[List[str]] = None, + columns_to_aggregate: Optional[List[str]] = None, + columns_to_aggregate_absolute: Optional[List[str]] = None, + columns_to_groupby: Optional[List[str]] = None, + dim_name: str = "conflict_type", +) -> Table: + """Aggregate metrics in broader conflict types.""" + if columns_to_aggregate is None: + columns_to_aggregate = ["participated_in_conflict"] + if columns_to_groupby is None: + columns_to_groupby = ["year", "country", "id"] + if columns_to_aggregate_absolute is None: + columns_to_aggregate_absolute = [] + if children_names is None: + tb_agg = tb.copy() + else: + tb_agg = tb[tb[dim_name].isin(children_names)].copy() + # Obtain summations + tb_agg = tb_agg.groupby(columns_to_groupby, as_index=False).agg({col: sum for col in columns_to_aggregate}) + # Threshold to 1 for binary columns + threshold_upper = 1 + for col in columns_to_aggregate: + if col not in columns_to_aggregate_absolute: + tb_agg[col] = tb_agg[col].apply(lambda x: min(x, threshold_upper)) + # Add conflict type + tb_agg[dim_name] = parent_name + + # Combine + tb = pr.concat([tb, tb_agg], ignore_index=True) + return tb 
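+
+# Example (hypothetical values): summing the binary `participated_in_conflict` flags of
+# "intrastate (internationalized)" and "intrastate (non-internationalized)" rows can yield 2;
+# the threshold above clips the sum back to 1 so the parent "intrastate" flag stays binary.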
+ + +def get_number_of_countries_in_conflict_by_region(tb: Table, dimension_name: str) -> Table: + """Get the number of countries participating in conflicts by region.""" + # Add region + tb_num_participants = add_region_from_code(tb) + tb_num_participants = tb_num_participants.drop(columns=["country"]).rename(columns={"region": "country"}) + + # Sanity check + assert not tb_num_participants["id"].isna().any(), "Some countries with NaNs!" + tb_num_participants = tb_num_participants.drop(columns=["id"]) + + # Groupby sum (regions) + tb_num_participants = tb_num_participants.groupby(["country", dimension_name, "year"], as_index=False)[ + "participated_in_conflict" + ].sum() + # Groupby sum (world) + tb_num_participants_world = tb_num_participants.groupby([dimension_name, "year"], as_index=False)[ + "participated_in_conflict" + ].sum() + tb_num_participants_world["country"] = "World" + # Combine + tb_num_participants = pr.concat([tb_num_participants, tb_num_participants_world], ignore_index=True) + tb_num_participants = tb_num_participants.rename(columns={"participated_in_conflict": "number_participants"}) + + # Complement with missing entries + tb_num_participants = expand_time_column( + tb_num_participants, + dimension_col=["country", dimension_name], + time_col="year", + method="full_range", + fillna_method="zero", + ) + + return tb_num_participants + + +def add_region_from_code(tb: Table, col_code: str = "id") -> Table: + """Add region to table based on code (gw, cow, isd).""" + + def _code_to_region_gw(code: int) -> str: + """Convert code to region name.""" + match code: + case c if 2 <= c <= 199: + return "Americas" + case c if 200 <= c <= 399: + return "Europe" + case c if 400 <= c <= 626: + return "Africa" + case c if 630 <= c <= 699: + return "Middle East" + case c if 700 <= c <= 999: + return "Asia and Oceania" + case _: + raise ValueError(f"Invalid GW code: {code}") + + tb_ = tb.copy() + tb_["region"] = tb_[col_code].apply(_code_to_region_gw) + return tb_ diff --git a/etl/steps/data/garden/war/2024-11-22/ucdp_preview.meta.yml b/etl/steps/data/garden/war/2024-11-22/ucdp_preview.meta.yml new file mode 100644 index 00000000000..cc976a6b225 --- /dev/null +++ b/etl/steps/data/garden/war/2024-11-22/ucdp_preview.meta.yml @@ -0,0 +1,596 @@ +definitions: + common: + presentation: + topic_tags: + - War & Peace + display: + numDecimalPlaces: 0 + + all: + # Explanation of each conflict type + conflict_type_base: |- + This includes combatant and civilian deaths due to fighting + conflict_type: |- + <%- if conflict_type == "all" -%> + An armed conflict is a disagreement between organized groups, or between one organized group and civilians, that causes at least 25 deaths during a year. {definitions.all.conflict_type_base}. + + <%- elif conflict_type == "state-based" -%> + A state-based conflict is a conflict between two armed groups, at least one of which is a state, that causes at least 25 deaths during a year. {definitions.all.conflict_type_base}. + + <%- elif conflict_type == "interstate" -%> + An interstate conflict is a conflict between states that causes at least 25 deaths during a year. {definitions.all.conflict_type_base}. + + <%- elif conflict_type == "intrastate" -%> + An intrastate conflict is a conflict between a state and a non-state armed group that causes at least 25 deaths during a year. {definitions.all.conflict_type_base}. If a foreign state is involved, it is called "internationalized", and "non-internationalized" otherwise. 
+
+      <%- elif conflict_type == "intrastate (internationalized)" -%>
+      An internationalized intrastate conflict is a conflict between a state and a non-state armed group, with involvement of a foreign state, that causes at least 25 deaths during a year. {definitions.all.conflict_type_base}.
+
+      <%- elif conflict_type == "intrastate (non-internationalized)" -%>
+      A non-internationalized intrastate conflict is a conflict between a state and a non-state armed group, without involvement of a foreign state, that causes at least 25 deaths during a year. {definitions.all.conflict_type_base}.
+
+      <%- elif conflict_type == "extrasystemic" -%>
+      An extrasystemic conflict is a conflict between a state and a non-state armed group outside its territory that causes at least 25 deaths during a year. {definitions.all.conflict_type_base}.
+
+      <%- elif conflict_type == "non-state conflict" -%>
+      A non-state conflict is a conflict between non-state armed groups, such as rebel groups, criminal organizations, or ethnic groups, that causes at least 25 deaths during a year. {definitions.all.conflict_type_base}.
+
+      <%- elif conflict_type == "one-sided violence" -%>
+      One-sided violence is the use of armed force by a state or non-state armed group against civilians that causes at least 25 civilian deaths during a year.
+
+      <%- endif -%>
+    location_conflicts_method: |-
+      UCDP provides geographical coordinates of each conflict event. We have mapped these coordinates to countries by means of the Natural Earth dataset.
+
+      In some instances, the event's coordinates fall within the borders of a country. Other times, the event's coordinates fall outside the borders of a country. In the latter case, we have mapped the event to the country that is closest to the event's coordinates.
+
+      Conflict event with id "53238" and relid "PAK-2003-1-345-88" was assigned to "Siachen Glacier" by Natural Earth. We have mapped it to "Pakistan" following the text in the `where_description` field from the UCDP data, which refers to "Giang sector in Siachen, Pakistani Kashmir".
+
+  # Fields used for number of deaths indicators
+  number_deaths:
+    description_short: |-
+      <% set per_capita = "" %>
+      <%- if conflict_type == "all" -%>
+      The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in interstate, intrastate, extrasystemic, non-state conflicts, and one-sided violence that were ongoing that year<< per_capita >>.
+
+      <%- elif conflict_type == "state-based" -%>
+      The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in interstate, intrastate, and extrasystemic conflicts that were ongoing that year<< per_capita >>.
+
+      <%- elif conflict_type == "intrastate (internationalized)" -%>
+      The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in internationalized intrastate conflicts that were ongoing that year<< per_capita >>.
+
+      <%- elif conflict_type == "intrastate (non-internationalized)" -%>
+      The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in non-internationalized intrastate conflicts that were ongoing that year<< per_capita >>.
+
+      <%- elif conflict_type == "one-sided violence" -%>
+      The << estimate >> estimate of the number of deaths of civilians from one-sided violence that was ongoing that year<< per_capita >>.
+
+      <%- elif conflict_type == "non-state conflict" -%>
+      The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in non-state conflicts that were ongoing that year<< per_capita >>.
+
+      <%- else -%>
+      The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in << conflict_type >> conflicts that were ongoing that year<< per_capita >>.
+
+      <%- endif -%>
+    description_short_per_capita: <% set per_capita = ", per 100,000 people" %>
+      {definitions.number_deaths.description_short}
+    description_key: &description_key_deaths
+      - "{definitions.all.conflict_type}"
+
+  number_deaths_type:
+    description_short: |-
+      <% set per_capita = "" %>
+      <%- if conflict_type == "all" -%>
+      The best estimate of the number of deaths of << people_type >> in interstate, intrastate, extrasystemic, non-state conflicts, and one-sided violence that were ongoing that year<< per_capita >>.
+
+      <%- elif conflict_type == "state-based" -%>
+      The best estimate of the number of deaths of << people_type >> in interstate, intrastate, and extrasystemic conflicts that were ongoing that year<< per_capita >>.
+
+      <%- elif conflict_type == "intrastate (internationalized)" -%>
+      The best estimate of the number of deaths of << people_type >> in internationalized intrastate conflicts that were ongoing that year<< per_capita >>.
+
+      <%- elif conflict_type == "intrastate (non-internationalized)" -%>
+      The best estimate of the number of deaths of << people_type >> in non-internationalized intrastate conflicts that were ongoing that year<< per_capita >>.
+
+      <%- elif conflict_type == "one-sided violence" -%>
+      The << estimate | default('') >> estimate of the number of deaths of << people_type >> from one-sided violence that was ongoing that year<< per_capita >>.
+
+      <%- elif conflict_type == "non-state conflict" -%>
+      The best estimate of the number of deaths of << people_type >> in non-state conflicts that were ongoing that year<< per_capita >>.
+
+      <%- else -%>
+      The best estimate of the number of deaths of << people_type >> in << conflict_type >> conflicts that were ongoing that year<< per_capita >>.
+
+      <%- endif -%>
+    description_short_per_capita: <% set per_capita = ", per 100,000 people" %>
+      {definitions.number_deaths_type.description_short}
+    description_key: &description_key_deaths_type
+      - "{definitions.all.conflict_type}"
+
+  number_ongoing_conflicts:
+    description_short: |-
+      <%- if conflict_type == "all" -%>
+      Included are armed conflicts that were ongoing that year.
+
+      <%- elif conflict_type == "intrastate (internationalized)" -%>
+      Included are internationalized intrastate conflicts that were ongoing that year.
+
+      <%- elif conflict_type == "intrastate (non-internationalized)" -%>
+      Included are non-internationalized intrastate conflicts that were ongoing that year.
+
+      <%- elif conflict_type == "non-state conflict" -%>
+      Included are non-state conflicts that were ongoing that year.
+
+      <%- elif conflict_type == "one-sided violence" -%>
+      Included is one-sided violence that was ongoing that year.
+
+      <%- elif conflict_type == "state-based" -%>
+      Included are interstate, intrastate, and extrasystemic conflicts that were ongoing that year.
+
+      <%- else -%>
+      Included are << conflict_type >> conflicts that were ongoing that year.
+
+      <%- endif -%>
+    description_key: &description_key_ongoing
+      - "{definitions.all.conflict_type}"
+      - We count a conflict as ongoing in a region even if the conflict is also ongoing in other regions. The sum across all regions can therefore be higher than the total number of ongoing conflicts.
+
+  number_new_conflicts:
+    description_short: "{definitions.number_ongoing_conflicts.description_short}"
+    description_key: &description_key_new
+      - "{definitions.all.conflict_type}"
+      - We only count a conflict as new when the conflict overall started that year, not if it became active again.
+      - We count a conflict as new in a region even if the conflict started earlier or at the same time in another region. The sum across all regions can therefore be higher than the total number of new conflicts.
+      - |-
+        <%- if conflict_type == "intrastate (internationalized)" -%>
+        We count an internationalized intrastate conflict as new only if the conflict started that year, not if it became internationalized.
+
+        <%- elif conflict_type == "intrastate (non-internationalized)" -%>
+        We count a non-internationalized intrastate conflict as new only if the conflict started that year, not if it stopped being international.
+        <%- endif -%>
+
+tables:
+  # PARTICIPANT INDICATORS
+  ucdp_preview_country:
+    common:
+      presentation:
+        attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024)
+      description_key:
+        - "{definitions.all.conflict_type}"
+        - A country is considered to participate in a conflict if it was a primary participant, which refers to those participants that have the main disagreement of the conflict.
+
+    variables:
+      participated_in_conflict:
+        title: State involved in conflict
+        unit: ""
+        description_short: |-
+          <%- if conflict_type == "state-based" -%>
+          State was a primary participant in at least one interstate, intrastate, or extrasystemic conflict that year.
+
+          <%- elif conflict_type == "intrastate (internationalized)" -%>
+          State was a primary participant in at least one internationalized intrastate conflict that year.
+
+          <%- elif conflict_type == "intrastate (non-internationalized)" -%>
+          State was a primary participant in at least one non-internationalized intrastate conflict that year.
+
+          <%- elif conflict_type == "one-sided violence" -%>
+          State was a primary participant in at least one instance of one-sided violence that year.
+
+          <% else -%>
+          State was a primary participant in at least one << conflict_type >> conflict that year.
+
+          <%- endif -%>
+        description_key:
+          - |-
+            '1' indicates that the state participated in a conflict. '0' indicates that the state did not participate in a conflict.
+
+      number_participants:
+        title: Number of states involved in conflicts
+        unit: "states"
+        description_short: |-
+          <%- if conflict_type == "state-based" -%>
+          Included are states that were primary participants in at least one interstate, intrastate, or extrasystemic conflict that year.
+
+          <%- elif conflict_type == "intrastate (internationalized)" -%>
+          Included are states that were primary participants in at least one internationalized intrastate conflict that year.
+
+          <%- elif conflict_type == "intrastate (non-internationalized)" -%>
+          Included are states that were primary participants in at least one non-internationalized intrastate conflict that year.
+
+          <%- elif conflict_type == "one-sided violence" -%>
+          Included are states that were primary participants in at least one instance of one-sided violence that year.
+
+          <% else -%>
+          Included are states that were primary participants in at least one << conflict_type >> conflict that year.
+
+          <%- endif -%>
+
+  # LOCATION INDICATORS
+  ucdp_preview_locations:
+    common:
+      description_processing: |-
+        {definitions.all.location_conflicts_method}
+      description_key:
+        - "{definitions.all.conflict_type}"
+
+    variables:
+      is_location_of_conflict:
+        title: Country where conflict took place
+        unit: ""
+        description_short: |-
+          <%- if conflict_type == "state-based" -%>
+          At least one interstate, intrastate, or extrasystemic conflict event took place in this country in a given year.
+
+          <%- elif conflict_type == "intrastate (internationalized)" -%>
+          At least one internationalized intrastate conflict event took place in this country in a given year.
+
+          <%- elif conflict_type == "intrastate (non-internationalized)" -%>
+          At least one non-internationalized intrastate conflict event took place in this country in a given year.
+
+          <%- elif conflict_type == "one-sided violence" -%>
+          At least one conflict event took place in this country in a given year.
+
+          <% else -%>
+          At least one << conflict_type >> conflict event took place in this country in a given year.
+
+          <%- endif -%>
+        description_key:
+          - |-
+            '1' indicates that there was a conflict event in the given country. '0' indicates that there was no conflict event in the given country.
+          - "{definitions.all.conflict_type}"
+
+      number_locations:
+        title: Number of countries where conflict took place
+        unit: "countries"
+        description_short: |-
+          <%- if conflict_type == "all" -%>
+          Included are armed conflicts that caused at least one death in the country that year.
+
+          <%- elif conflict_type == "state-based" -%>
+          Included are interstate, intrastate, and extrasystemic conflicts that caused at least one death in the country that year.
+
+          <%- elif conflict_type == "intrastate (internationalized)" -%>
+          Included are internationalized conflicts that caused at least one death in the country that year.
+
+          <%- elif conflict_type == "intrastate (non-internationalized)" -%>
+          Included are non-internationalized conflicts that caused at least one death in the country that year.
+
+          <%- elif conflict_type == "one-sided violence" -%>
+          Included is one-sided violence that caused at least one death in the country that year.
+
+          <% else -%>
+          Included are << conflict_type >> conflicts that caused at least one death in the country that year.
+ + <%- endif -%> + + number_deaths: + title: Deaths in ongoing conflicts in a country (best estimate) + unit: "deaths" + description_short: |- + <% set estimate = "best" %> + {definitions.number_deaths.description_short} + + number_deaths_low: + title: Deaths in ongoing conflicts in a country (low estimate) + unit: "deaths" + description_short: |- + <% set estimate = "low" %> + {definitions.number_deaths.description_short} + + number_deaths_high: + title: Deaths in ongoing conflicts in a country (high estimate) + unit: "deaths" + description_short: |- + <% set estimate = "high" %> + {definitions.number_deaths.description_short} + + number_deaths_combatants: + title: Deaths of combatants in ongoing conflicts in a country + unit: "deaths" + description_short: |- + <% set people_type = "combatants" %> + {definitions.number_deaths_type.description_short} + + number_deaths_civilians: + title: Deaths of civilians in ongoing conflicts in a country + unit: "deaths" + description_short: |- + <% set people_type = "civilians" %> + {definitions.number_deaths_type.description_short} + + number_deaths_unknown: + title: Deaths of unknown type in ongoing conflicts in a country + unit: "deaths" + description_short: |- + <% set people_type = "unknown type" %> + {definitions.number_deaths_type.description_short} + + death_rate: + title: Death rate in ongoing conflicts in a country (best estimate) + unit: "deaths per 100,000 people" + display: + numDecimalPlaces: 1 + description_short: |- + <% set estimate = "best" %> + {definitions.number_deaths.description_short_per_capita} + + death_rate_low: + title: Death rate in ongoing conflicts in a country (low estimate) + unit: "deaths per 100,000 people" + display: + numDecimalPlaces: 1 + description_short: |- + <% set estimate = "low" %> + {definitions.number_deaths.description_short_per_capita} + + death_rate_high: + title: Death rate in ongoing conflicts in a country (high estimate) + unit: "deaths per 100,000 people" + display: + numDecimalPlaces: 1 + description_short: |- + <% set estimate = "high" %> + {definitions.number_deaths.description_short_per_capita} + + # death_rate_combatants: + # title: Death rate of combatants in ongoing conflicts in a country + # unit: "deaths per 100,000 people" + # display: + # numDecimalPlaces: 1 + # description_short: |- + # <% set people_type = "combatants" %> + # {definitions.number_deaths_type.description_short_per_capita} + + # death_rate_civilians: + # title: Death rate of civilians in ongoing conflicts in a country + # unit: "deaths per 100,000 people" + # display: + # numDecimalPlaces: 1 + # description_short: |- + # <% set people_type = "civilians" %> + # {definitions.number_deaths_type.description_short_per_capita} + + # death_rate_unknown: + # title: Death rate of unknown type in ongoing conflicts in a country + # unit: "deaths per 100,000 people" + # display: + # numDecimalPlaces: 1 + # description_short: |- + # <% set people_type = "unknown type" %> + # {definitions.number_deaths_type.description_short_per_capita} + + # MAIN INDICATORS + ucdp_preview: + common: + presentation: + grapher_config: + selectedEntityNames: + - Africa + - Americas + - Asia and Oceania + - Europe + - Middle East + variables: + ################## + # Ongoing deaths # + ################## + ## Estimated deaths + number_deaths_ongoing_conflicts: + title: Deaths in ongoing conflicts (best estimate) + unit: deaths + description_short: |- + <% set estimate = "best" %> + {definitions.number_deaths.description_short} + description_key: 
*description_key_deaths + + number_deaths_ongoing_conflicts_high: + title: Deaths in ongoing conflicts (high estimate) + unit: deaths + description_short: |- + <% set estimate = "high" %> + {definitions.number_deaths.description_short} + description_key: *description_key_deaths + + number_deaths_ongoing_conflicts_low: + title: Deaths in ongoing conflicts (low estimate) + unit: deaths + description_short: |- + <% set estimate = "low" %> + {definitions.number_deaths.description_short} + description_key: *description_key_deaths + + ## Deaths by type + number_deaths_ongoing_conflicts_civilians: + title: Deaths of civilians in ongoing conflicts + unit: deaths + description_short: |- + <% set people_type = "civilians" %> + {definitions.number_deaths_type.description_short} + description_key: *description_key_deaths_type + + number_deaths_ongoing_conflicts_combatants: + title: Deaths of combatants in ongoing conflicts + unit: deaths + description_short: |- + <% set people_type = "combatants" %> + {definitions.number_deaths_type.description_short} + description_key: *description_key_deaths_type + + number_deaths_ongoing_conflicts_unknown: + title: Deaths of unknown type in ongoing conflicts + unit: deaths + description_short: |- + <% set people_type = "unknown type" %> + {definitions.number_deaths_type.description_short} + description_key: *description_key_deaths_type + + ## Deaths per capita + number_deaths_ongoing_conflicts_per_capita: + title: Death rate in ongoing conflicts (best estimate) + unit: deaths per 100,000 people + description_short: |- + <% set estimate = "best" %> + {definitions.number_deaths.description_short_per_capita} + description_key: *description_key_deaths + display: + numDecimalPlaces: 1 + presentation: + attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024) + + number_deaths_ongoing_conflicts_high_per_capita: + title: Death rate in ongoing conflicts (high estimate) + unit: deaths per 100,000 people + description_short: |- + <% set estimate = "high" %> + {definitions.number_deaths.description_short_per_capita} + description_key: *description_key_deaths + display: + numDecimalPlaces: 1 + + number_deaths_ongoing_conflicts_low_per_capita: + title: Death rate in ongoing conflicts (low estimate) + unit: deaths per 100,000 people + description_short: |- + <% set estimate = "low" %> + {definitions.number_deaths.description_short_per_capita} + description_key: *description_key_deaths_type + display: + numDecimalPlaces: 1 + + # number_deaths_ongoing_conflicts_civilians_per_capita: + # title: Death rate from civilians in ongoing conflicts + # unit: deaths + # description_short: |- + # <% set people_type = "civilians" %> + # {definitions.number_deaths_type.description_short_per_capita} + # description_key: *description_key_deaths_type + # display: + # numDecimalPlaces: 1 + + # number_deaths_ongoing_conflicts_combatants_per_capita: + # title: Death rate from combatants ongoing conflicts + # unit: deaths + # description_short: |- + # <% set people_type = "combatants" %> + # {definitions.number_deaths_type.description_short_per_capita} + # description_key: *description_key_deaths_type + # display: + # numDecimalPlaces: 1 + + # number_deaths_ongoing_conflicts_unknown_per_capita: + # title: Death rate from unknown type in ongoing conflicts + # unit: deaths + # description_short: |- + # <% set people_type = "unknown type" %> + # {definitions.number_deaths_type.description_short_per_capita} + # description_key: *description_key_deaths + # display: + # 
+      #     numDecimalPlaces: 1
+
+      #####################
+      # Ongoing conflicts #
+      #####################
+      number_ongoing_conflicts:
+        title: Number of ongoing conflicts
+        unit: conflicts
+        description_short: |-
+          {definitions.number_ongoing_conflicts.description_short}
+        description_key: *description_key_ongoing
+        presentation:
+          grapher_config:
+            selectedEntityNames:
+              - World
+          attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024)
+
+      number_ongoing_conflicts_per_country:
+        title: Number of ongoing conflicts per state
+        unit: conflicts per state
+        description_short: |-
+          The number of conflicts divided by the number of all states. This accounts for the changing number of states over time. {definitions.number_ongoing_conflicts.description_short}
+        description_key: *description_key_ongoing
+        display:
+          numDecimalPlaces: 3
+        presentation:
+          attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024)
+
+      number_ongoing_conflicts_per_country_pair:
+        title: Number of ongoing conflicts per state-pair
+        unit: conflicts per state-pair
+        description_short: |-
+          The number of conflicts divided by the number of all state-pairs. This accounts for the changing number of states over time. {definitions.number_ongoing_conflicts.description_short}
+        description_key: *description_key_ongoing
+        display:
+          numDecimalPlaces: 5
+        presentation:
+          attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024)
+
+      #################
+      # New conflicts #
+      #################
+      number_new_conflicts:
+        title: Number of new conflicts
+        unit: conflicts
+        description_short: |-
+          {definitions.number_new_conflicts.description_short}
+        description_key: *description_key_new
+        presentation: # TODO
+          attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024)
+
+      number_new_conflicts_per_country:
+        title: Number of new conflicts per state
+        unit: conflicts per state
+        description_short: |-
+          The number of conflicts divided by the number of all states. This accounts for the changing number of states over time. {definitions.number_new_conflicts.description_short}
+        description_key: *description_key_new
+        display:
+          numDecimalPlaces: 3
+        presentation:
+          attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024)
+
+      number_new_conflicts_per_country_pair:
+        title: Number of new conflicts per state-pair
+        unit: conflicts per state-pair
+        description_short: |-
+          The number of conflicts divided by the number of all state-pairs. This accounts for the changing number of states over time. {definitions.number_new_conflicts.description_short}
+        description_key: *description_key_new
+        display:
+          numDecimalPlaces: 5
+        presentation:
+          attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024)
+
+dataset:
+  title: UCDP, History of war (preview)
+  description: |-
+    This dataset provides information on armed conflicts, using data from the UCDP Georeferenced Event Dataset (version 24.1), the UCDP/PRIO Armed Conflict Dataset (version 24.1), the UCDP Battle-Related Deaths Dataset (version 24.1), and the UCDP Candidate Event Dataset (version 24.X).
+
+    It differs from the main "UCDP, History of war" dataset in that it includes data up to the latest available date, using preliminary data from the UCDP Candidate Event Dataset.
+
+    We aggregate the UCDP Georeferenced Event Dataset to the year and world-region level to identify all conflict deaths, non-state conflicts, and one-sided violence.
+
+    We use the UCDP/PRIO Armed Conflict Dataset to identify state-based conflicts: interstate, intrastate (all, internationalized, and non-internationalized), and extrasystemic.
+
+    We use the UCDP Battle-Related Deaths Dataset to link deaths in the Georeferenced Event Dataset to types of state-based conflicts in the UCDP/PRIO Armed Conflict Dataset.
+
+    We combine these datasets to provide information on the number of ongoing and new conflicts, the number of ongoing and new conflict types, as well as the number of deaths in ongoing conflicts and conflict types.
+
+    Deaths of combatants and civilians due to fighting are included.
+
+    The Georeferenced Event Dataset was extracted from the UCDP systems at a certain point in time. However, the UCDP team works with the data all year round, including revisions and updates. Their dashboard may therefore show slightly more up-to-date data, which sometimes results in minor discrepancies.
+
+    We use the world regions as defined by UCDP/PRIO: Africa, Americas, Asia, Europe, and Middle East. These are defined based on Gleditsch and Ward codes. Find the complete mapping at
+    http://ksgleditsch.com/data/iisystem.dat (states) and http://ksgleditsch.com/data/microstatessystem.dat (micro-states):
+
+    • Americas: 2-199
+
+    • Europe: 200-399
+
+    • Africa: 400-626
+
+    • Middle East: 630-699
+
+    • Asia and Oceania: 700-999
+
+    You can find more information about the data in our article: [To be published]
+
+    This dataset contains information on armed conflicts (state-based, non-state, and one-sided) in the period from 1989 to 2022.
diff --git a/etl/steps/data/garden/war/2024-11-22/ucdp_preview.py b/etl/steps/data/garden/war/2024-11-22/ucdp_preview.py
new file mode 100644
index 00000000000..463ceba2f57
--- /dev/null
+++ b/etl/steps/data/garden/war/2024-11-22/ucdp_preview.py
@@ -0,0 +1,1421 @@
+"""Data from UCDP.
+
+
+IMPORTANT NOTE:
+
+  - This script is basically a copy of the latest script used to generate the UCDP dataset. At some point we should align the tools in both scripts to avoid duplication.
+
+
+Notes:
+  - Conflict types for state-based violence are sourced from the UCDP/PRIO dataset; non-state and one-sided violence are sourced from the GED dataset.
+  - There can be some mismatches with the latest officially reported data (UCDP's live dashboard). This is because UCDP uses the latest data for their dashboard, which might not be available yet as a bulk download.
+  - Regions:
+    - Uses `region` column for both GED and UCDP/PRIO datasets.
+    - Incompatibilities in Oceania are encoded in "Asia". We therefore have changed the region name to "Asia and Oceania".
+    - GED: Dataset uses names (not codes!)
+      - You can learn more about the countries included in each region from section "Appendix 5 Main sources consulted during the 2022 update" on page 40 of
+        the document: https://ucdp.uu.se/downloads/ged/ged231.pdf.
+      - Note that countries from Oceania are included in Asia!
+  - UCDP/PRIO: Dataset uses codes (note we changed "Asia" -> "Asia and Oceania")
+    1 = Europe (GWNo: 200-399)
+    2 = Middle East (GWNo: 630-699)
+    3 = Asia (GWNo: 700-999) [renamed to 'Asia and Oceania']
+    4 = Africa (GWNo: 400-626)
+    5 = Americas (GWNo: 2-199)
+"""
+
+from datetime import datetime
+from typing import List, Optional
+
+import geopandas as gpd
+import numpy as np
+import pandas as pd
+from owid.catalog import Dataset, Table
+from owid.catalog import processing as pr
+from shapely import wkt
+from shared import (
+    add_indicators_extra,
+    aggregate_conflict_types,
+    get_number_of_countries_in_conflict_by_region,
+)
+from structlog import get_logger
+
+from etl.data_helpers import geo
+from etl.data_helpers.misc import expand_time_column
+from etl.helpers import PathFinder, create_dataset
+
+log = get_logger()
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+# Mapping for the Geo-referenced dataset
+TYPE_OF_VIOLENCE_MAPPING = {
+    2: "non-state conflict",
+    3: "one-sided violence",
+}
+# Mapping for the armed conflicts dataset (inc. PRIO/UCDP)
+UNKNOWN_TYPE_ID = 99
+UNKNOWN_TYPE_NAME = "state-based (unknown)"
+TYPE_OF_CONFLICT_MAPPING = {
+    1: "extrasystemic",
+    2: "interstate",
+    3: "intrastate (non-internationalized)",
+    4: "intrastate (internationalized)",
+    UNKNOWN_TYPE_ID: UNKNOWN_TYPE_NAME,
+}
+# Regions mapping (for PRIO/UCDP dataset)
+REGIONS_MAPPING = {
+    1: "Europe",
+    2: "Middle East",
+    3: "Asia and Oceania",
+    4: "Africa",
+    5: "Americas",
+}
+REGIONS_EXPECTED = set(REGIONS_MAPPING.values())
+# Last year of data
+LAST_YEAR_STABLE = 2023  # Last year covered by the stable GED release
+LAST_YEAR_CED = 2024  # Last year covered by the candidate (CED) release
+LAST_YEAR = 2023
+
+
+def run(dest_dir: str) -> None:
+    paths.log.info("start")
+
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("ucdp")
+    ds_ced = paths.load_dataset("ucdp_ced")
+
+    # Read table from GW codes
+    ds_gw = paths.load_dataset("gleditsch")
+    tb_regions = ds_gw.read("gleditsch_regions")
+    tb_codes = ds_gw["gleditsch_countries"]
+
+    # Load maps table
+    short_name = "nat_earth_110"
+    ds_maps = paths.load_dataset(short_name)
+    tb_maps = ds_maps.read(short_name)
+
+    # Load population
+    ds_population = paths.load_dataset("population")
+
+    #
+    # Process data.
+    #
+    paths.log.info("sanity checks")
+    _sanity_checks(ds_meadow)
+
+    # Load relevant tables
+    tb_ged = ds_meadow.read("ucdp_ged").astype(
+        {
+            "deaths_a": float,
+            "deaths_b": float,
+            "deaths_civilians": float,
+            "deaths_unknown": float,
+            "best": float,
+            "high": float,
+            "low": float,
+        }
+    )
+    tb_ced = ds_ced.read("ucdp_ced").astype(
+        {
+            "deaths_a": float,
+            "deaths_b": float,
+            "deaths_civilians": float,
+            "deaths_unknown": float,
+            "best": float,
+            "high": float,
+            "low": float,
+        }
+    )
+    tb_conflict = ds_meadow.read("ucdp_battle_related_conflict").astype(
+        {
+            "bd_best": float,
+            "bd_low": float,
+            "bd_high": float,
+        }
+    )
+    tb_prio = ds_meadow.read("ucdp_prio_armed_conflict")
+
+    # Extend codes to have data for latest years
+    tb_codes = extend_latest_years(tb_codes)
+
+    # Merge CED into GED
+    assert (tb_ced.columns == tb_ged.columns).all(), "Columns are not the same!"
+    assert tb_ged["year"].max() == LAST_YEAR_STABLE, "GED data is not up to date!"
+    assert tb_ced["year"].max() == LAST_YEAR_CED, "CED data is not up to date!"
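+    # The stable GED release covers years up to LAST_YEAR_STABLE; the candidate (CED) release
+    # extends coverage to LAST_YEAR_CED. Reordering the CED columns to match GED makes the
+    # row-wise concat below safe even if the upstream column order differs.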
+ tb_ced = tb_ced[tb_ged.columns] + tb_ged = pr.concat([tb_ged, tb_ced], ignore_index=True) + + # Keep only active conflicts + paths.log.info("keep active conflicts") + tb_ged = tb_ged.loc[tb_ged["active_year"] == 1] + + # Change region named "Asia" to "Asia and Oceania" (in GED) + tb_ged["region"] = tb_ged["region"].replace({"Asia": "Asia and Oceania"}) + + # Create `conflict_type` column + paths.log.info("add field `conflict_type`") + tb = add_conflict_type(tb_ged, tb_conflict) + + # Sanity-check that the number of 'unknown' types of some conflicts is controlled + # NOTE: Export summary of conflicts that have no category assigned + tb_summary = get_summary_unknown(tb) + assert len(tb_summary) / tb["conflict_new_id"].nunique() < 0.01, "Too many conflicts without a category assigned!" + # tb_summary.to_csv("summary.csv") + + # Get country-level stuff + paths.log.info("getting country-level indicators") + tb_participants = estimate_metrics_participants(tb, tb_prio, tb_codes) + tb_locations = estimate_metrics_locations(tb, tb_maps, tb_codes, ds_population) + + # Sanity check conflict_type transitions + ## Only consider transitions between intrastate and intl intrastate. If other transitions are detected, raise error. + _sanity_check_conflict_types(tb) + _sanity_check_prio_conflict_types(tb_prio) + + # Add number of new conflicts and ongoing conflicts (also adds data for the World) + paths.log.info("get metrics for main dataset (also estimate values for 'World')") + tb = estimate_metrics(tb) + + # Add table from UCDP/PRIO + paths.log.info("prepare data from ucdp/prio table (also estimate values for 'World')") + tb_prio = prepare_prio_data(tb_prio) + + # Fill NaNs + paths.log.info("replace missing data with zeros (where applicable)") + tb_prio = expand_time_column( + tb_prio, + dimension_col=["region", "conflict_type"], + time_col="year", + method="full_range", + fillna_method="zero", + ) + tb = expand_time_column( + tb, + dimension_col=["region", "conflict_type"], + time_col="year", + method="full_range", + fillna_method="zero", + ) + # Combine main dataset with PRIO/UCDP + paths.log.info("add data from ucdp/prio table") + tb = combine_tables(tb, tb_prio) + + # Add extra-systemic after 1989 + paths.log.info("fix extra-systemic nulls") + tb = fix_extrasystemic_entries(tb) + + # Add data for "all conflicts" conflict type + paths.log.info("add data for 'all conflicts'") + tb = add_conflict_all(tb) + + # Add data for "all intrastate" conflict types + tb = add_conflict_all_intrastate(tb) + + # Add data for "state-based" conflict types + tb = add_conflict_all_statebased(tb) + + # Force types + # tb = tb.astype({"conflict_type": "category", "region": "category"}) + + # Add conflict rates + tb = add_indicators_extra( + tb, + tb_regions, + columns_conflict_rate=["number_ongoing_conflicts", "number_new_conflicts"], + columns_conflict_mortality=[ + "number_deaths_ongoing_conflicts", + "number_deaths_ongoing_conflicts_high", + "number_deaths_ongoing_conflicts_low", + # "number_deaths_ongoing_conflicts_civilians", + # "number_deaths_ongoing_conflicts_unknown", + # "number_deaths_ongoing_conflicts_combatants", + ], + ) + + # Adapt region names + tb = adapt_region_names(tb) + + # Tables + tables = [ + tb.format(["year", "region", "conflict_type"], short_name=paths.short_name), + tb_participants.format(["year", "country", "conflict_type"]), + tb_locations.format(["year", "country", "conflict_type"]), + ] + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. 
+    ds_garden = create_dataset(
+        dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_meadow.metadata
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
+
+    paths.log.info("ucdp.end")
+
+
+def _sanity_checks(ds: Dataset) -> None:
+    """Check that the tables in the dataset are as expected."""
+
+    def _check_consistency_of_ged(
+        tb_ged: Table,
+        tb_type: Table,
+        death_col: str,
+        type_of_violence: int,
+        conflict_ids_errors: Optional[List[int]] = None,
+    ):
+        ERR_THRESHOLD = 0.015
+
+        # Check IDs
+        ged_ids = tb_ged.loc[tb_ged["type_of_violence"] == type_of_violence, ["conflict_new_id"]].drop_duplicates()
+        conflict_ids = tb_type[["conflict_id"]].drop_duplicates()
+        res = ged_ids.merge(conflict_ids, left_on="conflict_new_id", right_on="conflict_id", how="outer")
+        assert res.isna().sum().sum() == 0, "Check NaNs in conflict_new_id or conflict_id"
+
+        # Check number of deaths
+        deaths_ged = (
+            tb_ged.loc[(tb_ged["type_of_violence"] == type_of_violence) & (tb_ged["active_year"] == 1)]
+            .groupby(["conflict_new_id", "year"], as_index=False)[["best"]]
+            .sum()
+            .sort_values(["conflict_new_id", "year"])
+        )
+        deaths = tb_type[["conflict_id", "year", death_col]].sort_values(["conflict_id", "year"])
+        res = deaths_ged.merge(
+            deaths, left_on=["conflict_new_id", "year"], right_on=["conflict_id", "year"], how="outer"
+        )
+
+        # Get error
+        res["err"] = res["best"].astype(float) - res[death_col].astype(float)
+        res["err_rel"] = res["err"] / res["best"]
+        res = res[res["err_rel"] > ERR_THRESHOLD]
+        # Remove accepted errors
+        if conflict_ids_errors is not None:
+            res = res.loc[~res["conflict_new_id"].isin(conflict_ids_errors)]
+        assert (
+            len(res) == 0
+        ), f"Discrepancy between number of deaths in conflict ({tb_ged.m.short_name} vs. {tb_type.m.short_name}). \n {res}"
+
+    # Read tables
+    tb_ged = ds["ucdp_ged"].reset_index()
+    tb_conflict = ds["ucdp_battle_related_conflict"].reset_index()
+    tb_nonstate = ds["ucdp_non_state"].reset_index()
+    tb_onesided = ds["ucdp_one_sided"].reset_index()
+
+    # Battle-related conflict #
+    _check_consistency_of_ged(
+        tb_ged,
+        tb_conflict,
+        "bd_best",
+        1,
+    )
+
+    # Non-state #
+    _check_consistency_of_ged(
+        tb_ged,
+        tb_nonstate,
+        "best_fatality_estimate",
+        2,
+        [16009],
+    )
+
+    # One-sided #
+    _check_consistency_of_ged(
+        tb_ged,
+        tb_onesided,
+        "best_fatality_estimate",
+        3,
+        [16009],
+    )
+
+
+def add_conflict_type(tb_ged: Table, tb_conflict: Table) -> Table:
+    """Add `conflict_type` to the georeferenced dataset table.
+
+    Values for conflict_type are:
+    - non-state conflict
+    - one-sided violence
+    - extrasystemic
+    - interstate
+    - intrastate (non-internationalized)
+    - intrastate (internationalized)
+
+    The original table `tb_ged` only contains a very high-level categorisation. In particular,
+    it labels all state-based conflicts as 'state-based'. Instead, we want to use a more fine-grained definition:
+    extrasystemic, interstate, and intrastate (non-internationalized / internationalized).
+
+    Parameters
+    ----------
+    tb_ged: Table
+        This is the main table with the relevant data.
+    tb_conflict: Table
+        This is a secondary table, which we use to obtain the conflict types of the conflicts.
+    """
+    tb_conflict = tb_conflict.loc[:, ["conflict_id", "year", "type_of_conflict"]].drop_duplicates()
+    assert tb_conflict.groupby(["conflict_id", "year"]).size().max() == 1, "Some conflict_id-year pairs are duplicated!"
+
+    # Add `type_of_conflict` to `tb_ged`.
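+    # Schematic example (hypothetical ids): a state-based GED row with (conflict_new_id=123, year=2000)
+    # picks up `type_of_conflict` from the row (conflict_id=123, year=2000) in `tb_conflict`;
+    # non-state and one-sided rows have no match there and keep `type_of_conflict` as NaN.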
+    # This column contains the type of state-based conflict (1: extrasystemic, 2: interstate, 3: intrastate (non-internationalized), 4: intrastate (internationalized))
+    tb_ged = tb_ged.merge(
+        tb_conflict,
+        left_on=["conflict_new_id", "year"],
+        right_on=["conflict_id", "year"],
+        how="outer",
+    )
+
+    # Assign latest available conflict type to unknown state-based conflicts
+    tb_ged = patch_unknown_conflict_type_ced(tb_ged)
+
+    # Assert that `type_of_conflict` was only added for state-based events
+    assert (
+        tb_ged[tb_ged["type_of_violence"] != 1]["type_of_conflict"].isna().all()
+    ), "There are some actual values for non-state based conflicts! These should only be NaN, since `tb_conflict` should only contain data for state-based conflicts."
+    # Check that `type_of_conflict` is not NaN for state-based events
+    assert (
+        not tb_ged[tb_ged["type_of_violence"] == 1]["type_of_conflict"].isna().any()
+    ), "Could not find the type of conflict for some state-based conflicts!"
+
+    # Create `conflict_type` column as a combination of `type_of_violence` and `type_of_conflict`.
+    tb_ged["conflict_type"] = (
+        tb_ged["type_of_conflict"]
+        .astype(object)
+        .replace(TYPE_OF_CONFLICT_MAPPING)
+        .fillna(tb_ged["type_of_violence"].astype(object).replace(TYPE_OF_VIOLENCE_MAPPING))
+    )
+
+    # Sanity check
+    assert tb_ged["conflict_type"].isna().sum() == 0, "Check NaNs in conflict_type (i.e. conflicts without a type)!"
+
+    return tb_ged
+
+
+def patch_unknown_conflict_type_ced(tb):
+    """Assign conflict types to unknown state-based conflicts (based on the latest category appearing in GED)."""
+    mask = (tb["type_of_violence"] == 1) & (tb["type_of_conflict"].isna())
+    assert (
+        tb.loc[mask, "year"] > LAST_YEAR_STABLE
+    ).all(), "Unknown conflict types should only be present in years after GED!"
+    ids_unknown = list(tb.loc[mask, "conflict_new_id"].unique())
+
+    # Get table with the latest assigned conflict type for each conflict that has category 'state-based (unknown)' assigned
+    id_to_type = (
+        tb.loc[tb["conflict_new_id"].isin(ids_unknown) & ~mask, ["conflict_new_id", "year", "type_of_conflict"]]
+        .sort_values("year")
+        .drop_duplicates(subset=["conflict_new_id"], keep="last")
+        .set_index("conflict_new_id")["type_of_conflict"]
+        .to_dict()
+    )
+    tb.loc[mask, "type_of_conflict"] = tb.loc[mask, "conflict_new_id"].apply(
+        lambda x: id_to_type.get(x, UNKNOWN_TYPE_ID)
+    )
+    return tb
+
+
+def _sanity_check_conflict_types(tb: Table) -> Table:
+    """Check conflict type.
+
+    - The only transitions accepted are between intrastate conflicts.
+    - The same conflict is only expected to have one type in a year.
+    """
+    # Define expected combinations of conflict_types for a conflict. Typically, only in the intrastate domain
+    TRANSITION_EXPECTED = {"intrastate (internationalized)", "intrastate (non-internationalized)"}
+    # Get conflicts with more than one conflict type assigned to them over their lifetime
+    tb_ = tb.loc[tb["year"] < LAST_YEAR_STABLE]
+    conflict_type_transitions = tb_.groupby("conflict_new_id")["conflict_type"].apply(set)
+    transitions = conflict_type_transitions[conflict_type_transitions.apply(len) > 1].drop_duplicates()
+    # Extract unique combinations of conflict_types for a conflict
+    assert (len(transitions) == 1) & (transitions.iloc[0] == TRANSITION_EXPECTED), "Error"
+
+    # Check if different regions categorise the conflict differently in the same year
+    assert not (
+        tb_.groupby(["conflict_id", "year"])["type_of_conflict"].nunique() > 1
+    ).any(), "Seems like the conflict has multiple types for a single year! Is it categorised differently depending on the region? This case has not been taken into account -- please review the code!"
+
+
+def _sanity_check_prio_conflict_types(tb: Table) -> Table:
+    """Check conflict type in UCDP/PRIO data.
+
+    - The only transitions accepted are between intrastate conflicts.
+    - The same conflict is only expected to have one type in a year.
+    """
+    # Define expected combinations of conflict_types for a conflict. Typically, only in the intrastate domain
+    TRANSITIONS_EXPECTED = {"{3, 4}"}
+    # Get conflicts with more than one conflict type assigned to them over their lifetime
+    conflict_type_transitions = tb.groupby("conflict_id")["type_of_conflict"].apply(set)
+    transitions = conflict_type_transitions[conflict_type_transitions.apply(len) > 1].drop_duplicates()
+    # Extract unique combinations of conflict_types for a conflict
+    transitions = set(transitions.astype(str))
+    transitions_unk = transitions - TRANSITIONS_EXPECTED
+
+    # Check if different regions categorise the conflict differently in the same year
+    assert not (
+        tb.groupby(["conflict_id", "year"])["type_of_conflict"].nunique() > 1
+    ).any(), "Seems like the conflict has multiple types for a single year! Is it categorised differently depending on the region?"
+
+    assert not transitions_unk, f"Unknown transitions found: {transitions_unk}"
+
+
+def estimate_metrics(tb: Table) -> Table:
+    """Add number of ongoing and new conflicts, and number of deaths.
+
+    It also estimates the values for 'World', since these can't be derived later on:
+    some conflicts occur in multiple regions and would otherwise be double counted. To overcome this,
+    we need access to the actual conflict_id field to count unique values, which is only possible here.
+    """
+    # Get number of ongoing conflicts, and deaths in ongoing conflicts
+    paths.log.info("get number of ongoing conflicts and deaths in ongoing conflicts")
+    tb_ongoing = _get_ongoing_metrics(tb)
+
+    # Get number of new conflicts every year
+    paths.log.info("get number of new conflicts every year")
+    tb_new = _get_new_metrics(tb)
+    # Combine and build single table
+    paths.log.info("combine and build single table")
+    tb = tb_ongoing.merge(
+        tb_new,
+        left_on=["year", "region", "conflict_type"],
+        right_on=["year", "region", "conflict_type"],
+        how="outer",  # data for (1991, intrastate) is available for 'ongoing conflicts' but not for 'new conflicts'. We don't want to lose it!
+ ) + + # If datapoint is missing, fill with zero + tb = tb.fillna(0) + + # tb = tb.drop(columns=["year_start"]) + return tb + + +def _get_ongoing_metrics(tb: Table) -> Table: + # Estimate combatant deaths per conflict + tb_ = tb.copy() + tb_["deaths_combatants"] = tb_["deaths_a"] + tb_["deaths_b"] + + # Define aggregations + column_props = { + # Deaths (estimates) + "best": { + "f": "sum", + "rename": "number_deaths_ongoing_conflicts", + }, + "high": { + "f": "sum", + "rename": "number_deaths_ongoing_conflicts_high", + }, + "low": { + "f": "sum", + "rename": "number_deaths_ongoing_conflicts_low", + }, + # Deaths by type + "deaths_civilians": { + "f": "sum", + "rename": "number_deaths_ongoing_conflicts_civilians", + }, + "deaths_unknown": { + "f": "sum", + "rename": "number_deaths_ongoing_conflicts_unknown", + }, + "deaths_combatants": { + "f": "sum", + "rename": "number_deaths_ongoing_conflicts_combatants", + }, + # Number of conflicts + "conflict_new_id": { + "f": "nunique", + "rename": "number_ongoing_conflicts", + }, + } + col_funcs = {k: v["f"] for k, v in column_props.items()} + col_renames = {k: v["rename"] for k, v in column_props.items()} + # For each region + columns_idx = ["year", "region", "conflict_type"] + tb_ongoing = tb_.groupby(columns_idx, as_index=False).agg(col_funcs) + tb_ongoing = tb_ongoing.rename(columns={n: n for n in columns_idx} | col_renames) + + # For the World + columns_idx = ["year", "conflict_type"] + tb_ongoing_world = tb_.groupby(columns_idx, as_index=False).agg(col_funcs) + tb_ongoing_world = tb_ongoing_world.rename(columns={n: n for n in columns_idx} | col_renames) + tb_ongoing_world["region"] = "World" + + # Combine + tb_ongoing = pr.concat([tb_ongoing, tb_ongoing_world], ignore_index=True).sort_values( # type: ignore + by=["year", "region", "conflict_type"] + ) + + # Check that `deaths = deaths_combatants + deaths_civilians + deaths_unknown` holds + assert ( + tb_ongoing["number_deaths_ongoing_conflicts"] + - tb_ongoing[ + [ + "number_deaths_ongoing_conflicts_civilians", + "number_deaths_ongoing_conflicts_unknown", + "number_deaths_ongoing_conflicts_combatants", + ] + ].sum(axis=1) + == 0 + ).all(), "Sum of deaths from combatants, civilians and unknown should equal best estimate!" + return tb_ongoing + + +def _get_new_metrics(tb: Table) -> Table: + # Reduce table to only preserve first appearing event + tb = ( + tb.loc[:, ["conflict_new_id", "year", "region", "conflict_type"]] + .sort_values("year") + .drop_duplicates(subset=["conflict_new_id", "region"], keep="first") + ) + + # For each region + columns_idx = ["year", "region", "conflict_type"] + tb_new = tb.groupby(columns_idx)[["conflict_new_id"]].nunique().reset_index() + tb_new.columns = columns_idx + ["number_new_conflicts"] + + # For the World + ## Consider first start globally (a conflict may have started in region A in year X and in region B later in year X + 1) + tb = tb.sort_values("year").drop_duplicates(subset=["conflict_new_id"], keep="first") + columns_idx = ["year", "conflict_type"] + tb_new_world = tb.groupby(columns_idx)[["conflict_new_id"]].nunique().reset_index() + tb_new_world.columns = columns_idx + ["number_new_conflicts"] + tb_new_world["region"] = "World" + + # Combine + tb_new = pr.concat([tb_new, tb_new_world], ignore_index=True).sort_values( # type: ignore + by=["year", "region", "conflict_type"] + ) + + return tb_new + + +def prepare_prio_data(tb_prio: Table) -> Table: + """Prepare PRIO table. + + This includes estimating all necessary metrics (ongoing and new). 
+ """ + tb_prio = _prepare_prio_table(tb_prio) + tb_prio = _prio_add_metrics(tb_prio) + return tb_prio + + +def combine_tables(tb: Table, tb_prio: Table) -> Table: + """Combine main table with data from UCDP/PRIO. + + UCDP/PRIO table provides estimates for dates earlier then 1989. + + It only includes state-based conflicts! + """ + # Ensure year period for each table is as expected + assert tb["year"].min() == 1989, "Unexpected start year!" + assert tb["year"].max() == LAST_YEAR_CED, "Unexpected start year!" + assert tb_prio["year"].min() == 1946, "Unexpected start year!" + assert tb_prio["year"].max() == 1989, "Unexpected start year!" + + # Force NaN in 1989 data from Geo-referenced dataset for `number_new_conflicts` + # We want this data to come from PRIO/UCDP instead! + tb.loc[tb["year"] == 1989, "number_new_conflicts"] = np.nan + # Force NaN in 1989 data from PRIO/UCDP dataset for `number_ongoing_conflicts` + # We want this data to come from GEO instead! + tb_prio.loc[tb_prio["year"] == 1989, "number_ongoing_conflicts"] = np.nan + + # Merge Geo with UCDP/PRIO + tb = tb_prio.merge(tb, on=["year", "region", "conflict_type"], suffixes=("_prio", "_main"), how="outer") + + # Sanity checks + ## Data from PRIO/UCDP for `number_ongoing_conflicts` goes from 1946 to 1988 (inc) + assert tb[tb["number_ongoing_conflicts_prio"].notna()]["year"].min() == 1946 + assert tb[tb["number_ongoing_conflicts_prio"].notna()]["year"].max() == 1988 + ## Data from GEO for `number_ongoing_conflicts` goes from 1989 to 2023 (inc) + assert tb[tb["number_ongoing_conflicts_main"].notna()].year.min() == 1989 + assert tb[tb["number_ongoing_conflicts_main"].notna()]["year"].max() == LAST_YEAR_CED + ## Data from PRIO/UCDP for `number_new_conflicts` goes from 1946 to 1989 (inc) + assert tb[tb["number_new_conflicts_prio"].notna()]["year"].min() == 1946 + assert tb[tb["number_new_conflicts_prio"].notna()]["year"].max() == 1989 + ## Data from GEO for `number_new_conflicts` goes from 1990 to 2022 (inc) + assert tb[tb["number_new_conflicts_main"].notna()]["year"].min() == 1990 + assert tb[tb["number_new_conflicts_main"].notna()]["year"].max() == LAST_YEAR_CED + + # Actually combine timeseries from UCDP/PRIO and GEO. + # We prioritise values from PRIO for 1989, therefore the order `PRIO.fillna(MAIN)` + tb["number_ongoing_conflicts"] = tb["number_ongoing_conflicts_prio"].fillna(tb["number_ongoing_conflicts_main"]) + tb["number_new_conflicts"] = tb["number_new_conflicts_prio"].fillna(tb["number_new_conflicts_main"]) + + # Remove unnecessary columns + columns_remove = tb.filter(regex=r"(_prio|_main)").columns + tb = tb[[col for col in tb.columns if col not in columns_remove]] + + return tb + + +def fix_extrasystemic_entries(tb: Table) -> Table: + """Fix entries with conflict_type='extrasystemic'. + + Basically means setting to zero null entries after 1989. + """ + # Sanity check + assert ( + tb.loc[tb["conflict_type"] == "extrasystemic", "year"].max() == 1989 + ), "There are years beyond 1989 for extrasystemic conflicts by default!" 
+
+    # Get only extra-systemic stuff
+    mask = tb.conflict_type == "extrasystemic"
+    tb_extra = tb.loc[mask].copy()
+
+    # Add all year-region combinations
+    years = np.arange(tb["year"].min(), tb["year"].max() + 1)
+    regions = set(tb["region"])
+    new_idx = pd.MultiIndex.from_product([years, regions], names=["year", "region"])
+    tb_extra = tb_extra.set_index(["year", "region"]).reindex(new_idx).reset_index()
+    tb_extra["conflict_type"] = "extrasystemic"
+
+    # Replace nulls with zeroes (all time series)
+    columns = [
+        "number_ongoing_conflicts",
+        "number_new_conflicts",
+    ]
+    tb_extra[columns] = tb_extra[columns].fillna(0)
+
+    # Replace nulls with zeroes (only post-1989 time series)
+    columns = [
+        "number_deaths_ongoing_conflicts",
+        "number_deaths_ongoing_conflicts_high",
+        "number_deaths_ongoing_conflicts_low",
+        "number_deaths_ongoing_conflicts_civilians",
+        "number_deaths_ongoing_conflicts_unknown",
+        "number_deaths_ongoing_conflicts_combatants",
+    ]
+    mask_1989 = tb_extra["year"] >= 1989
+    tb_extra.loc[mask_1989, columns] = tb_extra.loc[mask_1989, columns].fillna(0)
+
+    # Add to main table
+    tb = pr.concat([tb[~mask], tb_extra])
+    return tb
+
+
+def _prepare_prio_table(tb: Table) -> Table:
+    # Select relevant columns
+    tb = tb.loc[:, ["conflict_id", "year", "region", "type_of_conflict", "start_date"]]
+
+    # Flatten (some entries have multiple regions, e.g. `1, 2`). This should be flattened to multiple rows.
+    # https://stackoverflow.com/a/42168328/5056599
+    tb["region"] = tb["region"].str.split(", ")
+    cols = tb.columns[tb.columns != "region"].tolist()
+    tb = tb[cols].join(tb["region"].apply(pd.Series))
+    tb = tb.set_index(cols).stack().reset_index()
+    tb = tb.drop(tb.columns[-2], axis=1).rename(columns={0: "region"})
+    tb["region"] = tb["region"].astype(int)
+
+    # Obtain start year of the conflict
+    tb["year_start"] = pd.to_datetime(tb["start_date"]).dt.year
+
+    # Rename regions
+    tb["region"] = tb["region"].map(REGIONS_MAPPING)
+
+    # Create conflict_type
+    tb["conflict_type"] = tb["type_of_conflict"].map(TYPE_OF_CONFLICT_MAPPING)
+
+    # Checks
+    assert tb["conflict_type"].isna().sum() == 0, "Some unknown conflict type ids were found!"
+    assert tb["region"].isna().sum() == 0, "Some unknown region ids were found!"
+
+    # Filter only data from the first year with ongoing conflicts
+    tb = tb[tb["year_start"] >= tb["year"].min()]
+
+    return tb
+
+
+def _prio_add_metrics(tb: Table) -> Table:
+    """Things to consider:
+
+    Values for `number_new_conflicts` in 1989 for conflict types 'one-sided' and 'non-state' (i.e. other than 'state-based')
+    are not accurate.
+    This is because the Geo-referenced dataset starts in 1989, which inflates the number of conflicts that appear to
+    start in that year. We can solve this for 'state-based' conflicts, for which we can get data earlier than 1989 from
+    the UCDP/PRIO Armed Conflicts dataset.
+ """ + # Get number of ongoing conflicts for all regions + cols_idx = ["year", "region", "conflict_type"] + tb_ongoing = tb.groupby(cols_idx, as_index=False)["conflict_id"].nunique() + tb_ongoing.columns = cols_idx + ["number_ongoing_conflicts"] + # Get number of ongoing conflicts for 'World' + cols_idx = ["year", "conflict_type"] + tb_ongoing_world = tb.groupby(cols_idx, as_index=False)["conflict_id"].nunique() + tb_ongoing_world.columns = cols_idx + ["number_ongoing_conflicts"] + tb_ongoing_world["region"] = "World" + # Combine regions & world + tb_ongoing = pr.concat([tb_ongoing, tb_ongoing_world], ignore_index=True) + # Keep only until 1989 + tb_ongoing = tb_ongoing[tb_ongoing["year"] < 1989] + + # Get number of new conflicts for all regions + ## Reduce table to only preserve first appearing event + tb = tb.sort_values("year").drop_duplicates(subset=["conflict_id", "year_start", "region"], keep="first") + # Groupby operation + cols_idx = ["year_start", "region", "conflict_type"] + tb_new = tb.groupby(cols_idx, as_index=False)["conflict_id"].nunique() + tb_new.columns = cols_idx + ["number_new_conflicts"] + # Get number of new conflicts for 'World' + tb = tb.sort_values("year").drop_duplicates(subset=["conflict_id", "year_start"], keep="first") + cols_idx = ["year_start", "conflict_type"] + tb_new_world = tb.groupby(cols_idx, as_index=False)["conflict_id"].nunique() + tb_new_world.columns = cols_idx + ["number_new_conflicts"] + tb_new_world["region"] = "World" + # Combine regions & world + tb_new = pr.concat([tb_new, tb_new_world], ignore_index=True) + # Keep only until 1989 (inc) + tb_new = tb_new[tb_new["year_start"] <= 1989] + # Rename column + tb_new = tb_new.rename(columns={"year_start": "year"}) + + # Combine and build single table + tb = tb_ongoing.merge( + tb_new, left_on=["year", "region", "conflict_type"], right_on=["year", "region", "conflict_type"], how="outer" + ) + + # Dtypes + tb = tb.astype({"year": "uint64", "region": "category"}) + + return tb + + +def add_conflict_all(tb: Table) -> Table: + """Add metrics for conflict_type = 'all'. + + Note that this should only be added for years after 1989, since prior to that year we are missing data on 'one-sided' and 'non-state'. 
+ """ + # Estimate number of all conflicts + tb_all = tb.groupby(["year", "region"], as_index=False)[ + [ + "number_deaths_ongoing_conflicts", + "number_deaths_ongoing_conflicts_high", + "number_deaths_ongoing_conflicts_low", + "number_deaths_ongoing_conflicts_civilians", + "number_deaths_ongoing_conflicts_unknown", + "number_deaths_ongoing_conflicts_combatants", + "number_ongoing_conflicts", + "number_new_conflicts", + ] + ].sum() + tb_all["conflict_type"] = "all" + + # Only append values after 1989 (before that we don't have 'one-sided' or 'non-state' counts) + tb_all = tb_all[tb_all["year"] >= 1989] + tb = pr.concat([tb, tb_all], ignore_index=True) + + # Set `number_new_conflicts` to NaN for 1989 + tb.loc[(tb["year"] == 1989) & (tb["conflict_type"] == "all"), "number_new_conflicts"] = np.nan + + return tb + + +def add_conflict_all_intrastate(tb: Table) -> Table: + """Add metrics for conflict_type = 'intrastate'.""" + tb_intra = tb[ + tb["conflict_type"].isin(["intrastate (non-internationalized)", "intrastate (internationalized)"]) + ].copy() + tb_intra = tb_intra.groupby(["year", "region"], as_index=False).sum(numeric_only=True, min_count=1) + tb_intra["conflict_type"] = "intrastate" + tb = pr.concat([tb, tb_intra], ignore_index=True) + return tb + + +def add_conflict_all_statebased(tb: Table) -> Table: + """Add metrics for conflict_type = 'state-based'.""" + tb_state = tb[tb["conflict_type"].isin(TYPE_OF_CONFLICT_MAPPING.values())].copy() + tb_state = tb_state.groupby(["year", "region"], as_index=False).sum(numeric_only=True, min_count=1) + tb_state["conflict_type"] = "state-based" + tb = pr.concat([tb, tb_state], ignore_index=True) + return tb + + +def adapt_region_names(tb: Table) -> Table: + assert not tb["region"].isna().any(), "There were some NaN values found for field `region`. This is not expected!" + # Get regions in table + regions = set(tb["region"]) + # Check they are as expected + regions_unknown = regions - (REGIONS_EXPECTED | {"World"}) + assert not regions_unknown, f"Unexpected regions: {regions_unknown}, please review!" + + # Add suffix with source name + msk = tb["region"] != "World" + tb.loc[msk, "region"] = tb.loc[msk, "region"] + " (UCDP)" + return tb + + +def estimate_metrics_participants(tb: Table, tb_prio: Table, tb_codes: Table) -> Table: + """Add participant information at country-level.""" + ################### + # Participated in # + ################### + # FLAG YES/NO (country-level) + + # Get table with [year, conflict_type, code] + codes = ["gwnoa", "gwnob"] + tb_country = pr.concat( + [tb.loc[:, ["year", "conflict_type", code]].rename(columns={code: "id"}).copy() for code in codes] + ) + + # Drop rows with code = NaN + tb_country = tb_country.dropna(subset=["id"]) + # Drop duplicates + tb_country = tb_country.drop_duplicates() + + # Explode where multiple codes + tb_country["id"] = tb_country["id"].astype(str).str.split(";") + tb_country = tb_country.explode("id") + # Drop duplicates (may appear duplicates after exploding) + tb_country = tb_country.drop_duplicates() + # Ensure numeric type + tb_country["id"] = tb_country["id"].astype(int) + + # Sanity check + assert not tb_country.isna().any(axis=None), "There are some NaNs!" + + # Add country name + tb_country["country"] = tb_country.apply(lambda x: tb_codes.loc[(x["id"], x["year"])], axis=1) + assert tb_country["country"].notna().all(), "Some countries were not found! 
NaN was set" + + # Add flag + tb_country["participated_in_conflict"] = 1 + tb_country["participated_in_conflict"].m.origins = tb["gwnoa"].m.origins + + # Prepare GW table + ctypes_all = list(set(tb_country["conflict_type"])) + tb_alltypes = Table(pd.DataFrame({"conflict_type": ctypes_all})) + tb_codes_ = tb_codes.reset_index().merge(tb_alltypes, how="cross") + tb_codes_["country"] = tb_codes_["country"].astype(str) + + # Combine all GW entries with UCDP + columns_idx = ["year", "country", "id", "conflict_type"] + tb_country = tb_codes_.merge(tb_country, on=columns_idx, how="outer") + tb_country["participated_in_conflict"] = tb_country["participated_in_conflict"].fillna(0) + tb_country = tb_country[columns_idx + ["participated_in_conflict"]] + + # Add intrastate (all) + tb_country = aggregate_conflict_types( + tb_country, "intrastate", ["intrastate (non-internationalized)", "intrastate (internationalized)"] + ) + # Add state-based + tb_country = aggregate_conflict_types(tb_country, "state-based", list(TYPE_OF_CONFLICT_MAPPING.values())) + + # Only preserve years that make sense + tb_country = tb_country[(tb_country["year"] >= tb["year"].min()) & (tb_country["year"] <= tb["year"].max())] + + ################### + # Participated in # + ################### + # NUMBER COUNTRIES + + tb_num_participants = get_number_of_countries_in_conflict_by_region(tb_country, "conflict_type") + + # Combine tables + tb_country = pr.concat([tb_country, tb_num_participants], ignore_index=True) + + # Drop column `id` + tb_country = tb_country.drop(columns=["id"]) + + ############ + # Add PRIO # + ############ + tb_country_prio = estimate_metrics_participants_prio(tb_prio, tb_codes) + + tb_country = pr.concat([tb_country, tb_country_prio], ignore_index=True, short_name=f"{paths.short_name}_country") + + return tb_country + + +def estimate_metrics_participants_prio(tb_prio: Table, tb_codes: Table) -> Table: + """Add participant information at country-level. + + Only works for UCDP/PRIO data. + """ + ################### + # Participated in # + ################### + # FLAG YES/NO (country-level) + + # Get table with [year, conflict_type, code] + codes = ["gwno_a", "gwno_a_2nd", "gwno_b", "gwno_b_2nd"] + tb_country = pr.concat( + [tb_prio[["year", "type_of_conflict", code]].rename(columns={code: "id"}).copy() for code in codes] + ) + + # Drop rows with code = NaN + tb_country = tb_country.dropna(subset=["id"]) + # Drop duplicates + tb_country = tb_country.drop_duplicates() + + # Explode where multiple codes + tb_country["id"] = tb_country["id"].astype(str).str.split(",") + tb_country = tb_country.explode("id") + # Ensure numeric type + tb_country["id"] = tb_country["id"].astype(int) + # Drop duplicates (may appear duplicates after exploding) + tb_country = tb_country.drop_duplicates() + + # Sanity check + assert not tb_country.isna().any(axis=None), "There are some NaNs!" + + # Correct codes + ## 751 'Government of Hyderabad' -> 750 'India' + tb_country.loc[tb_country["id"] == 751, "id"] = 750 + ## 817 'Republic of Vietnam' in 1975 -> 816 'Vietnam' + tb_country.loc[(tb_country["id"] == 817) & (tb_country["year"] == 1975), "id"] = 816 + ## 345 'Yugoslavia' after 2005 -> 340 'Serbia' + tb_country.loc[(tb_country["id"] == 345) & (tb_country["year"] > 2005), "id"] = 340 + # Add country name + tb_country["country"] = tb_country.apply(lambda x: tb_codes.loc[(x["id"], x["year"])], axis=1) + assert tb_country["country"].notna().all(), "Some countries were not found! 
NaN was set" + ## Remove duplicates after correcting codes + tb_country = tb_country.drop_duplicates() + + # Add flag + tb_country["participated_in_conflict"] = 1 + tb_country["participated_in_conflict"].m.origins = tb_prio["gwno_a"].m.origins + + # Format conflict tyep + tb_country["conflict_type"] = tb_country["type_of_conflict"].astype(object).replace(TYPE_OF_CONFLICT_MAPPING) + tb_country = tb_country.drop(columns=["type_of_conflict"]) + + # Prepare GW table + tb_alltypes = Table(pd.DataFrame({"conflict_type": tb_country["conflict_type"].unique()})) + tb_codes = tb_codes.reset_index().merge(tb_alltypes, how="cross") + tb_codes["country"] = tb_codes["country"].astype(str) + + # Combine all GW entries with UCDP/PRIO + columns_idx = ["year", "country", "id", "conflict_type"] + tb_country = tb_codes.merge(tb_country, on=columns_idx, how="outer") + tb_country["participated_in_conflict"] = tb_country["participated_in_conflict"].fillna(0) + tb_country = tb_country[columns_idx + ["participated_in_conflict"]] + + # Add intrastate (all) + tb_country = aggregate_conflict_types( + tb_country, "intrastate", ["intrastate (non-internationalized)", "intrastate (internationalized)"] + ) + # Add state-based + tb_country = aggregate_conflict_types(tb_country, "state-based", list(TYPE_OF_CONFLICT_MAPPING.values())) + + # Only preserve years that make sense + tb_country = tb_country[ + (tb_country["year"] >= tb_prio["year"].min()) & (tb_country["year"] <= tb_prio["year"].max()) + ] + + ################### + # Participated in # + ################### + # NUMBER COUNTRIES + + tb_num_participants = get_number_of_countries_in_conflict_by_region(tb_country, "conflict_type") + + # Combine tables + tb_country = pr.concat([tb_country, tb_num_participants], ignore_index=True) + + # Drop column `id` + tb_country = tb_country.drop(columns=["id"]) + + ############### + # Final steps # + ############### + + # Keep only years not covered by UCDP (except for 'extrasystemic') + tb_country = tb_country[(tb_country["year"] < 1989) | (tb_country["conflict_type"] == "extrasystemic")] + return tb_country + + +def estimate_metrics_locations(tb: Table, tb_maps: Table, tb_codes: Table, ds_population: Dataset) -> Table: + """Add participant information at country-level. + + reference: https://github.com/owid/notebooks/blob/main/JoeHasell/UCDP%20and%20PRIO/UCDP_georeferenced/ucdp_country_extract.ipynb + + tb: actual data + tb_maps: map data (borders and stuff) + tb_codes: from gw codes. so that all countries have either a 1 or 0 (instead of missing data). + ds_population: population data (for rates) + """ + tb_codes_ = tb_codes.reset_index().drop(columns=["id"]).copy() + tb_codes_ = tb_codes_[tb_codes_["year"] >= 1989] + + # Add country name using geometry + paths.log.info("adding location name of conflict event...") + tb_locations = _get_location_of_conflict_in_ucdp_ged(tb, tb_maps).copy() + + # There are some countries not in GW (remove, replace?). We keep Palestine and Western Sahara since + # these are mappable in OWID maps. 
+    # We map the entry with id "53238" and relid "PAK-2003-1-345-88" from "Siachen Glacier" to "Pakistan" based on
+    # the text in the `where_description` field, which says: "Giang sector in Siachen, Pakistani Kashmir"
+    tb_locations.loc[tb_locations["country_name_location"] == "Siachen Glacier", "country_name_location"] = "Pakistan"
+
+    ###################
+    # COUNTRY-LEVEL: Country in conflict or not (1 or 0)
+    ###################
+    paths.log.info("estimating country flag 'is_location_of_conflict'...")
+
+    # Check that the sum of deaths by type equals the best estimate
+    assert (
+        tb_locations["best"] - tb_locations[["deaths_a", "deaths_b", "deaths_civilians", "deaths_unknown"]].sum(axis=1)
+        == 0
+    ).all(), "Sum of deaths from combatants, civilians and unknown should equal best estimate!"
+    tb_locations["deaths_combatants"] = tb_locations["deaths_a"] + tb_locations["deaths_b"]
+
+    # Estimate if a conflict occurred in a country, and the number of deaths in it
+    # Define aggregations
+    INDICATOR_BASE_NAME = "number_deaths"
+    column_props = {
+        # Deaths (estimates)
+        "best": {
+            "f": "sum",
+            "rename": f"{INDICATOR_BASE_NAME}",
+        },
+        "high": {
+            "f": "sum",
+            "rename": f"{INDICATOR_BASE_NAME}_high",
+        },
+        "low": {
+            "f": "sum",
+            "rename": f"{INDICATOR_BASE_NAME}_low",
+        },
+        # Deaths by type
+        "deaths_civilians": {
+            "f": "sum",
+            "rename": f"{INDICATOR_BASE_NAME}_civilians",
+        },
+        "deaths_unknown": {
+            "f": "sum",
+            "rename": f"{INDICATOR_BASE_NAME}_unknown",
+        },
+        "deaths_combatants": {
+            "f": "sum",
+            "rename": f"{INDICATOR_BASE_NAME}_combatants",
+        },
+        # Number of conflicts
+        "conflict_new_id": {
+            "f": "nunique",
+            "rename": "is_location_of_conflict",
+        },
+    }
+    col_funcs = {k: v["f"] for k, v in column_props.items()}
+    col_renames = {k: v["rename"] for k, v in column_props.items()}
+    tb_locations_country = (
+        tb_locations.groupby(["country_name_location", "year", "conflict_type"], as_index=False)
+        .agg(col_funcs)
+        .rename(
+            columns={
+                "country_name_location": "country",
+            }
+            | col_renames
+        )
+    )
+    assert tb_locations_country["is_location_of_conflict"].notna().all(), "Missing values in `is_location_of_conflict`!"
+    cols_num_deaths = [v for v in col_renames.values() if v != "is_location_of_conflict"]
+    for col in cols_num_deaths:
+        assert tb_locations_country[col].notna().all(), f"Missing values in `{col}`!"
+    # Convert into a binary indicator: 1 (if at least one conflict), 0 (otherwise)
+    tb_locations_country["is_location_of_conflict"] = tb_locations_country["is_location_of_conflict"].apply(
+        lambda x: 1 if x > 0 else 0
+    )
+
+    # Add missing countries using tb_codes as reference
+    tb_locations_country = tb_codes_.merge(
+        tb_locations_country,
+        on=["country", "year"],
+        how="outer",
+    )
+    # Add Greenland
+    assert (
+        "Greenland" not in set(tb_locations_country.country)
+    ), "Greenland is not expected to be there! That's why we force it to zero. If it appears, just remove the following code line."
+    tb_green = Table(pd.DataFrame({"country": ["Greenland"], "year": [LAST_YEAR]}))
+    tb_locations_country = pr.concat([tb_locations_country, tb_green], ignore_index=True)
+
+    # NaNs of numeric indicators to zero
+    cols_indicators = ["is_location_of_conflict"] + cols_num_deaths
+    tb_locations_country[cols_indicators] = tb_locations_country[cols_indicators].fillna(0)
+    # NaN in conflict_type to an arbitrary value (missing ones are filled by the `expand_time_column` call below, with fillna_method="zero")
+    mask = tb_locations_country["conflict_type"].isna()
+    assert (
+        tb_locations_country.loc[mask, cols_indicators].sum().sum() == 0
+    ), "There are some non-NaNs for NaN-valued conflict types!"
+    tb_locations_country["conflict_type"] = tb_locations_country["conflict_type"].fillna("one-sided violence")
+
+    # Fill with zeroes
+    tb_locations_country = expand_time_column(
+        tb_locations_country,
+        dimension_col=["country", "conflict_type"],
+        time_col="year",
+        method="full_range",
+        fillna_method="zero",
+    )
+
+    # Add origins from Natural Earth
+    cols = ["is_location_of_conflict"] + cols_num_deaths
+    for col in cols:
+        tb_locations_country[col].origins += tb_maps["name"].m.origins
+
+    ###################
+    # Add conflict type aggregates
+    ###################
+    paths.log.info("adding conflict type aggregates...")
+
+    # Add missing conflict types
+    CTYPES_AGGREGATES = {
+        "intrastate": ["intrastate (non-internationalized)", "intrastate (internationalized)"],
+        "state-based": list(TYPE_OF_CONFLICT_MAPPING.values()),
+        "all": list(TYPE_OF_VIOLENCE_MAPPING.values()) + list(TYPE_OF_CONFLICT_MAPPING.values()),
+    }
+    for ctype_agg, ctypes in CTYPES_AGGREGATES.items():
+        tb_locations_country = aggregate_conflict_types(
+            tb=tb_locations_country,
+            parent_name=ctype_agg,
+            children_names=ctypes,
+            columns_to_aggregate=["is_location_of_conflict"] + cols_num_deaths,
+            columns_to_aggregate_absolute=cols_num_deaths,
+            columns_to_groupby=["country", "year"],
+        )
+
+    ###################
+    # Add rates
+    ###################
+    # Add population column
+    tb_locations_country = geo.add_population_to_table(
+        tb=tb_locations_country,
+        ds_population=ds_population,
+    )
+    # Divide and obtain rates
+    factor = 100_000
+    suffix = [c.replace(INDICATOR_BASE_NAME, "") for c in cols_num_deaths]
+    suffix = [suf for suf in suffix if suf not in {"_combatants", "_unknown", "_civilians"}]
+    for suf in suffix:
+        tb_locations_country[f"death_rate{suf}"] = (
+            factor * tb_locations_country[f"{INDICATOR_BASE_NAME}{suf}"] / tb_locations_country["population"]
+        )
+
+    # Drop population column
+    tb_locations_country = tb_locations_country.drop(columns=["population"])
+
+    ###################
+    # REGION-LEVEL: Number of locations with conflict
+    ###################
+    paths.log.info("estimating number of locations with conflict...")
+
+    def _get_number_of_locations_with_conflict_regions(tb: Table, cols: List[str]) -> Table:
+        """Get number of locations with conflict."""
+        # For each group, get the number of unique locations
+        tb = (
+            tb.groupby(cols)
+            .agg(
+                {
+                    "country_name_location": "nunique",
+                }
+            )
+            .reset_index()
+        )
+        # Rename columns
+        if "region" in cols:
+            column_rename = {
+                "country_name_location": "number_locations",
+                "region": "country",
+            }
+        else:
+            column_rename = {
+                "country_name_location": "number_locations",
+            }
+
+        tb = tb.rename(columns=column_rename)
+        return tb
+
+    # Regions
+    ## Number of locations (given ctypes)
+    tb_locations_regions = _get_number_of_locations_with_conflict_regions(
+        tb_locations, ["region", "year", "conflict_type"]
+    )
+    tb_locations_regions_world = _get_number_of_locations_with_conflict_regions(tb_locations, ["year", "conflict_type"])
+    tb_locations_regions_world["country"] = "World"
+
+    tbs_locations_regions = [
+        tb_locations_regions,
+        tb_locations_regions_world,
+    ]
+
+    ## Extra conflict types (aggregates)
+    cols = ["region", "year"]
+    for ctype_agg, ctypes in CTYPES_AGGREGATES.items():
+        # Keep only children for this ctype aggregate
+        tb_locations_ = tb_locations[tb_locations["conflict_type"].isin(ctypes)]
+        # Get actual table, add ctype. (also for region 'World')
+        tb_locations_regions_agg = _get_number_of_locations_with_conflict_regions(tb_locations_, ["region", "year"])
+        tb_locations_regions_agg["conflict_type"] = ctype_agg
+        tb_locations_regions_agg_world = _get_number_of_locations_with_conflict_regions(tb_locations_, ["year"])
+        tb_locations_regions_agg_world["conflict_type"] = ctype_agg
+        tb_locations_regions_agg_world["country"] = "World"
+        tbs_locations_regions.extend([tb_locations_regions_agg, tb_locations_regions_agg_world])
+
+    # Combine
+    tb_locations_regions = pr.concat(
+        tbs_locations_regions,
+        ignore_index=True,
+    )
+
+    # Add origins
+    tb_locations_regions["number_locations"].m.origins = tb_locations_country["is_location_of_conflict"].origins
+
+    # Extend to full time-series + fill NaNs with zeros.
+    tb_locations_regions = expand_time_column(
+        df=tb_locations_regions,
+        dimension_col=["country", "conflict_type"],
+        time_col="year",
+        method="full_range",
+        fillna_method="zero",
+    )
+
+    ###################
+    # COMBINE: Country flag + Regional counts
+    ###################
+    paths.log.info("combining country flag and regional counts...")
+    tb_locations = pr.concat(
+        [tb_locations_country, tb_locations_regions], short_name=f"{paths.short_name}_locations", ignore_index=True
+    )
+    return tb_locations
+
+
+def _get_location_of_conflict_in_ucdp_ged(tb: Table, tb_maps: Table) -> Table:
+    """Add column with the country name of the conflict."""
+    # Convert the UCDP data to a GeoDataFrame (so it can be mapped and used in spatial analysis).
+    # The 'wkt.loads' function takes the coordinates in the 'geometry' column and ensures geopandas will use it to map the data.
+    gdf = tb[["relid", "geom_wkt"]]
+    gdf = gdf.rename(columns={"geom_wkt": "geometry"})
+    gdf["geometry"] = gdf["geometry"].apply(wkt.loads)
+    gdf = gpd.GeoDataFrame(gdf, crs="epsg:4326")
+
+    # Format the map to be a GeoDataFrame with a geometry column
+    gdf_maps = gpd.GeoDataFrame(tb_maps)
+    gdf_maps["geometry"] = gdf_maps["geometry"].apply(wkt.loads)
+    gdf_maps = gdf_maps.set_geometry("geometry")
+    gdf_maps.crs = "epsg:4326"
+
+    # Use the overlay function to extract data from the world map that each point sits on top of.
+    gdf_match = gpd.overlay(gdf, gdf_maps, how="intersection")
+    # Events not assigned to any country
+    # There are 2271 points that are missed - likely because they are in the sea, perhaps because the conflict happened at sea or at the coast and the coordinates are slightly inaccurate.
+    # I've softened the assertion; otherwise it's a bit of a pain!
+    assert (
+        diff := gdf.shape[0] - gdf_match.shape[0]
+    ) <= 2280, f"Unexpected number of events without exact coordinate match! {diff}"
{diff}" + # DEBUG: Examine which are these unlabeled conflicts + # mask = ~tb["relid"].isin(gdf_match["relid"]) + # tb.loc[mask, ["relid", "year", "conflict_name", "side_a", "side_b", "best"]] + + # Get missing entries + ids_missing = set(gdf["relid"]) - set(gdf_match["relid"]) + gdf_missing = gdf.loc[gdf["relid"].isin(ids_missing)] + + # Reprojecting the points and the world into the World Equidistant Cylindrical Sphere projection. + wec_crs = "+proj=eqc +lat_ts=0 +lat_0=0 +lon_0=0 +x_0=0 +y_0=0 +a=6371007 +b=6371007 +units=m +no_defs" + gdf_missing_wec = gdf_missing.to_crs(wec_crs) + gdf_maps_wec = gdf_maps.to_crs(wec_crs) + # For these points we can find the nearest country using the distance function + polygon_near = [] + for _, row in gdf_missing_wec.iterrows(): + polygon_index = gdf_maps_wec.distance(row["geometry"]).sort_values().index[0] + ne_country_name = gdf_maps_wec["name"][polygon_index] + polygon_near.append(ne_country_name) + # Assign + gdf_missing["name"] = polygon_near + + # Combining and adding name to original table + COLUMN_COUNTRY_NAME = "country_name_location" + gdf_country_names = pr.concat([Table(gdf_match[["relid", "name"]]), Table(gdf_missing[["relid", "name"]])]) + tb = tb.merge(gdf_country_names, on="relid", how="left", validate="one_to_one").rename( + columns={"name": COLUMN_COUNTRY_NAME} + ) + assert tb[COLUMN_COUNTRY_NAME].notna().all(), "Some missing values found in `COLUMN_COUNTRY_NAME`" + + # SOME CORRECTIONS # + # To align with OWID borders we will rename the conflicts in Somaliland to Somalia and the conflicts in Morocco that were below 27.66727 latitude to Western Sahara. + ## Somaliland -> Somalia + mask = tb[COLUMN_COUNTRY_NAME] == "Somaliland" + paths.log.info(f"{len(tb.loc[mask, COLUMN_COUNTRY_NAME])} datapoints in Somaliland") + tb.loc[mask, COLUMN_COUNTRY_NAME] = "Somalia" + ## Morocco -> Western Sahara + mask = (tb[COLUMN_COUNTRY_NAME] == "Morocco") & (tb["latitude"] < 27.66727) + paths.log.info(f"{len(tb.loc[mask, COLUMN_COUNTRY_NAME])} datapoints in land contested by Morocco/W.Sahara") + tb.loc[mask, COLUMN_COUNTRY_NAME] = "Western Sahara" + + # Add a flag column for points likely to have inccorect corrdinates: + # a) points where coordiantes are (0 0), or points where latitude and longitude are exactly the same + tb["flag"] = "" + # Items are (mask, flag_message) + errors = [ + ( + tb["geom_wkt"] == "POINT (0 0)", + "coordinates (0 0)", + ), + (tb["latitude"] == tb["longitude"], "latitude = longitude"), + ] + for error in errors: + tb.loc[error[0], "flag"] = error[1] + tb.loc[mask, COLUMN_COUNTRY_NAME] = np.nan + + assert tb[COLUMN_COUNTRY_NAME].isna().sum() == 4, "4 missing values were expected! Found a different amount!" + tb = tb.dropna(subset=[COLUMN_COUNTRY_NAME]) + + return tb + + +def extend_latest_years(tb: Table) -> Table: + """Create table with each country present in a year.""" + + index = list(tb.index.names) + tb = tb.reset_index() + + # define mask for last year + mask = tb["year"] == LAST_YEAR_STABLE + + # Get year to extend to + current_year = datetime.now().year + + tb_all_years = Table(pd.RangeIndex(LAST_YEAR_STABLE + 1, current_year + 1), columns=["year"]) + tb_last = tb[mask].drop(columns="year").merge(tb_all_years, how="cross") + + tb = pr.concat([tb, tb_last], ignore_index=True, short_name="gleditsch_countries") + + tb = tb.set_index(index) + return tb + + +def get_summary_unknown(tb: Table): + """Get a table summary of the ongoing conflicts that couldn't be mapped to a specific category. 
+ + We know that these are state-based conflicts, but we don't have more information about them! + + By looking at them, we may be able to map these to a specific category: + + - "extrasystemic", + - "interstate" + - "intrastate (non-internationalized)" + - "intrastate (internationalized)" + """ + tbx = tb.loc[ + tb["type_of_conflict"] == UNKNOWN_TYPE_ID, + ["id", "conflict_new_id", "conflict_name", "date_start", "date_end", "side_a", "side_b"], + ] + tbx = tbx.groupby(["conflict_new_id", "conflict_name"], as_index=False).agg( + { + "date_start": "min", + "date_end": "max", + "side_a": (lambda x: "; ".join(set(x))), + "side_b": (lambda x: "; ".join(set(x))), + "id": "nunique", + } + ) + tbx = tbx.drop_duplicates(subset=["conflict_new_id", "conflict_name"]) + tbx["date_start"] = pd.to_datetime(tbx["date_start"]) + tbx["date_end"] = pd.to_datetime(tbx["date_end"]) + tbx = tbx.rename(columns={"id": "num_events"}) + tbx = tbx.sort_values(["num_events", "date_start"], ascending=False) + + return tbx diff --git a/etl/steps/data/garden/wb/2017-04-16/world_gdp.py b/etl/steps/data/garden/wb/2017-04-16/world_gdp.py index a4968d65180..02d9f00c5ef 100644 --- a/etl/steps/data/garden/wb/2017-04-16/world_gdp.py +++ b/etl/steps/data/garden/wb/2017-04-16/world_gdp.py @@ -11,7 +11,7 @@ def run(dest_dir: str) -> None: # Load data from snapshot. # snap = paths.load_snapshot() - tb = snap.read().set_index(["country", "year"]) + tb = snap.read(safe_types=False).set_index(["country", "year"]) # # Save outputs. diff --git a/etl/steps/data/garden/wb/2022-10-03/extreme_poverty_by_region.py b/etl/steps/data/garden/wb/2022-10-03/extreme_poverty_by_region.py index a4968d65180..02d9f00c5ef 100644 --- a/etl/steps/data/garden/wb/2022-10-03/extreme_poverty_by_region.py +++ b/etl/steps/data/garden/wb/2022-10-03/extreme_poverty_by_region.py @@ -11,7 +11,7 @@ def run(dest_dir: str) -> None: # Load data from snapshot. # snap = paths.load_snapshot() - tb = snap.read().set_index(["country", "year"]) + tb = snap.read(safe_types=False).set_index(["country", "year"]) # # Save outputs. diff --git a/etl/steps/data/garden/wb/2022-10-03/world_bank_pip.py b/etl/steps/data/garden/wb/2022-10-03/world_bank_pip.py index a4968d65180..02d9f00c5ef 100644 --- a/etl/steps/data/garden/wb/2022-10-03/world_bank_pip.py +++ b/etl/steps/data/garden/wb/2022-10-03/world_bank_pip.py @@ -11,7 +11,7 @@ def run(dest_dir: str) -> None: # Load data from snapshot. # snap = paths.load_snapshot() - tb = snap.read().set_index(["country", "year"]) + tb = snap.read(safe_types=False).set_index(["country", "year"]) # # Save outputs. diff --git a/etl/steps/data/garden/wb/2024-01-17/world_bank_pip.meta.yml b/etl/steps/data/garden/wb/2024-01-17/world_bank_pip.meta.yml index 6056222a0a9..98a1ad3929f 100644 --- a/etl/steps/data/garden/wb/2024-01-17/world_bank_pip.meta.yml +++ b/etl/steps/data/garden/wb/2024-01-17/world_bank_pip.meta.yml @@ -445,7 +445,7 @@ tables: This data is expressed in [international-$](#dod:int_dollar_abbreviation) at 2017 prices. Depending on the country and year, it relates to income measured after taxes and benefits, or to consumption, [per capita](#dod:per-capita). 
- type: StackedArea + chartTypes: ["StackedArea"] addCountryMode: disabled hideRelativeToggle: false originUrl: https://ourworldindata.org/poverty diff --git a/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.meta.yml b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.meta.yml index 79633398446..388adb37b83 100644 --- a/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.meta.yml +++ b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.meta.yml @@ -487,7 +487,7 @@ tables: This data is expressed in [international-$](#dod:int_dollar_abbreviation) at 2017 prices. Depending on the country and year, it relates to income measured after taxes and benefits, or to consumption, [per capita](#dod:per-capita). - type: StackedArea + chartTypes: ["StackedArea"] addCountryMode: disabled hideRelativeToggle: false originUrl: https://ourworldindata.org/poverty diff --git a/etl/steps/data/garden/wb/2024-06-10/gender_statistics.py b/etl/steps/data/garden/wb/2024-06-10/gender_statistics.py index fe734d701c1..79c4a92e643 100644 --- a/etl/steps/data/garden/wb/2024-06-10/gender_statistics.py +++ b/etl/steps/data/garden/wb/2024-06-10/gender_statistics.py @@ -71,9 +71,10 @@ def run(dest_dir: str) -> None: countries_file=paths.country_mapping_path, ) # Pivot the dataframe so that each indicator is a separate column - tb = tb.pivot(index=["country", "year"], columns="wb_seriescode", values="value") - tb = tb.reset_index() + tb = tb.pivot(index=["country", "year"], columns="wb_seriescode", values="value").reset_index() + # Slovakia should have the same value as Czechoslovakia in the period 1970-1993 for the indicator SG.LAW.EQRM.WK, as confirmed by the source (but not yet updated in their database) + tb.loc[(tb["country"] == "Slovakia") & (tb["year"].between(1970, 1993)), "SG.LAW.EQRM.WK"] = 1 # Add metadata by finding the descriptions and sources using indicator codes tb = add_metadata(tb, metadata_tb) diff --git a/etl/steps/data/garden/wb/2024-09-09/food_prices_for_nutrition.py b/etl/steps/data/garden/wb/2024-09-09/food_prices_for_nutrition.py index ef6449d2aae..a79d730142a 100644 --- a/etl/steps/data/garden/wb/2024-09-09/food_prices_for_nutrition.py +++ b/etl/steps/data/garden/wb/2024-09-09/food_prices_for_nutrition.py @@ -74,12 +74,12 @@ def run(dest_dir: str) -> None: # # Load meadow dataset and read its main table. ds_meadow = paths.load_dataset("food_prices_for_nutrition") - tb = ds_meadow.read_table("food_prices_for_nutrition") + tb = ds_meadow.read("food_prices_for_nutrition") # Load the World Development Indicators (WDI) dataset to get the U.S. Consumer Price Index (CPI), # which will be used to correct for inflation and express costs in constant 2021 PPP$. ds_wdi = paths.load_dataset("wdi") - tb_wdi = ds_wdi.read_table("wdi") + tb_wdi = ds_wdi.read("wdi") # # Process data.
diff --git a/etl/steps/data/garden/wb/2024-10-07/world_bank_pip.meta.yml b/etl/steps/data/garden/wb/2024-10-07/world_bank_pip.meta.yml index 0770ce60b5a..6a971fa5882 100644 --- a/etl/steps/data/garden/wb/2024-10-07/world_bank_pip.meta.yml +++ b/etl/steps/data/garden/wb/2024-10-07/world_bank_pip.meta.yml @@ -9,7 +9,6 @@ definitions: attribution_short: World Bank grapher_config: originUrl: https://ourworldindata.org/poverty - $schema: https://files.ourworldindata.org/schemas/grapher-schema.003.json processing_level: major @@ -406,7 +405,12 @@ tables: - Nigeria - Bangladesh originUrl: https://ourworldindata.org/poverty - $schema: https://files.ourworldindata.org/schemas/grapher-schema.005.json + + poverty_gap_index_215: + presentation: + title_public: Poverty gap index at $2.15 per day + topic_tags: + - Poverty gini: presentation: @@ -493,7 +497,7 @@ tables: This data is expressed in [international-$](#dod:int_dollar_abbreviation) at 2017 prices. Depending on the country and year, it relates to income measured after taxes and benefits, or to consumption, [per capita](#dod:per-capita). - type: StackedArea + chartTypes: ["StackedArea"] addCountryMode: disabled hideRelativeToggle: false originUrl: https://ourworldindata.org/poverty @@ -529,4 +533,4 @@ tables: - The regions employed by the World Bank PIP differ from the regional groupings used by the World Bank in other contexts. - Some economies, mostly high-income economies, are excluded from the geographical regions and are included as a separate group referred to as “other high income” (or “industrialized economies” or “rest of the world” in earlier publications). presentation: - title_public: World regions according to World Bank Poverty and Inequality Platform \ No newline at end of file + title_public: World regions according to World Bank Poverty and Inequality Platform diff --git a/etl/steps/data/garden/wb/2024-11-04/edstats.countries.json b/etl/steps/data/garden/wb/2024-11-04/edstats.countries.json new file mode 100644 index 00000000000..9cda4d91ab8 --- /dev/null +++ b/etl/steps/data/garden/wb/2024-11-04/edstats.countries.json @@ -0,0 +1,237 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "American Samoa": "American Samoa", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas, The": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bermuda": "Bermuda", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "British Virgin Islands": "British Virgin Islands", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cayman Islands": "Cayman Islands", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo, Dem. 
Rep.": "Democratic Republic of Congo", + "Congo, Rep.": "Congo", + "Costa Rica": "Costa Rica", + "Cote d'Ivoire": "Cote d'Ivoire", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Curacao": "Curacao", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt, Arab Rep.": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "French Polynesia": "French Polynesia", + "Gabon": "Gabon", + "Gambia, The": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Gibraltar": "Gibraltar", + "Greece": "Greece", + "Grenada": "Grenada", + "Guam": "Guam", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hong Kong SAR, China": "Hong Kong", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran, Islamic Rep.": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Korea, Rep.": "South Korea", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Kyrgyz Republic": "Kyrgyzstan", + "Lao PDR": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Macao SAR, China": "Macao", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia, Fed. 
Sts.": "Micronesia (country)", + "Moldova": "Moldova", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Montserrat": "Montserrat", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "North America": "North America (WB)", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", + "Slovak Republic": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "St. Kitts and Nevis": "Saint Kitts and Nevis", + "St. Lucia": "Saint Lucia", + "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United States": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela, RB": "Venezuela", + "Viet Nam": "Vietnam", + "Virgin Islands (U.S.)": "United States Virgin Islands", + "West Bank and Gaza": "Palestine", + "World": "World", + "Yemen, Rep.": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "East Asia & Pacific": "East Asia and the Pacific (WB)", + "High income": "High-income countries", + "Lower middle income": "Lower-middle-income countries", + "Latin America & Caribbean": "Latin America and Caribbean (WB)", + "Upper middle income": "Upper-middle-income countries", + "South Asia": "South Asia (WB)", + "Sub-Saharan Africa": "Sub-Saharan Africa (WB)", + "Arab World": "Arab World (WB)", + "Middle East & North Africa": "Middle East and North Africa (WB)", + "Low income": "Low-income countries", + "Europe & Central Asia": "Europe and Central Asia (WB)", + "European Union": "EU (27)", + "Czechia": "Czechia", + "Eswatini": "Eswatini", + "Korea, Dem. 
People's Rep.": "North Korea", + "North Macedonia": "North Macedonia", + "Tokelau": "Tokelau", + "Africa Eastern and Southern": "Southern and Eastern Africa (WB)", + "Africa Western and Central": "Western and Central Africa (WB)", + "Central Europe and the Baltics": "Central Europe and the Baltics (WB)", + "Turkiye": "Turkey", + "Cook Islands": "Cook Islands", + "Niue": "Niue", + "Vietnam": "Vietnam" +} \ No newline at end of file diff --git a/etl/steps/data/garden/wb/2024-11-04/edstats.excluded_countries.json b/etl/steps/data/garden/wb/2024-11-04/edstats.excluded_countries.json new file mode 100644 index 00000000000..58a278cc9d7 --- /dev/null +++ b/etl/steps/data/garden/wb/2024-11-04/edstats.excluded_countries.json @@ -0,0 +1,35 @@ +[ + "Euro area", + "OECD members", + "Caribbean small states", + "Early-demographic dividend", + "Fragile and conflict affected situations", + "Global Partnership for Education", + "IBRD only", + "IDA & IBRD total", + "IDA blend", + "IDA only", + "IDA total", + "Late-demographic dividend", + "Other small states", + "Pacific island small states", + "Post-demographic dividend", + "Pre-demographic dividend", + "Small states", + "East Asia & Pacific (IDA & IBRD countries)", + "Europe & Central Asia (IDA & IBRD countries)", + "Latin America & the Caribbean (IDA & IBRD countries)", + "Middle East & North Africa (IDA & IBRD countries)", + "South Asia (IDA & IBRD)", + "Sub-Saharan Africa (IDA & IBRD countries)", + "Middle East & North Africa (excluding high income)", + "East Asia & Pacific (excluding high income)", + "Heavily indebted poor countries (HIPC)", + "Sub-Saharan Africa (excluding high income)", + "Latin America & Caribbean (excluding high income)", + "Europe & Central Asia (excluding high income)", + "Least developed countries: UN classification", + "Low & middle income", + "Middle income" +] + diff --git a/etl/steps/data/garden/wb/2024-11-04/edstats.meta.yml b/etl/steps/data/garden/wb/2024-11-04/edstats.meta.yml new file mode 100644 index 00000000000..f1818ec226d --- /dev/null +++ b/etl/steps/data/garden/wb/2024-11-04/edstats.meta.yml @@ -0,0 +1,365 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Global Education + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 364 + +tables: + edstats: + variables: + expected_years_of_school: + title: Expected years of schooling + unit: years + description_from_producer: |- + Expected years of school is calculated as the sum of age-specific enrollment rates between ages 4 and 17. Age-specific enrollment rates are approximated using school enrollment rates at different levels: pre-primary enrollment rates approximate the age-specific enrolment rates for 4 and 5 year-olds; the primary rate approximates for 6-11 year-olds; the lower-secondary rate approximates for 12-14 year-olds; and the upper-secondary approximates for 15-17 year-olds. Most recent estimates are used. Year of most recent primary enrollment rate used is shown in data notes. + + World Bank variable id: HD.HCI.EYRS + + Original source: World Bank staff estimates based on data from UNESCO Institute for Statistics, supplemented with data provided by World Bank staff. 
+ display: + numDecimalPlaces: 0 + + harmonized_test_scores: + title: Harmonized test scores + unit: score + description_short: Average learning outcomes correspond to [harmonized](#dod:harmonized-scores) test scores across standardized, psychometrically-robust international and regional student achievement tests. + description_from_producer: |- + Harmonized test scores from major international student achievement testing programs. They are measured in TIMSS-equivalent units, where 300 is minimal attainment and 625 is advanced attainment. + + World Bank variable id: HD.HCI.HLO + + Original source: Patrinos and Angrist (2018). http://documents.worldbank.org/curated/en/390321538076747773/Global-Dataset-on-Education-Quality-A-Review-and-Update-2000-2017 + display: + numDecimalPlaces: 0 + + learning_adjusted_years_of_school: + title: Learning-adjusted years of schooling + unit: years + description_short: |- + [Learning-adjusted years of schooling](#dod:lays) merge the quantity and quality of education into one metric, accounting for the fact that similar durations of schooling can yield different learning outcomes. + description_from_producer: |- + Learning-Adjusted Years of School are calculated by multiplying the estimates of Expected Years of School by the ratio of the most recent Harmonized Test Score to 625, where 625 corresponds to advanced attainment on the TIMSS (Trends in International Mathematics and Science Study) test. For more information, consult the Human Capital Index website: http://www.worldbank.org/en/publication/human-capital. + + World Bank variable id: HD.HCI.LAYS + + Original source: World Bank staff calculation based on methodology in Filmer et al. (2018). http://documents.worldbank.org/curated/en/243261538075151093/Learning-Adjusted-Years-of-Schooling-LAYS-Defining-A-New-Macro-Measure-of-Education + display: + numDecimalPlaces: 0 + + piaac__mean_adult_literacy_proficiency__total: + title: PIAAC Mean Adult Literacy Proficiency. Total + unit: score + description_short: Average literacy proficiency of adults aged 16-65. + description_from_producer: |- + Literacy is defined as the ability to understand, evaluate, use and engage with written texts to participate in society, to achieve one’s goals, and to develop one’s knowledge and potential. Literacy encompasses a range of skills from the decoding of written words and sentences to the comprehension, interpretation, and evaluation of complex texts. It does not, however, involve the production of text (writing). Information on the skills of adults with low levels of proficiency is provided by an assessment of reading components that covers text vocabulary, sentence comprehension and passage fluency. The target population for the survey was the non-institutionalized population, aged 16-65 years, residing in the country at the time of data collection, irrespective of nationality, citizenship or language status. Literacy-related non-respondents are not included in the calculation of the mean scores which, thus, present an upper bound of the estimated literacy proficiency of the population. For more information, consult the OECD PIAAC website: http://www.oecd.org/site/piaac/. + + World Bank variable id: LO.PIAAC.LIT + + Original source: OECD Programme for the International Assessment of Adult Competencies (PIAAC) + display: + numDecimalPlaces: 0 + + piaac__mean_adult_numeracy_proficiency__total: + title: PIAAC Mean Adult Numeracy Proficiency.
Total + unit: score + description_short: Average numeracy proficiency of adults aged 16-65. + description_from_producer: |- + Numeracy is defined as the ability to access, use, interpret and communicate mathematical information and ideas in order to engage in and manage the mathematical demands of a range of situations in adult life. To this end, numeracy involves managing a situation or solving a problem in a real context, by responding to mathematical content/information/ideas represented in multiple ways. The target population for the survey was the non-institutionalized population, aged 16-65 years, residing in the country at the time of data collection, irrespective of nationality, citizenship or language status. For more information, consult the OECD PIAAC website: http://www.oecd.org/site/piaac/. + + World Bank variable id: LO.PIAAC.NUM + + Original source: OECD Programme for the International Assessment of Adult Competencies (PIAAC) + display: + numDecimalPlaces: 0 + + + piaac__mean_young_adult_numeracy_proficiency__total: + title: PIAAC Mean Young Adult Numeracy Proficiency. Total + unit: score + description_short: Average numeracy proficiency of young adults aged 16-34. + description_from_producer: |- + Numeracy is defined as the ability to access, use, interpret and communicate mathematical information and ideas in order to engage in and manage the mathematical demands of a range of situations in adult life. To this end, numeracy involves managing a situation or solving a problem in a real context, by responding to mathematical content/information/ideas represented in multiple ways. The target population was the non-institutionalized population, aged 16-24 years, residing in the country at the time of data collection, irrespective of nationality, citizenship or language status. For more information, consult the OECD PIAAC website: http://www.oecd.org/site/piaac/. + + World Bank variable id: LO.PIAAC.NUM.YOU + + Original source: OECD Programme for the International Assessment of Adult Competencies (PIAAC) + display: + numDecimalPlaces: 0 + + pisa__female_15_year_olds_by_mathematics_proficiency_level__pct__level_6: + title: PISA. Female 15-year-olds by mathematics proficiency level (%). Level 6 + unit: "%" + short_unit: "%" + description_short: Percentage of 15-year-old female students scoring higher than 669 on the PISA mathematics scale. + description_from_producer: |- + Percentage of 15-year-old female students scoring higher than 669 on the PISA mathematics scale. At Level 6, students can conceptualize, generalize and utilize information based on their investigations and modeling of complex problem situations, and can use their knowledge in relatively non-standard contexts. They can link different information sources and representations and flexibly translate among them. Students at this level are capable of advanced mathematical thinking and reasoning. These students can apply this insight and understanding, along with a mastery of symbolic and formal mathematical operations and relationships, to develop new approaches and strategies for attacking novel situations. Students at this level can reflect on their actions, and can formulate and precisely communicate their actions and reflections regarding their findings, interpretations, arguments, and the appropriateness of these to the original situation. Data reflects country performance in the stated year according to PISA reports, but may not be comparable across years or countries. 
Consult the PISA website for more detailed information: http://www.oecd.org/pisa/. + + World Bank variable id: LO.PISA.MAT.6.FE + + Original source: OECD Programme for International Student Assessment (PISA) + display: + numDecimalPlaces: 1 + + pisa__male_15_year_olds_by_mathematics_proficiency_level__pct__level_6: + title: PISA. Male 15-year-olds by mathematics proficiency level (%). Level 6 + unit: "%" + short_unit: "%" + description_short: Percentage of 15-year-old male students scoring higher than 669 on the PISA mathematics scale. + description_from_producer: |- + Percentage of 15-year-old male students scoring higher than 669 on the PISA mathematics scale. At Level 6, students can conceptualize, generalize and utilize information based on their investigations and modeling of complex problem situations, and can use their knowledge in relatively non-standard contexts. They can link different information sources and representations and flexibly translate among them. Students at this level are capable of advanced mathematical thinking and reasoning. These students can apply this insight and understanding, along with a mastery of symbolic and formal mathematical operations and relationships, to develop new approaches and strategies for attacking novel situations. Students at this level can reflect on their actions, and can formulate and precisely communicate their actions and reflections regarding their findings, interpretations, arguments, and the appropriateness of these to the original situation. Data reflects country performance in the stated year according to PISA reports, but may not be comparable across years or countries. Consult the PISA website for more detailed information: http://www.oecd.org/pisa/ + + World Bank variable id: LO.PISA.MAT.6.MA + + Original source: OECD Programme for International Student Assessment (PISA) + display: + numDecimalPlaces: 1 + + annual_statutory_teacher_salaries_in_public_institutions_in_usd__primary__10_years_of_experience: + title: Annual statutory teacher salaries in public institutions in USD. Primary. 10 years of experience + unit: US dollars + short_unit: US $ + description_short: Gross annual salary before tax for a fully qualified teacher with 10 years of experience, expressed in [international-$](#dod:int_dollar_abbreviation) at 2020/21 prices. It is based on official pay scales and excludes the employer’s contributions to social security and pension. + description_from_producer: |- + Salaries after 10 years of experience refer to the scheduled annual salary of a full-time classroom teacher with the minimum training necessary to be fully qualified plus 10 years of experience. Salaries are in equivalent USD converted using PPPs for private consumption. Statutory salaries refer to scheduled salaries according to official pay scales, while actual salaries refer to the average annual salary earned by a full-time teacher. The salaries reported are gross (total sum paid by the employer) less the employer’s contribution to social security and pension, according to existing salary scales. Salaries are “before tax”, i.e. before deductions for income tax. Teachers’ salaries are one component of teachers’ total compensation. Other benefits, such as regional allowances for teaching in remote areas, family allowances, reduced rates on public transport and tax allowances on the purchase of cultural materials, may also form part of teachers’ total remuneration. There are also large differences in taxation and social-benefits systems in OECD countries.
All this should be borne in mind when comparing statutory salaries across countries. Data after 2009 is not comparable to data for 2009 and before due to changes in methodology. For more information, consult the OECD's Education at a Glance website: http://www.oecd.org/edu/eag.htm + + World Bank variable id: OECD.TSAL.1.E10 + + Original source: Organisation for Economic Co-operation and Development (OECD) + display: + numDecimalPlaces: 0 + + wittgenstein_projection__percentage_of_the_population_age_15plus_by_highest_level_of_educational_attainment__no_education__total: + title: Wittgenstein Projection. Percentage of the population age 15+ by highest level of educational attainment. No education. Total + unit: "%" + short_unit: "%" + description_short: Percentage of the population aged 15 and older with no formal education. + description_from_producer: |- + Share of the population of the stated age group that has never attended school. Projections are based on collected census and survey data for the base year (around 2010) and the Medium Shared Socioeconomic Pathways (SSP2) projection model. The SSP2 is a middle-of-the-road scenario that combines medium fertility with medium mortality, medium migration, and the Global Education Trend (GET) education scenario. For more information and other projection models, consult the Wittgenstein Centre for Demography and Global Human Capital's website: http://www.oeaw.ac.at/vid/dataexplorer/ + + World Bank variable id: PRJ.ATT.15UP.NED.MF + + Original source: Wittgenstein Centre for Demography and Global Human Capital: http://www.oeaw.ac.at/vid/dataexplorer/ + display: + numDecimalPlaces: 1 + + saber__teachers__policy_goal_7__supporting_teachers_to_improve_instruction: + title: SABER rating of education policies aimed at preparing teachers with training + unit: score + description_short: The rating score reflects each country's level of effectiveness in preparing teachers through training and experience, assessed on a scale from 1 (Latent) to 4 (Advanced), based on criteria like the availability of professional development opportunities and their assignment based on perceived needs. + description_from_producer: |- + Data Interpretation: 1=Latent; 2=Emerging; 3=Established; 4=Advanced. For additional information, visit the SABER website: http://saber.worldbank.org/index.cfm + + World Bank variable id: SABER.TECH.GOAL7 + + Original source: Systems Approach for Better Education Results (SABER), World Bank + display: + numDecimalPlaces: 0 + + literacy_rate__adult_total__pct_of_people_ages_15_and_above: + title: Literacy rate. Adult total. % of people ages 15 and above + unit: "%" + short_unit: "%" + description_short: Percentage of people aged 15 and above who can read and write a short, simple statement on their everyday life. + description_from_producer: |- + Percentage of the population age 15 and above who can, with understanding, read and write a short, simple statement on their everyday life. Generally, ‘literacy’ also encompasses ‘numeracy’, the ability to make simple arithmetic calculations. This indicator is calculated by dividing the number of literates aged 15 years and over by the corresponding age group population and multiplying the result by 100.
+ + World Bank variable id: SE.ADT.LITR.ZS + + Original source: UNESCO Institute for Statistics + display: + numDecimalPlaces: 1 + + school_enrollment__preprimary__pct_gross: + title: Gross enrolment ratio in pre-primary education + unit: "%" + short_unit: "%" + description_short: Number of children of any age group who are enrolled in [pre-primary](#dod:pre-primary-education) education expressed as a percentage of the total population of the official pre-primary school age. + description_from_producer: |- + Total enrollment in pre-primary education, regardless of age, expressed as a percentage of the total population of official pre-primary education age. GER can exceed 100% due to the inclusion of over-aged and under-aged students because of early or late school entrance and grade repetition. + + World Bank variable id: SE.PRE.ENRR + + Original source: UNESCO Institute for Statistics + display: + numDecimalPlaces: 1 + + trained_teachers_in_primary_education__pct_of_total_teachers: + title: Share of teachers in primary education who are trained + unit: "%" + short_unit: "%" + description_short: Share of [primary](#dod:primary-education) school teachers who have completed either the training required before starting their teaching career (pre-service) or the ongoing professional development after becoming teachers (in-service) in their specific country. + description_from_producer: |- + Number of teachers who have received the minimum organized teacher training (pre-service or in-service) required for teaching at the primary level in the given country, expressed as a percentage of the total number of teachers at the primary level. + + World Bank variable id: SE.PRM.TCAQ.ZS + + Original source: UNESCO Institute for Statistics + display: + numDecimalPlaces: 1 + + + trained_teachers_in_secondary_education__pct_of_total_teachers: + title: Share of teachers in secondary education who are trained + unit: "%" + short_unit: "%" + description_short: Share of [secondary](#dod:secondary-education) school teachers who have completed either the training required before starting their teaching career (pre-service) or the ongoing professional development after becoming teachers (in-service) in their specific country. + description_from_producer: |- + Number of teachers who have received the minimum organized teacher training (pre-service or in-service) required for teaching at the secondary level in the given country, expressed as a percentage of the total number of teachers at the secondary level. + + World Bank variable id: SE.SEC.TCAQ.ZS + + Original source: UNESCO Institute for Statistics + display: + numDecimalPlaces: 1 + + government_expenditure_on_education__total__pct_of_gdp: + title: Government expenditure on education. Total. % of GDP + unit: "%" + short_unit: "%" + description_short: Public expenditure on education as a percentage of GDP. + description_from_producer: |- + Total general (local, regional and central) government expenditure on education (current, capital, and transfers), expressed as a percentage of GDP. It includes expenditure funded by transfers from international sources to government. Divide total government expenditure for a given level of education (ex. primary, secondary, or all levels combined) by the GDP, and multiply by 100. A higher percentage of GDP spent on education shows a higher government priority for education, but also a higher capacity of the government to raise revenues for public spending, in relation to the size of the country's economy. 
When interpreting this indicator, however, one should keep in mind that in some countries, the private sector and/or households may fund a higher proportion of total funding for education, thus making government expenditure appear lower than in other countries. For more information, consult the UNESCO Institute of Statistics website: http://www.uis.unesco.org/Education/ + + World Bank variable id: SE.XPD.TOTL.GD.ZS + + Original source: UNESCO Institute for Statistics + display: + numDecimalPlaces: 1 + + + literacy_rate__population_25_64_years__both_sexes__pct: + title: Adult literacy rate + unit: "%" + short_unit: "%" + description_short: Percentage of the population aged 25-64 who can read and write a short, simple statement on their everyday life. + description_from_producer: |- + Percentage of the population age 25-64 who can, with understanding, read and write a short, simple statement on their everyday life. Generally, ‘literacy’ also encompasses ‘numeracy’, the ability to make simple arithmetic calculations. This indicator is calculated by dividing the number of literates aged 25-64 years by the corresponding age group population and multiplying the result by 100. + + World Bank variable id: SE.ADT.LITR.ZS + + Original source: UNESCO Institute for Statistics + display: + numDecimalPlaces: 1 + + total_net_enrolment_rate__primary__gender_parity_index__gpi: + title: Gender parity in net enrolment rates in primary education + unit: index + description_short: Ratio of female to male net [primary education](#dod:primary-education) enrolment rates, where the net enrolment rate is the share of children of official primary school age who are enrolled in primary school. + description_from_producer: |- + Ratio of female total net enrolment rate for primary to the male total net enrolment rate for primary. It is calculated by dividing the female value for the indicator by the male value for the indicator. A GPI equal to 1 indicates parity between females and males. In general, a value less than 1 indicates disparity in favor of males and a value greater than 1 indicates disparity in favor of females. + + World Bank variable id: UIS.NERT.1.GPI + + Original source: UNESCO Institute for Statistics + display: + numDecimalPlaces: 2 + + total_net_enrolment_rate__lower_secondary__gender_parity_index__gpi: + title: Gender parity in net enrolment rates in lower-secondary education + unit: index + description_short: Ratio of female to male net [lower-secondary](#dod:lower-secondary-education) education enrolment rates, where the net enrolment rate is the share of children of official lower-secondary school age who are enrolled in lower-secondary school. + description_from_producer: |- + Ratio of female total net enrolment rate for lower secondary to the male total net enrolment rate for lower secondary. It is calculated by dividing the female value for the indicator by the male value for the indicator. A GPI equal to 1 indicates parity between females and males. In general, a value less than 1 indicates disparity in favor of males and a value greater than 1 indicates disparity in favor of females.
+ + World Bank variable id: UIS.NERT.2.GPI + + Original source: UNESCO Institute for Statistics + display: + numDecimalPlaces: 2 + + out_of_school_children__adolescents_and_youth_of_primary_and_secondary_school_age__female__number: + title: Out-of-school girls, adolescents and youth of primary and secondary school age + unit: girls + description_short: Number of girls, female adolescents and female youth of primary and secondary school age who are not enrolled in school. + description_from_producer: |- + Number of children, adolescents, and youth of primary, lower secondary, and upper secondary school age who are not enrolled or attending school in a given academic year. For more information, consult the UNESCO Institute of Statistics website: http://www.uis.unesco.org/Education/ + + World Bank variable id: UIS.OFST.1T3.F.CP + + Original source: UNESCO Institute for Statistics + display: + numDecimalPlaces: 0 + + out_of_school_children__adolescents_and_youth_of_primary_and_secondary_school_age__male__number: + title: Out-of-school boys, adolescents and youth of primary and secondary school age + unit: boys + description_short: Number of boys, male adolescents and male youth of primary and secondary school age who are not enrolled in school. + description_from_producer: |- + Number of children, adolescents, and youth of primary, lower secondary, and upper secondary school age who are not enrolled or attending school in a given academic year. For more information, consult the UNESCO Institute of Statistics website: http://www.uis.unesco.org/Education/ + + World Bank variable id: UIS.OFST.1T3.M.CP + + Original source: UNESCO Institute for Statistics + display: + numDecimalPlaces: 0 + + school_life_expectancy__primary__gender_parity_index__gpi: + title: Gender parity in primary school life expectancy + unit: index + description_short: Ratio of female to male primary school life expectancy, which is defined as the number of years a person of school entrance age can expect to spend within [primary](#dod:primary-education) education. + description_from_producer: |- + Ratio of female school life expectancy to the male school life expectancy. It is calculated by dividing the female value for the indicator by the male value for the indicator. A GPI equal to 1 indicates parity between females and males. In general, a value less than 1 indicates disparity in favor of males and a value greater than 1 indicates disparity in favor of females. + + World Bank variable id: UIS.SLE.1.GPI + + Original source: UNESCO Institute for Statistics + display: + numDecimalPlaces: 2 + + normalized_hci: + title: Normalized harmonized test scores + unit: score + description_short: The quality of schooling is assessed using the [harmonized learning scores](#dod:harmonized-scores), adjusted relative to the country with the highest performance, in this instance, Singapore. + description_from_producer: |- + Harmonized test scores from major international student achievement testing programs. They are measured in TIMSS-equivalent units, where 300 is minimal attainment and 625 is advanced attainment. Most recent estimates are used. Year of most recent estimate shown in data notes. + + Test scores from the following testing programs are included: + • TIMSS/PIRLS: Refers to the average of test scores from TIMSS (Trends in International Maths and Science Study) and PIRLS (Progress in International Reading Literacy Study), both carried out by the International Association for the Evaluation of Educational Achievement. Data from each PIRLS round is moved to the year of the nearest TIMSS round and averaged with the TIMSS data. + • PISA: Refers to test scores from the Programme for International Student Assessment. + • PISA+TIMSS/PIRLS: Refers to the average of these programs for countries and years where both are available. + • SACMEQ: Refers to test scores from the Southern and Eastern Africa Consortium for Monitoring Educational Quality. + • PASEC: Refers to test scores from the Program of Analysis of Education Systems. + • LLECE: Refers to test scores from the Latin American Laboratory for Assessment of the Quality of Education. + • PILNA: Refers to test scores from the Pacific Islands Literacy and Numeracy Assessment. + • EGRA: Refers to test scores from nationally-representative Early Grade Reading Assessments. + • EGRANR: Refers to test scores from non-nationally-representative Early Grade Reading Assessments. + + World Bank variable id: HD.HCI.HLO + + Original source: Patrinos and Angrist (2018). http://documents.worldbank.org/curated/en/390321538076747773/Global-Dataset-on-Education-Quality-A-Review-and-Update-2000-2017 + description_processing: |- + Harmonized test scores are normalized to the country with the highest performance, in this case, Singapore. The normalization process involves dividing the country's score by the highest score. + + combined_literacy: + title: Literacy rate + unit: "%" + description_short: Percentage of the population aged 15 and above who can read and write a short, simple statement on their everyday life. + description_processing: |- + **Recent estimates:** + + Percentage of the population between age 25 and age 64 who can, with understanding, read and write a short, simple statement on their everyday life. Generally, ‘literacy’ also encompasses ‘numeracy’, the ability to make simple arithmetic calculations. This indicator is calculated by dividing the number of literates aged 25-64 years by the corresponding age group population and multiplying the result by 100. + + World Bank variable id: UIS.LR.AG25T64 + + Original source: UNESCO Institute for Statistics + + **Historical literacy data:** + + The historical estimates in this long-run cross-country dataset were derived from a blend of diverse sources, each contributing to different time periods. For data before 1800, the dataset relies on the work of Buringh and Van Zanden (2009), which offers insights into literacy through the lens of manuscript and book production in Europe from the sixth to the eighteenth centuries. For the years 1820 and 1870 (excluding the United States), it incorporates data from Broadberry and O'Rourke's "The Cambridge Economic History of Modern Europe." The United States data comes from the National Center for Education Statistics. Additionally, global estimates for the period 1820-2000 are drawn from van Zanden and colleagues’ "How Was Life?: Global Well-being since 1820," an OECD publication. For historical estimates specific to Latin America, the dataset uses the Oxford Latin American Economic History Database (OxLAD). Each source follows a consistent conceptual definition of literacy, although discrepancies among sources are acknowledged, necessitating cautious interpretation of year-to-year changes. The dataset also includes instances where specific sources were preferred, such as opting for OxLAD data over the World Bank for Paraguay in 1982 due to significant differences in literacy rate estimates.
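+ # NOTE: The splicing of recent and historical estimates is implemented in combine_historical_literacy_expenditure() in edstats.py below; recent World Bank values take precedence over historical ones.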
+ display: + numDecimalPlaces: 1 + + combined_expenditure: + title: Public spending on education as a share of GDP + unit: "%" + description_short: Total general government expenditure on education (all levels of government and all levels of education), given as a share of GDP. + description_processing: |- + **Historical expenditure data:** + + Historical data in this dataset is based on a wide array of sources, reflecting a comprehensive approach to data collection across different time periods and regions. However, the diverse nature of these sources leads to inconsistencies, as methodologies and data quality vary between sources. For instance, older sources like the League of Nations Statistical Yearbook or Mitchell's 1962 data may use different metrics or collection methods compared to more modern sources like the OECD Education reports or UN surveys. This variance in source material and methodology means that direct comparisons across different years or countries might be challenging, necessitating careful interpretation and cross-reference for accuracy. The dataset serves as a rich historical repository but also underscores the complexities and challenges inherent in compiling and harmonizing historical data from multiple, diverse sources. + + **Recent estimates:** + + General government expenditure on education (current, capital, and transfers) is expressed as a percentage of GDP. It includes expenditure funded by transfers from international sources to government. General government usually refers to local, regional and central governments. + + World Bank variable id: SE.XPD.TOTL.GD.ZS + + Original source: UNESCO Institute for Statistics (UIS). UIS.Stat Bulk Data Download Service. Accessed October 24, 2022. + display: + numDecimalPlaces: 1 \ No newline at end of file diff --git a/etl/steps/data/garden/wb/2024-11-04/edstats.py b/etl/steps/data/garden/wb/2024-11-04/edstats.py new file mode 100644 index 00000000000..c5d89aa8b88 --- /dev/null +++ b/etl/steps/data/garden/wb/2024-11-04/edstats.py @@ -0,0 +1,164 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import owid.catalog.processing as pr +from owid.catalog import Table +from owid.catalog.utils import underscore + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +REGIONS = ["North America", "South America", "Europe", "Africa", "Asia", "Oceania", "World"] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("edstats") + tb = ds_meadow["edstats"].reset_index() + + # Copy the table with just the metadata columns + metadata_tb = tb.loc[:, ["indicator_name", "indicator_code", "source_note", "source"]] + + # Load historical literacy data + ds_literacy = paths.load_dataset("literacy_rates") + tb_literacy = ds_literacy["literacy_rates"] + + # Load historical public expenditure data + ds_expenditure = paths.load_dataset("public_expenditure") + tb_expenditure = ds_expenditure["public_expenditure"] + # + # Process data.
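+ # The steps below: (1) harmonize country names, (2) pivot the long table so each indicator code becomes a column, (3) normalize harmonized test scores by the top score, (4) splice recent and historical literacy/expenditure series, and (5) rename columns using the WB metadata.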
+ # + tb = geo.harmonize_countries( + df=tb, + excluded_countries_file=paths.excluded_countries_path, + countries_file=paths.country_mapping_path, + ) + tb = tb.drop("source", axis=1) + + # Pivot the dataframe so that each indicator is a separate column + tb = tb.pivot(index=["country", "year"], columns="indicator_code", values="value") + tb = tb.reset_index() + + # Find the maximum value in the 'HD.HCI.HLOS' column (Harmonized Test Scores) + max_value = tb["HD.HCI.HLOS"].max() + + # Normalize every value in the 'HD.HCI.HLOS' column by the maximum value (score relative to the top-performing country, Singapore) + tb["normalized_hci"] = tb["HD.HCI.HLOS"] / max_value + + # Combine recent literacy estimates and expenditure data with historical estimates from a migrated dataset + tb = combine_historical_literacy_expenditure(tb, tb_literacy, tb_expenditure) + + # Rename columns based on metadata + tb = rename_columns(tb, metadata_tb) + + # Convert the share of the population with no education to a percentage (bug in the data) + tb[ + "wittgenstein_projection__percentage_of_the_population_age_15plus_by_highest_level_of_educational_attainment__no_education__total" + ] *= 100 + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def combine_historical_literacy_expenditure(tb: Table, tb_literacy: Table, tb_expenditure: Table) -> Table: + """ + Merge historical and recent literacy and expenditure data into a single Table. + + This function combines data from two separate Tables containing historical literacy rates and + public expenditure on education with a primary WB Table. Missing data is handled by favoring recent World Bank data; if this is not available, + it falls back to historical data, which could also be missing (NaN).
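+ + Example (hypothetical numbers): if SE.ADT.LITR.ZS is missing for a country in 1900 but the historical source reports 21%, combined_literacy is 21; for 2010, where the World Bank reports 95%, the World Bank value is kept.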
+ + """ + + historic_literacy = ( + tb_literacy[["literacy_rates__world_bank__cia_world_factbook__and_other_sources"]].reset_index().copy() + ) + historic_expenditure = ( + tb_expenditure[["public_expenditure_on_education__tanzi__and__schuktnecht__2000"]].reset_index().copy() + ) + # Recent literacy rates + recent_literacy = tb[["year", "country", "SE.ADT.LITR.ZS"]].copy() + + # Recent public expenditure + recent_expenditure = tb[["year", "country", "SE.XPD.TOTL.GD.ZS"]].copy() + + # Merge the historic and more recent literacy data based on 'year' and 'country' + combined_df = pr.merge( + historic_literacy, + recent_literacy, + on=["year", "country"], + how="outer", + suffixes=("_historic_lit", "_recent_lit"), + ) + + # Merge the historic expenditure with newly created literacy table based on 'year' and 'country' + combined_df = pr.merge(combined_df, historic_expenditure, on=["year", "country"], how="outer") + + # Merge the recent expenditure with newly created literacy and historic expenditure table based on 'year' and 'country' + combined_df = pr.merge( + combined_df, recent_expenditure, on=["year", "country"], how="outer", suffixes=("_historic_exp", "_recent_exp") + ) + combined_df["combined_literacy"] = combined_df["SE.ADT.LITR.ZS"].fillna( + combined_df["literacy_rates__world_bank__cia_world_factbook__and_other_sources"] + ) + combined_df["combined_expenditure"] = combined_df["SE.XPD.TOTL.GD.ZS"].fillna( + combined_df["public_expenditure_on_education__tanzi__and__schuktnecht__2000"] + ) + + # Now, merge the relevant columns in newly created table that includes both historic and more recent data back into the original tb based on 'year' and 'country' + tb = pr.merge( + tb, + combined_df[["year", "country", "combined_literacy", "combined_expenditure"]], + on=["year", "country"], + how="outer", + ) + + return tb + + +def rename_columns(tb: Table, metadata_tb: Table) -> Table: + "Rename columns in the table based on the metadata table." + for column in tb.columns: + if column not in [ + "country", + "year", + "normalized_hci", + "combined_literacy", + "combined_expenditure", + ]: + # Extract relevant name. 
+ name = ( + metadata_tb.loc[metadata_tb["indicator_code"] == column, "indicator_name"] + .str.replace("‚", "") # this comma-like character caused problems when renaming variables later on + .iloc[0] + ) + # Drop the last 10 words if the length of the string exceeds 250 characters + if len(name) > 250: + # Separate the string into words and truncate + words = name.split() + # Get all words up to the tenth-to-last word + selected_words = words[:-10] + # Reconstruct the selected words into a single string + name = " ".join(selected_words) + + # Convert the name to underscore format + new_column_name = underscore(name) + + # Update the column names + tb.rename(columns={column: new_column_name}, inplace=True) + return tb diff --git a/etl/steps/data/garden/wb/2024-12-03/poverty_projections.countries.json b/etl/steps/data/garden/wb/2024-12-03/poverty_projections.countries.json new file mode 100644 index 00000000000..7ca68192813 --- /dev/null +++ b/etl/steps/data/garden/wb/2024-12-03/poverty_projections.countries.json @@ -0,0 +1,10 @@ +{ + "EAP": "East Asia and Pacific (PIP)", + "ECA": "Europe and Central Asia (PIP)", + "LAC": "Latin America and the Caribbean (PIP)", + "MNA": "Middle East and North Africa (PIP)", + "OHI": "Other high income countries (PIP)", + "SAS": "South Asia (PIP)", + "SSA": "Sub-Saharan Africa (PIP)", + "World": "World" +} \ No newline at end of file diff --git a/etl/steps/data/garden/wb/2024-12-03/poverty_projections.meta.yml b/etl/steps/data/garden/wb/2024-12-03/poverty_projections.meta.yml new file mode 100644 index 00000000000..f3f04bbd41d --- /dev/null +++ b/etl/steps/data/garden/wb/2024-12-03/poverty_projections.meta.yml @@ -0,0 +1,108 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + processing_level: minor + display: &common-display + tolerance: 0 + entityAnnotationsMap: |- + Other high income countries (PIP): e.g. US, Western Europe, Australia, Japan, South Korea and Saudi Arabia + presentation: + topic_tags: + - Poverty + + description_key_povertyline: |- + <% if povertyline == "2.15" %> + Extreme poverty here is defined as living below the International Poverty Line of $2.15 per day. + <% elif povertyline == "3.65" %> + A poverty line of $3.65 a day represents definitions of national poverty lines in lower-middle-income countries. + <% elif povertyline == "6.85" %> + A poverty line of $6.85 a day represents definitions of national poverty lines in upper-middle-income countries. + <%- endif -%> + + description_key_ppp: |- + The data is measured in international-$ at 2017 prices – this adjusts for inflation and for differences in the cost of living between countries. + + description_key_income_consumption: |- + Depending on the country and year, the data relates to income measured after taxes and benefits, or to consumption, per capita. "Per capita" means that the income of each household is attributed equally to each member of the household (including children). + + description_key_nonmarket_income: |- + Non-market sources of income, including food grown by subsistence farmers for their own consumption, are taken into account. + + description_key_scenarios: |- + <% if scenario == "Historical estimates" %> + Estimates are based on household surveys or extrapolated up until the year of the data release using GDP growth estimates and forecasts.
For more details about the methodology, please refer to the [World Bank PIP documentation](https://datanalytics.worldbank.org/PIP-Methodology/lineupestimates.html#nowcasts). + <% elif scenario == "Current forecast + historical growth projections" %> + This data is a projection of the estimates based on GDP growth projections from the World Bank's Global Economic Prospects and the Macro Poverty Outlook, together with the IMF's World Economic Outlook, in the period 2025-2029. For the period 2030-2050, the data is projected using the average annual historical GDP per capita growth over 2010-2019. + <% elif scenario == "Historical estimates + projections" %> + This data combines estimates based on household surveys or extrapolated up until the year of the data release using GDP growth estimates and forecasts, with projections based on GDP growth projections from the World Bank's Global Economic Prospects and the Macro Poverty Outlook, together with the IMF's World Economic Outlook, in the period 2025-2029. For the period 2030-2050, the data is projected using the average annual historical GDP per capita growth over 2010-2019. + <% elif scenario == "2% growth projections" %> + This data is a projection of the estimates based on a scenario of 2% average GDP per capita growth, while keeping income inequality constant. + <% elif scenario == "2% growth + Gini reduction 1% projections" %> + This data is a projection of the estimates based on a scenario of 2% average GDP per capita growth, while reducing income inequality by 1% of the Gini coefficient per year. + <% elif scenario == "2% growth + Gini reduction 2% projections" %> + This data is a projection of the estimates based on a scenario of 2% average GDP per capita growth, while reducing income inequality by 2% of the Gini coefficient per year. + <% elif scenario == "4% growth projections" %> + This data is a projection of the estimates based on a scenario of 4% average GDP per capita growth, while keeping income inequality constant. + <% elif scenario == "6% growth projections" %> + This data is a projection of the estimates based on a scenario of 6% average GDP per capita growth, while keeping income inequality constant. + <% elif scenario == "8% growth projections" %> + This data is a projection of the estimates based on a scenario of 8% average GDP per capita growth, while keeping income inequality constant.
+ <%- endif -%> + + isprojection_by_scenario: |- + <% if scenario == "Historical estimates" or scenario == "Historical estimates + projections" %> + false + <% else %> + true + <%- endif -%> + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + title: Poverty projections by the World Bank + update_period_days: 681 + + +tables: + poverty_projections: + variables: + fgt0: + title: $<<povertyline>> a day - Share of population in poverty (<<scenario>>) + unit: "%" + short_unit: "%" + description_short: "Percentage of population living in households with an income or consumption per person below $<<povertyline>> a day" + description_key: + - "{definitions.description_key_povertyline}" + - "{definitions.description_key_ppp}" + - "{definitions.description_key_income_consumption}" + - "{definitions.description_key_nonmarket_income}" + - "{definitions.description_key_scenarios}" + presentation: + title_public: Share of population living in poverty + title_variant: $<<povertyline>> a day, <<scenario>> + display: + name: Share of population living below $<<povertyline>> a day (<<scenario>>) + numDecimalPlaces: 1 + isProjection: "{definitions.isprojection_by_scenario}" + <<: *common-display + + poorpop: + title: $<<povertyline>> a day - Number of people in poverty (<<scenario>>) + unit: "people" + short_unit: "" + description_short: "Number of people living in households with an income or consumption per person below $<<povertyline>> a day" + description_key: + - "{definitions.description_key_povertyline}" + - "{definitions.description_key_ppp}" + - "{definitions.description_key_income_consumption}" + - "{definitions.description_key_nonmarket_income}" + - "{definitions.description_key_scenarios}" + presentation: + title_public: Number of people living in poverty + title_variant: $<<povertyline>> a day, <<scenario>> + display: + name: Number of people living below $<<povertyline>> a day (<<scenario>>) + numDecimalPlaces: 0 + isProjection: "{definitions.isprojection_by_scenario}" + <<: *common-display diff --git a/etl/steps/data/garden/wb/2024-12-03/poverty_projections.py b/etl/steps/data/garden/wb/2024-12-03/poverty_projections.py new file mode 100644 index 00000000000..66e637c2fcd --- /dev/null +++ b/etl/steps/data/garden/wb/2024-12-03/poverty_projections.py @@ -0,0 +1,135 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import owid.catalog.processing as pr +from owid.catalog import Table +from owid.datautils.dataframes import map_series + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Define latest year without projections +LATEST_YEAR_WITHOUT_PROJECTIONS = 2024 + +# Define tables to be loaded. The country table is not processed, because it was created for the aggregations rather than to be highlighted on its own. +TABLES = ["region", "global"] + +# Define scenarios and new names +SCENARIOS = { + "historical": "Historical estimates", + "current_forecast": "Current forecast + historical growth projections", + "2pct": "2% growth projections", + "2pct_gini1": "2% growth + Gini reduction 1% projections", + "2pct_gini2": "2% growth + Gini reduction 2% projections", + "4pct": "4% growth projections", + "6pct": "6% growth projections", + "8pct": "8% growth projections", +} + +# Define index columns +INDEX_COLUMNS = ["country", "year", "povertyline", "scenario"] + +# Define indicator columns +INDICATOR_COLUMNS = ["fgt0", "poorpop"] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset.
+ ds_meadow = paths.load_dataset("poverty_projections") + + # Read tables from meadow dataset. + tables = [ds_meadow.read(table_name) for table_name in TABLES] + + # + # Process data. + # + # Concatenate tables + tb = pr.concat(tables, ignore_index=True) + + # Multiply poorpop by 1_000_000 + tb["poorpop"] = tb["poorpop"] * 1_000_000 + + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + ) + + tb = connect_estimates_with_projections(tb) + + # Rename scenario column + tb["scenario"] = map_series( + series=tb["scenario"], + mapping=SCENARIOS, + ) + + # Recover origins + tb["scenario"] = tb["scenario"].copy_metadata(tb["country"]) + + tb = tb.format(INDEX_COLUMNS, short_name="poverty_projections") + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def connect_estimates_with_projections(tb: Table) -> Table: + """ + Connect estimates with projections for visualizations in Grapher. + This repeats the latest estimate of the historical scenario in each of the other scenarios, so that estimate and projection lines connect. + """ + + tb = tb.copy() + + # Save tb_historical and tb_current_forecast, by filtering the scenario column on historical and current_forecast + tb_historical = tb[tb["scenario"] == "historical"].copy().reset_index(drop=True) + tb_current_forecast = tb[tb["scenario"] == "current_forecast"].copy().reset_index(drop=True) + + # Make table wider, by using scenario as columns + tb = tb.pivot(index=["country", "year", "povertyline"], columns="scenario", values=INDICATOR_COLUMNS) + + # For year LATEST_YEAR_WITHOUT_PROJECTIONS, fill the rest of the columns with the same value + for indicator in INDICATOR_COLUMNS: + for scenario in SCENARIOS.keys(): + if scenario != "historical": + tb.loc[ + tb.index.get_level_values("year") == LATEST_YEAR_WITHOUT_PROJECTIONS, (indicator, scenario) + ] = tb.loc[ + tb.index.get_level_values("year") == LATEST_YEAR_WITHOUT_PROJECTIONS, (indicator, scenario) + ].combine_first( + tb.loc[ + tb.index.get_level_values("year") == LATEST_YEAR_WITHOUT_PROJECTIONS, (indicator, "historical") + ] + ) + + # Make table long again, by creating a scenario column + tb = tb.stack(level="scenario", future_stack=True).reset_index() + + # Recover origins + for indicator in INDICATOR_COLUMNS: + tb[indicator] = tb[indicator].copy_metadata(tb["country"]) + + # Combine historical and current_forecast, by concatenating tb_historical and tb_current_forecast + tb_connected = pr.concat([tb_historical, tb_current_forecast], ignore_index=True) + + # Set the scenario column to "Historical estimates + projections" + tb_connected["scenario"] = "Historical estimates + projections" + + # Keep only the columns in INDEX_COLUMNS and INDICATOR_COLUMNS + tb_connected = tb_connected[INDEX_COLUMNS + INDICATOR_COLUMNS] + + # Concatenate tb and tb_connected + tb = pr.concat([tb, tb_connected], ignore_index=True) + + return tb diff --git a/etl/steps/data/garden/who/2022-09-30/ghe.meta.yml b/etl/steps/data/garden/who/2022-09-30/ghe.meta.yml index ccfd9781552..b16d1b679f1 100644 --- a/etl/steps/data/garden/who/2022-09-30/ghe.meta.yml +++ b/etl/steps/data/garden/who/2022-09-30/ghe.meta.yml @@ -113,7 +113,7 @@ definitions: Estimated number of [DALYs](#dod:dalys) from << cause.lower() >> among {definitions.sex} aged {definitions.age}, per 100,000 people.
<%- endif -%> footnote: |- - <%- if age == "Age-standardized" -%>To allow for comparisons between countries and over time, this metric is [age-standardized](#dod:age_standardized).<%- endif -%> + <%- if age_group is defined and age_group == "Age-standardized" -%>To allow for comparisons between countries and over time, this metric is [age-standardized](#dod:age_standardized).<%- endif -%> tables: ghe: variables: diff --git a/etl/steps/data/garden/who/2022-09-30/ghe.py b/etl/steps/data/garden/who/2022-09-30/ghe.py index f73d1aea4c9..d2f40f14a0c 100644 --- a/etl/steps/data/garden/who/2022-09-30/ghe.py +++ b/etl/steps/data/garden/who/2022-09-30/ghe.py @@ -50,7 +50,7 @@ def run(dest_dir: str) -> None: regions = paths.load_dataset("regions")["regions"] # Load WHO Standard population snap = paths.load_snapshot("standard_age_distribution.csv") - who_standard = snap.read() + who_standard = snap.read(safe_types=False) who_standard = format_who_standard(who_standard) # Read population dataset ds_population = paths.load_dataset("un_wpp") diff --git a/etl/steps/data/garden/who/2023-03-09/gho_suicides.py b/etl/steps/data/garden/who/2023-03-09/gho_suicides.py index 8761db74138..0b972975e69 100644 --- a/etl/steps/data/garden/who/2023-03-09/gho_suicides.py +++ b/etl/steps/data/garden/who/2023-03-09/gho_suicides.py @@ -97,6 +97,8 @@ def process_ratio(df: pd.DataFrame) -> pd.DataFrame: df_female = df[df["sex"] == "female"].drop(columns=["sex"]) # Merge data by year and country df_ratio = df_male.merge(df_female, on=["country", "year"], suffixes=("_m", "_f")) + # Don't divide by zero + df_ratio = df_ratio[df_ratio.suicide_rate_f != 0] # Estimate ratio df_ratio["suicide_rate_male_to_female"] = df_ratio["suicide_rate_m"] / df_ratio["suicide_rate_f"] # Keep only relevant columns diff --git a/etl/steps/data/garden/who/2024-07-26/mortality_database.meta.yml b/etl/steps/data/garden/who/2024-07-26/mortality_database.meta.yml index 281c07a235f..2c481de1635 100644 --- a/etl/steps/data/garden/who/2024-07-26/mortality_database.meta.yml +++ b/etl/steps/data/garden/who/2024-07-26/mortality_database.meta.yml @@ -5,8 +5,8 @@ definitions: topic_tags: - Causes of Death processing_level: minor - -# Learn more about the available fields: + annotation: |- + <% if cause == 'Maternal conditions' %>United States: Values from 2003–2017 affected by measurement change<%- endif -%> # http://docs.owid.io/projects/etl/architecture/metadata/reference/ dataset: update_period_days: 365 @@ -22,6 +22,8 @@ tables: unit: deaths display: numDecimalPlaces: 0 + entityAnnotationsMap: |- + {definitions.annotation} presentation: grapher_config: note: |- @@ -45,6 +47,8 @@ tables: unit: deaths per 100,000 people display: numDecimalPlaces: 1 + entityAnnotationsMap: |- + {definitions.annotation} presentation: grapher_config: note: |- @@ -56,6 +60,8 @@ tables: unit: deaths per 100,000 people display: numDecimalPlaces: 1 + entityAnnotationsMap: |- + {definitions.annotation} presentation: grapher_config: note: |- diff --git a/etl/steps/data/garden/who/2024-07-26/mortality_database.py b/etl/steps/data/garden/who/2024-07-26/mortality_database.py index b453039ae0d..9c5ec813ac4 100644 --- a/etl/steps/data/garden/who/2024-07-26/mortality_database.py +++ b/etl/steps/data/garden/who/2024-07-26/mortality_database.py @@ -15,7 +15,7 @@ def run(dest_dir: str) -> None: # # Load meadow dataset. 
ds_meadow = paths.load_dataset("mortality_database") - tb = ds_meadow.read_table("mortality_database") + tb = ds_meadow.read("mortality_database", safe_types=False) # # Process data. # diff --git a/etl/steps/data/garden/who/2024-07-30/ghe.meta.yml b/etl/steps/data/garden/who/2024-07-30/ghe.meta.yml index 945ba3eb5bd..b575ed71b7d 100644 --- a/etl/steps/data/garden/who/2024-07-30/ghe.meta.yml +++ b/etl/steps/data/garden/who/2024-07-30/ghe.meta.yml @@ -11,109 +11,109 @@ definitions: - World processing_level: major age: |- - <%- if age_group == "ALLAges" -%> + <% if age_group == "ALLAges" %> all ages - <%- elif age_group == "age-standardized" -%> + <%- elif age_group == "age-standardized" %> an age-standardized population - <%- elif age_group == "YEARS0-14" -%> + <%- elif age_group == "YEARS0-14" %> 0-14 year olds - <%- elif age_group == "YEARS0-4" -%> + <%- elif age_group == "YEARS0-4" %> 0-4 year olds - <%- elif age_group == "YEARS5-14" -%> + <%- elif age_group == "YEARS5-14" %> 5-14 year olds - <%- elif age_group == "YEARS15-19" -%> + <%- elif age_group == "YEARS15-19" %> 15-19 year olds - <%- elif age_group == "YEARS15-49" -%> + <%- elif age_group == "YEARS15-49" %> 15-49 year olds - <%- elif age_group == "YEARS20-24" -%> + <%- elif age_group == "YEARS20-24" %> 20-24 year olds - <%- elif age_group == "YEARS25-34" -%> + <%- elif age_group == "YEARS25-34" %> 25-34 year olds - <%- elif age_group == "YEARS35-44" -%> + <%- elif age_group == "YEARS35-44" %> 35-44 year olds - <%- elif age_group == "YEARS45-54" -%> + <%- elif age_group == "YEARS45-54" %> 45-54 year olds - <%- elif age_group == "YEARS50-69" -%> + <%- elif age_group == "YEARS50-69" %> 50-69 year olds - <%- elif age_group == "YEARS55-64" -%> + <%- elif age_group == "YEARS55-64" %> 55-64 year olds - <%- elif age_group == "YEARS65-74" -%> + <%- elif age_group == "YEARS65-74" %> 65-74 year olds - <%- elif age_group == "YEARS70+" -%> + <%- elif age_group == "YEARS70+" %> 70+ year olds - <%- elif age_group == "YEARS75-84" -%> + <%- elif age_group == "YEARS75-84" %> 75-84 year olds - <%- elif age_group == "YEARS85PLUS" -%> + <%- elif age_group == "YEARS85PLUS" %> 85+ year olds - <%- endif -%> + <%- endif %> sex: |- - <%- if sex == "Both sexes" %>both sexes<% elif sex == "Male" %>males<% elif sex == "Female" %>females<% endif -%> + <% if sex == "Both sexes" %>both sexes<% elif sex == "Male" %>males<% elif sex == "Female" %>females<% endif %> deaths_title: |- - <%- if age_group == "ALLAges" -%> + <% if age_group == "ALLAges" %> Total deaths from << cause.lower() >> among {definitions.sex} - <%- elif age_group == "Age-standardized" -%> + <%- elif age_group == "Age-standardized" %> Age-standardized deaths from << cause.lower() >> among {definitions.sex} - <%- else -%> + <%- else %> Deaths from << cause.lower() >> among {definitions.sex} aged {definitions.age} - <%- endif -%> + <%- endif %> deaths_description: |- - <%- if age_group == "ALLAges" -%> + <% if age_group == "ALLAges" %> Estimated number of deaths from << cause.lower() >> in {definitions.sex}. - <%- elif age_group == "Age-standardized" -%> + <%- elif age_group == "Age-standardized" %> Estimated number of age-standardized deaths from << cause.lower() >> in {definitions.sex}. - <%- else -%> - Estimated number of deaths from << cause.lower() >> among {definitions.sex} aged {definitions.age}. - <%- endif -%> + <%- else %> + Estimated number of deaths from << cause.lower() >> among {definitions.sex} aged {definitions.age}. 
+ <%- endif %> death_rate_title: |- - <%- if age_group == "ALLAges" -%> + <% if age_group == "ALLAges" %> Death rate from << cause.lower() >> among {definitions.sex} - <%- elif age_group == "Age-standardized" -%> + <%- elif age_group == "Age-standardized" %> Age-standardized death rate from << cause.lower() >> among {definitions.sex} - <%- else -%> + <%- else %> Death rate from << cause.lower() >> among {definitions.sex} aged {definitions.age} - <%- endif -%> + <%- endif %> death_rate_description: |- - <%- if age_group == "ALLAges" -%> + <% if age_group == "ALLAges" %> Estimated number of deaths from << cause.lower() >> in {definitions.sex}, per 100,000 people. - <%- elif age_group == "Age-standardized" -%> + <%- elif age_group == "Age-standardized" %> Estimated number of age-standardized deaths from << cause.lower() >> in {definitions.sex}, per 100,000 people. - <%- else -%> + <%- else %> Estimated number of deaths from << cause.lower() >> among {definitions.sex} aged {definitions.age}, per 100,000 people. - <%- endif -%> + <%- endif %> dalys_title: |- - <%- if age_group == "ALLAges" -%> + <% if age_group == "ALLAges" %> DALYs from << cause.lower() >> among {definitions.sex} - <%- elif age_group == "Age-standardized" -%> + <%- elif age_group == "Age-standardized" %> Age-standardized DALYs from << cause.lower() >> among {definitions.sex} - <%- else -%> + <%- else %> DALYs from << cause.lower() >> among {definitions.sex} aged {definitions.age} - <%- endif -%> + <%- endif %> dalys_description: |- - <%- if age_group == "ALLAges" -%> + <% if age_group == "ALLAges" %> Estimated number of [DALYs](#dod:dalys) from << cause.lower() >> in {definitions.sex}. - <%- elif age_group == "Age-standardized" -%> + <%- elif age_group == "Age-standardized" %> Estimated number of age-standardized [DALYs](#dod:dalys) from << cause.lower() >> in {definitions.sex}. - <%- else -%> + <%- else %> Estimated number of [DALYs](#dod:dalys) from << cause.lower() >> among {definitions.sex} aged {definitions.age}. - <%- endif -%> + <%- endif %> dalys_rate_title: |- - <%- if age_group == "ALLAges" -%> + <% if age_group == "ALLAges" %> DALYs from << cause.lower() >>, among {definitions.sex} per 100,000 people - <%- elif age_group == "Age-standardized" -%> + <%- elif age_group == "Age-standardized" %> Age-standardized DALYs from << cause.lower() >> among {definitions.sex}, per 100,000 people - <%- else -%> + <%- else %> DALYs from << cause.lower() >> among {definitions.sex} aged {definitions.age}, per 100,000 people - <%- endif -%> + <%- endif %> dalys_rate_description: |- - <%- if age_group == "ALLAges" -%> + <% if age_group == "ALLAges" %> Estimated number of [DALYs](#dod:dalys) from << cause.lower() >> in {definitions.sex}, per 100,000 people. - <%- elif age_group == "Age-standardized" -%> + <%- elif age_group == "Age-standardized" %> Estimated number of age-standardized [DALYs](#dod:dalys) from << cause.lower() >> in {definitions.sex}, per 100,000 people. - <%- else -%> + <%- else %> Estimated number of [DALYs](#dod:dalys) from << cause.lower() >> among {definitions.sex} aged {definitions.age}, per 100,000 people. 
- <%- endif -%> + <%- endif %> footnote: |- - <%- if age == "Age-standardized" %>To allow for comparisons between countries and over time, this metric is [age-standardized](#dod:age_standardized).<%- endif -%> + <% if age_group is defined and age_group == "Age-standardized" %>To allow for comparisons between countries and over time, this metric is [age-standardized](#dod:age_standardized).<% endif %> tables: ghe: variables: diff --git a/etl/steps/data/garden/who/2024-07-30/ghe.py b/etl/steps/data/garden/who/2024-07-30/ghe.py index 4a9fc1e1c30..bf1e01984a8 100644 --- a/etl/steps/data/garden/who/2024-07-30/ghe.py +++ b/etl/steps/data/garden/who/2024-07-30/ghe.py @@ -71,20 +71,20 @@ def run(dest_dir: str) -> None: # read dataset from meadow ds_meadow = paths.load_dataset() - tb = ds_meadow.read_table("ghe") + tb = ds_meadow.read("ghe", safe_types=False) tb = tb.drop(columns="flag_level") tb = rename_table_for_compatibility(tb) if SUBSET: - required_causes = ["Drug use disorders", "Alcohol use disorders"] + required_causes = ["Drug use disorders", "Alcohol use disorders", "Self-harm"] tb = tb[tb.cause.isin(SUBSET.split(",") + required_causes)] # Load countries regions regions = paths.load_dataset("regions")["regions"] # Load WHO Standard population snap = paths.load_snapshot("standard_age_distribution.csv") - who_standard = snap.read() + who_standard = snap.read(safe_types=False) who_standard = format_who_standard(who_standard) # Read population dataset ds_population = paths.load_dataset("un_wpp") @@ -114,7 +114,7 @@ def run(dest_dir: str) -> None: def rename_table_for_compatibility(tb: Table) -> Table: """Rename columns and labels to be compatible with the previous version of the dataset.""" - tb.age_group = tb.age_group.map(AGE_GROUPS_MAP) + tb.age_group = dataframes.map_series(tb.age_group, AGE_GROUPS_MAP) tb = tb.rename( columns={ "val_dths_count_numeric": "death_count", diff --git a/etl/steps/data/garden/who/2024-08-06/mortality_database_cancer.py b/etl/steps/data/garden/who/2024-08-06/mortality_database_cancer.py index c1dd63279dc..fc739d0178e 100644 --- a/etl/steps/data/garden/who/2024-08-06/mortality_database_cancer.py +++ b/etl/steps/data/garden/who/2024-08-06/mortality_database_cancer.py @@ -15,7 +15,8 @@ def run(dest_dir: str) -> None: # # Load meadow dataset. ds_meadow = paths.load_dataset("mortality_database_cancer") - tb = ds_meadow.read_table("mortality_database_cancer") + tb = ds_meadow.read("mortality_database_cancer", safe_types=False) + # # Process data. # diff --git a/etl/steps/data/garden/who/2024-08-06/mortality_database_cancer_most_common.py b/etl/steps/data/garden/who/2024-08-06/mortality_database_cancer_most_common.py index aaf90b63164..3503a56d6cb 100644 --- a/etl/steps/data/garden/who/2024-08-06/mortality_database_cancer_most_common.py +++ b/etl/steps/data/garden/who/2024-08-06/mortality_database_cancer_most_common.py @@ -12,7 +12,7 @@ def run(dest_dir: str) -> None: # # Load meadow dataset. ds_meadow = paths.load_dataset("mortality_database_cancer") - tb = ds_meadow.read_table("mortality_database_cancer") + tb = ds_meadow.read("mortality_database_cancer") # # Process data. 
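The ghe.meta.yml edits above mostly relax Jinja whitespace control, replacing "<%- ... -%>" tags with plain "<% ... %>". A minimal standalone sketch of the difference follows, assuming a jinja2 Environment configured with the "<% %>" / "<< >>" delimiter style visible in these templates; the ETL's actual template configuration is not shown in this diff.

# Sketch only: illustrates Jinja2 whitespace control, not part of the ETL codebase.
from jinja2 import Environment

env = Environment(
    block_start_string="<%",
    block_end_string="%>",
    variable_start_string="<<",
    variable_end_string=">>",
)

# "<%-" strips the whitespace before the tag and "-%>" strips the newline after
# it, which can glue rendered metadata text onto the surrounding lines...
aggressive = env.from_string("Deaths:\n<%- if x %> many <%- endif -%>\n!")
# ...while plain "<%"/"%>" tags leave the surrounding whitespace intact.
relaxed = env.from_string("Deaths:\n<% if x %> many <% endif %>\n!")

print(repr(aggressive.render(x=True)))  # 'Deaths: many!'
print(repr(relaxed.render(x=True)))     # 'Deaths:\n many \n!'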
diff --git a/etl/steps/data/garden/wid/2024-05-24/world_inequality_database.meta.yml b/etl/steps/data/garden/wid/2024-05-24/world_inequality_database.meta.yml index 857a41e9d98..10a9a8fc9c2 100644 --- a/etl/steps/data/garden/wid/2024-05-24/world_inequality_database.meta.yml +++ b/etl/steps/data/garden/wid/2024-05-24/world_inequality_database.meta.yml @@ -6,7 +6,6 @@ definitions: - Economic Inequality grapher_config: originUrl: https://ourworldindata.org/economic-inequality - $schema: https://files.ourworldindata.org/schemas/grapher-schema.003.json processing_level: major @@ -95,6 +94,14 @@ tables: - China - France + + p90p100_share_wealth: + presentation: + title_public: Wealth share of the richest 10% + + p99p100_share_wealth: + presentation: + title_public: Wealth share of the richest 1% + world_inequality_database_fiscal: common: description_key: diff --git a/etl/steps/data/garden/wid/2024-05-24/world_inequality_database.py b/etl/steps/data/garden/wid/2024-05-24/world_inequality_database.py index 9df7216915a..f509aa232f7 100644 --- a/etl/steps/data/garden/wid/2024-05-24/world_inequality_database.py +++ b/etl/steps/data/garden/wid/2024-05-24/world_inequality_database.py @@ -6,7 +6,6 @@ """ - import owid.catalog.processing as pr from owid.catalog import Table from shared import add_metadata_vars, add_metadata_vars_distribution diff --git a/etl/steps/data/garden/worldbank_wdi/2023-05-29/wdi.meta.yml b/etl/steps/data/garden/worldbank_wdi/2023-05-29/wdi.meta.yml index f131c9cb6c4..f9124b59ec0 100644 --- a/etl/steps/data/garden/worldbank_wdi/2023-05-29/wdi.meta.yml +++ b/etl/steps/data/garden/worldbank_wdi/2023-05-29/wdi.meta.yml @@ -4057,10 +4057,12 @@ tables: title: Average transaction cost of sending remittances to a specific country (%) short_unit: '%' unit: '%' + description_processing: We calculated the average transaction cost of regions by taking the average cost of each country, weighted by the volume of remittances sent to that country. Only countries where both average remittance costs and total remittance inflows are available are included in the calculation. si_rmt_cost_ob_zs: title: Average transaction cost of sending remittances from a specific country (%) short_unit: '%' unit: '%' + description_processing: We calculated the average transaction cost of regions by taking the average cost of each country, weighted by the volume of remittances sent from that country. Only countries where both average remittance costs and total remittance outflows are available are included in the calculation. si_spr_pc40: title: Survey mean consumption or income per capita, bottom 40% of population (2017 PPP $ per day) short_unit: $ diff --git a/etl/steps/data/garden/worldbank_wdi/2024-05-20/wdi.meta.yml b/etl/steps/data/garden/worldbank_wdi/2024-05-20/wdi.meta.yml index 3d1e2775e81..966a235f212 100644 --- a/etl/steps/data/garden/worldbank_wdi/2024-05-20/wdi.meta.yml +++ b/etl/steps/data/garden/worldbank_wdi/2024-05-20/wdi.meta.yml @@ -4054,10 +4054,12 @@ tables: title: Average transaction cost of sending remittances to a specific country (%) short_unit: '%' unit: '%' + description_processing: We calculated the average transaction cost for sending money to a region by calculating the total volume of incoming transactions and the total associated costs (assuming average transaction costs). We then divided the total costs by the total volume of transactions to get the average transaction cost. This indicator is only available if cost data is available for more than 75% of the total volume of transactions.
si_rmt_cost_ob_zs: title: Average transaction cost of sending remittances from a specific country (%) short_unit: '%' unit: '%' + description_processing: We calculated the average transaction cost for sending money from a region by calculating the total volume of outgoing transactions and the total associated costs (assuming average transaction costs). We then divided the total costs by the total volume of transactions to get the average transaction cost. This indicator is only available if cost data is available for more than 75% of the total volume of transactions. si_spr_pc40: title: Survey mean consumption or income per capita, bottom 40% of population (2017 PPP $ per day) short_unit: $ diff --git a/etl/steps/data/garden/worldbank_wdi/2024-05-20/wdi.py b/etl/steps/data/garden/worldbank_wdi/2024-05-20/wdi.py index 154e983c376..b4bfd036734 100644 --- a/etl/steps/data/garden/worldbank_wdi/2024-05-20/wdi.py +++ b/etl/steps/data/garden/worldbank_wdi/2024-05-20/wdi.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Any, Dict, Optional +import owid.catalog.processing as pr import pandas as pd import structlog from owid.catalog import Dataset, Table, VariableMeta @@ -27,6 +28,8 @@ def run(dest_dir: str) -> None: # Load meadow dataset. ds_meadow = paths.load_dataset() ds_population = paths.load_dataset("population") + ds_regions = paths.load_dataset("regions") + ds_income_groups = paths.load_dataset("income_groups") # # Process data. @@ -63,6 +66,9 @@ def run(dest_dir: str) -> None: # add armed personnel as share of population tb_garden = add_armed_personnel_as_share_of_population(tb_garden, ds_population) + # add regions to remittance data + tb_garden = add_regions_to_remittance_data(tb_garden, ds_regions, ds_income_groups) + #################################################################################################################### # @@ -77,6 +83,96 @@ def run(dest_dir: str) -> None: log.info("wdi.end") + +def add_regions_to_remittance_data(tb: Table, ds_regions: Dataset, ds_income_groups: Dataset) -> Table: + """ + Add regions to remittance data, if more than 75% of remittance volume sent/received is covered by cost data.
+ + notes for indicators: + - si_rmt_cost_ib_zs: % cost of receiving remittances (inbound) + - si_rmt_cost_ob_zs: % cost of sending remittances (outbound) + - bx_trf_pwkr_cd_dt: total remittances received by country + - bm_trf_pwkr_cd_dt: total remittances sent by country + """ + + tb = tb.reset_index() + + # create a copy so other indicators are not affected + regions_tb = tb.copy() + + # create new columns for total remittances (only for countries where remittance cost is available) + # this is needed to calculate share of remittance volume covered by cost data + regions_tb["total_received_remittances"] = regions_tb["bx_trf_pwkr_cd_dt"].where( + regions_tb["si_rmt_cost_ib_zs"].notna() + ) + regions_tb["total_sent_remittances"] = regions_tb["bm_trf_pwkr_cd_dt"].where( + regions_tb["si_rmt_cost_ob_zs"].notna() + ) + + # calculate total cost of remittance for each country + regions_tb["total_cost_of_receiving_remittances"] = ( + regions_tb["si_rmt_cost_ib_zs"] * regions_tb["total_received_remittances"] + ) + regions_tb["total_cost_of_sending_remittances"] = ( + regions_tb["si_rmt_cost_ob_zs"] * regions_tb["total_sent_remittances"] + ) + + # aggregation for regions + agg = { + "total_cost_of_receiving_remittances": "sum", + "total_cost_of_sending_remittances": "sum", + "total_received_remittances": "sum", + "total_sent_remittances": "sum", + "bx_trf_pwkr_cd_dt": "sum", + "bm_trf_pwkr_cd_dt": "sum", + } + + # add regions to table + regions_tb = geo.add_regions_to_table( + regions_tb, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + aggregations=agg, + min_num_values_per_year=1, + ) + + # calculate cost of remittances per region + regions_tb["calc_cost_received_for_regions"] = ( + regions_tb["total_cost_of_receiving_remittances"] / regions_tb["total_received_remittances"] + ) + regions_tb["calc_cost_sent_for_regions"] = ( + regions_tb["total_cost_of_sending_remittances"] / regions_tb["total_sent_remittances"] + ) + + # calculate share of remittances covered by cost + regions_tb["perc_covered_by_cost_received"] = ( + regions_tb["total_received_remittances"] / regions_tb["bx_trf_pwkr_cd_dt"] + ) + regions_tb["perc_covered_by_cost_sent"] = regions_tb["total_sent_remittances"] / regions_tb["bm_trf_pwkr_cd_dt"] + + # only keep cost for regions if >75% of remittance volume sent/received is covered by cost + regions_tb["si_rmt_cost_ib_zs"] = regions_tb["calc_cost_received_for_regions"].where( + regions_tb["perc_covered_by_cost_received"] > 0.75 + ) + regions_tb["si_rmt_cost_ob_zs"] = regions_tb["calc_cost_sent_for_regions"].where( + regions_tb["perc_covered_by_cost_sent"] > 0.75 + ) + + col_to_replace = [ + "si_rmt_cost_ib_zs", + "si_rmt_cost_ob_zs", + "bx_trf_pwkr_cd_dt", + "bm_trf_pwkr_cd_dt", + ] + + col_rest = [col for col in tb.columns if col not in col_to_replace] + + tb = pr.merge(tb[col_rest], regions_tb[col_to_replace + ["country", "year"]], on=["country", "year"], how="outer") + + tb = tb.format(["country", "year"]) + + return tb + + def mk_omms(table: Table) -> Table: """calculates custom variables (aka "owid-maintained metrics")""" df = pd.DataFrame(table) diff --git a/etl/steps/data/garden/wpf/2024-10-03/famines_by_place.meta.yml b/etl/steps/data/garden/wpf/2024-10-03/famines_by_place.meta.yml index 2bad2f61c36..58a04091e0a 100644 --- a/etl/steps/data/garden/wpf/2024-10-03/famines_by_place.meta.yml +++ b/etl/steps/data/garden/wpf/2024-10-03/famines_by_place.meta.yml @@ -25,3 +25,6 @@ tables: decadal_famine_deaths: title: Deaths from famines by top countries by decade
unit: 'deaths' + presentation: + grapher_config: + note: Decadal figures represent data averaged over each ten-year period (e.g., 1990–1999 for the 1990s). The 2020s figure is provisional and includes data only up to and including 2023. diff --git a/etl/steps/data/garden/wpf/2024-10-03/famines_by_place.py b/etl/steps/data/garden/wpf/2024-10-03/famines_by_place.py index 63fad32fbf9..b1d23fe069a 100644 --- a/etl/steps/data/garden/wpf/2024-10-03/famines_by_place.py +++ b/etl/steps/data/garden/wpf/2024-10-03/famines_by_place.py @@ -59,13 +59,14 @@ def run(dest_dir: str) -> None: tb_other = tb[~tb["country"].isin(main_countries)] tb_main["country"] = tb_main["country"].replace( { - "Russia": "USSR", - "Ukraine": "USSR", - "Russia, Ukraine": "USSR", - "Germany, USSR": "USSR", - "Moldova, Ukraine, Russia, Belarus": "USSR", - "Russia, Western Soviet States": "USSR", - "Russia, Kazakhstan": "USSR", + "Russia": "USSR/Russia", + "Ukraine": "USSR/Russia", + "Russia, Ukraine": "USSR/Russia", + "Germany, USSR": "USSR/Russia", + "Moldova, Ukraine, Russia, Belarus": "USSR/Russia", + "Russia, Western Soviet States": "USSR/Russia", + "Russia, Kazakhstan": "USSR/Russia", + "USSR": "USSR/Russia", "India, Bangladesh": "India", } ) diff --git a/etl/steps/data/garden/wpf/2024-10-03/total_famines_by_year_decade.meta.yml b/etl/steps/data/garden/wpf/2024-10-03/total_famines_by_year_decade.meta.yml index 52c22674679..d84ee958339 100644 --- a/etl/steps/data/garden/wpf/2024-10-03/total_famines_by_year_decade.meta.yml +++ b/etl/steps/data/garden/wpf/2024-10-03/total_famines_by_year_decade.meta.yml @@ -7,6 +7,8 @@ definitions: topic_tags: - Famines + footnote_decadal: |- + Decadal figures represent data averaged over each ten-year period (e.g., 1990–1999 for the 1990s). The 2020s figure is provisional and includes data only up to and including 2023. # Learn more about the available fields: # http://docs.owid.io/projects/etl/architecture/metadata/reference/ @@ -30,6 +32,10 @@ tables: description_short: Deaths in famines that are estimated to have killed 100,000 people or more. display: numDecimalPlaces: 0 + presentation: + grapher_config: + note: |- + {definitions.footnote_decadal} famine_deaths_per_rate: title: Death rates from famines @@ -44,6 +50,10 @@ tables: description_short: Deaths in famines that are estimated to have killed 100,000 people or more, per 100,000 people. display: numDecimalPlaces: 1 + presentation: + grapher_config: + note: |- + {definitions.footnote_decadal} famine_count: title: Number of famines @@ -58,3 +68,7 @@ tables: description_short: Famines that are estimated to have killed 100,000 people or more. display: numDecimalPlaces: 0 + presentation: + grapher_config: + note: |- + {definitions.footnote_decadal} \ No newline at end of file diff --git a/etl/steps/data/garden/wpf/2024-10-03/total_famines_by_year_decade.py b/etl/steps/data/garden/wpf/2024-10-03/total_famines_by_year_decade.py index cbb06e28145..5067cd0df3f 100644 --- a/etl/steps/data/garden/wpf/2024-10-03/total_famines_by_year_decade.py +++ b/etl/steps/data/garden/wpf/2024-10-03/total_famines_by_year_decade.py @@ -30,6 +30,55 @@ def run(dest_dir: str) -> None: # # Process data. 
# + + # Calculate decadal number of famines before exploding the 'date' column to avoid double counting + tb_decadal_famine_counts = tb.copy() + + tb_decadal_famine_counts["date"] = tb_decadal_famine_counts["date"].astype(str) + # Split the years in the 'date' column and extract them as a list of integers + tb_decadal_famine_counts["years"] = tb_decadal_famine_counts["date"].apply( + lambda x: [int(year.strip()) for year in x.split(",")] + ) + + # Filter years to keep only those from different decades + def filter_decades(years): + # Use a dictionary to ensure only one year per decade is retained + decade_map = {} + for year in years: + decade = (year // 10) * 10 + if decade not in decade_map: + decade_map[decade] = year + return sorted(decade_map.values()) + + tb_decadal_famine_counts["filtered_years"] = tb_decadal_famine_counts["years"].apply(filter_decades) + + # Explode the filtered years into individual rows - this ensures we keep the correct count of famines per decade + tb_decadal_famine_counts = tb_decadal_famine_counts.explode("filtered_years").rename( + columns={"filtered_years": "year"} + ) + + # Calculate the decade for each year + tb_decadal_famine_counts["decade"] = (tb_decadal_famine_counts["year"] // 10) * 10 + + # Group the data by region and decade + famine_decadal_counts = ( + tb_decadal_famine_counts.groupby(["region", "decade"], observed=False) + .size() + .reset_index(name="decadal_famine_count") + ) + + # Create a 'World' row by summing counts across regions + famine_decadal_counts_world_only = ( + tb_decadal_famine_counts.groupby("decade").size().reset_index(name="decadal_famine_count") + ) + famine_decadal_counts_world_only["region"] = "World" + + # Concatenating the world row data with the regional data + famine_counts_decadal_combined = pr.concat( + [famine_decadal_counts, famine_decadal_counts_world_only], ignore_index=True + ) + famine_counts_decadal_combined = famine_counts_decadal_combined.rename(columns={"decade": "year"}) + # Divide each row's 'wpf_authoritative_mortality_estimate' by the number of years listed in the 'date' column, assuming a uniform distribution of deaths over the period tb["wpf_authoritative_mortality_estimate"] = tb.apply( lambda row: row["wpf_authoritative_mortality_estimate"] / len(row["date"].split(",")) @@ -45,44 +94,36 @@ def run(dest_dir: str) -> None: tb["year"] = tb["year"].astype(int) tb["region"] = tb["region"].astype("category") - # Grouping by relevant columns and summing the 'wpf_authoritative_mortality_estimate' for regional data + # Calculate the total number of famine deaths per year and region deaths_counts = ( tb.groupby(["year", "region"],
observed=False)["wpf_authoritative_mortality_estimate"] - .sum() - .reset_index(name="famine_deaths") - ) - - # Creating a 'World' row by summing mortality estimates across all regions for each group deaths_counts_world_only = ( tb.groupby(["year"])["wpf_authoritative_mortality_estimate"].sum().reset_index(name="famine_deaths") ) deaths_counts_world_only["region"] = "World" - # Concatenating the world row data with the regional data + # Concatenate the world famine deaths with the regional data deaths_counts_combined = pr.concat([deaths_counts, deaths_counts_world_only], ignore_index=True) + famine_counts = tb.groupby(["year", "region"], observed=False).size().reset_index(name="famine_count") + famine_counts_world_only = tb.groupby(["year"]).size().reset_index(name="famine_count") + famine_counts_world_only["region"] = "World" + + # Concatenating the world row data with the regional data + famine_counts_combined = pr.concat([famine_counts, famine_counts_world_only], ignore_index=True) + tb = pr.merge( famine_counts_combined, deaths_counts_combined, on=["year", "region"], ) - # Create a DataFrame with all years from 1870 to 2023 + tb = pr.merge(tb, famine_counts_decadal_combined, on=["year", "region"], how="outer") + + # Create a DataFrame with all years from 1870 to 2023, so that years without recorded famines are present and can be filled with zeros all_years = pd.DataFrame({"year": range(1870, 2024)}) # Get all unique regions from the original data @@ -94,26 +135,23 @@ def run(dest_dir: str) -> None: ) all_years_regions = Table(all_years_regions) - # Merge this DataFrame with the existing data to ensure all years are present + # Merge this Table with the existing data to ensure all years are present tb = pr.merge(tb, all_years_regions, on=["year", "region"], how="right") # Fill NaN values in the 'famine_count' and 'famine_deaths' columns with zeros tb["famine_count"] = tb["famine_count"].fillna(0) tb["famine_deaths"] = tb["famine_deaths"].fillna(0) - # Calculate the decade + # Calculate decadal deaths tb["decade"] = (tb["year"] // 10) * 10 + tb["decadal_famine_deaths"] = tb.groupby(["region", "decade"], observed=False)["famine_deaths"].transform("sum") - # Group the data by region and decade, then calculate the decadal sum - for column in ["famine_count", "famine_deaths"]: - tb["decadal_" + column] = tb.groupby(["region", "decade"], observed=False)[column].transform("sum") - # Set NaN everywhere except the start of a decade - tb["decadal_" + column] = tb["decadal_" + column].where(tb["year"] % 10 == 0, np.nan) + # Set NaN everywhere except the start of a decade + tb["decadal_famine_deaths"] = tb["decadal_famine_deaths"].where(tb["year"] % 10 == 0, np.nan) tb = tb.drop(columns=["decade"]) tb = tb.rename(columns={"region": "country"}) # Calculate the rate of famine deaths per 100,000 people - tb = geo.add_population_to_table(tb, ds_population) # The World total population doesn't include a value for each year, but every region does, so calculate it for each year based on the regional sums instead diff --git a/etl/steps/data/grapher/animal_welfare/2024-09-13/fur_laws.py b/etl/steps/data/grapher/animal_welfare/2024-09-13/fur_laws.py index 8765570c08b..6f3867a35f0 100644 --- a/etl/steps/data/grapher/animal_welfare/2024-09-13/fur_laws.py +++ b/etl/steps/data/grapher/animal_welfare/2024-09-13/fur_laws.py @@ -15,7 +15,7 @@ def run(dest_dir: str) -> None: # # Load garden dataset and read its main table.
ds_garden = paths.load_dataset("fur_laws") - tb = ds_garden.read_table("fur_laws") + tb = ds_garden.read("fur_laws") # # Process data. diff --git a/etl/steps/data/grapher/antibiotics/2024-10-18/who_glass.py b/etl/steps/data/grapher/antibiotics/2024-10-18/who_glass.py new file mode 100644 index 00000000000..705afc693d8 --- /dev/null +++ b/etl/steps/data/grapher/antibiotics/2024-10-18/who_glass.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("who_glass") + + # Read table from garden dataset. + tb = ds_garden["who_glass"] + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/antibiotics/2024-10-18/who_glass_by_antibiotic.py b/etl/steps/data/grapher/antibiotics/2024-10-18/who_glass_by_antibiotic.py new file mode 100644 index 00000000000..546df47de18 --- /dev/null +++ b/etl/steps/data/grapher/antibiotics/2024-10-18/who_glass_by_antibiotic.py @@ -0,0 +1,29 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("who_glass_by_antibiotic") + + # Read table from garden dataset. + tb_anti = ds_garden["antibiotic_table"] + tb_bci = ds_garden["bci_table"] + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb_bci, tb_anti], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/antibiotics/2024-11-12/antimicrobial_usage.py b/etl/steps/data/grapher/antibiotics/2024-11-12/antimicrobial_usage.py new file mode 100644 index 00000000000..9a24fbc4608 --- /dev/null +++ b/etl/steps/data/grapher/antibiotics/2024-11-12/antimicrobial_usage.py @@ -0,0 +1,37 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("antimicrobial_usage") + + # Read table from garden dataset. + tb_class = ds_garden["class"] + tb_aware = ds_garden["aware"] + tb_class_agg = ds_garden["class_aggregated"] + + # + # Process data. + # + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, + tables=[tb_class, tb_aware, tb_class_agg], + check_variables_metadata=True, + default_metadata=ds_garden.metadata, + ) + + # Save changes in the new grapher dataset. 
+ ds_grapher.save() diff --git a/etl/steps/data/grapher/antibiotics/2024-11-15/testing_coverage.py b/etl/steps/data/grapher/antibiotics/2024-11-15/testing_coverage.py new file mode 100644 index 00000000000..c515dd3aa39 --- /dev/null +++ b/etl/steps/data/grapher/antibiotics/2024-11-15/testing_coverage.py @@ -0,0 +1,27 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("testing_coverage") + + # Read table from garden dataset. + tb = ds_garden["testing_coverage"] + tb_specimen = ds_garden["specimen"] + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb, tb_specimen], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/antibiotics/2024-11-20/microbe.py b/etl/steps/data/grapher/antibiotics/2024-11-20/microbe.py new file mode 100644 index 00000000000..848a2788147 --- /dev/null +++ b/etl/steps/data/grapher/antibiotics/2024-11-20/microbe.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("microbe") + + # Read table from garden dataset. + tb = ds_garden.read("microbe", reset_index=False) + tb_pathogen = ds_garden.read("pathogen_entity", reset_index=False) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb, tb_pathogen], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/antibiotics/2024-11-20/pathogen_bloodstream.py b/etl/steps/data/grapher/antibiotics/2024-11-20/pathogen_bloodstream.py new file mode 100644 index 00000000000..1b41f653a19 --- /dev/null +++ b/etl/steps/data/grapher/antibiotics/2024-11-20/pathogen_bloodstream.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("pathogen_bloodstream") + + # Read table from garden dataset. + tb = ds_garden.read("pathogen_bloodstream", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. 
+ ds_grapher.save() diff --git a/etl/steps/data/grapher/antibiotics/2024-11-24/total_syndrome.py b/etl/steps/data/grapher/antibiotics/2024-11-24/total_syndrome.py new file mode 100644 index 00000000000..375d4e8f358 --- /dev/null +++ b/etl/steps/data/grapher/antibiotics/2024-11-24/total_syndrome.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("total_syndrome") + + # Read table from garden dataset. + tb = ds_garden.read("total_syndrome", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/antibiotics/2024-12-02/microbe_amr.py b/etl/steps/data/grapher/antibiotics/2024-12-02/microbe_amr.py new file mode 100644 index 00000000000..f54fcc98772 --- /dev/null +++ b/etl/steps/data/grapher/antibiotics/2024-12-02/microbe_amr.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("microbe_amr") + + # Read table from garden dataset. + tb = ds_garden.read("microbe_amr", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/antibiotics/2024-12-02/microbe_neonatal_amr.py b/etl/steps/data/grapher/antibiotics/2024-12-02/microbe_neonatal_amr.py new file mode 100644 index 00000000000..885a0c65b02 --- /dev/null +++ b/etl/steps/data/grapher/antibiotics/2024-12-02/microbe_neonatal_amr.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("microbe_neonatal_amr") + + # Read table from garden dataset. + tb = ds_garden.read("microbe_neonatal_amr", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. 
+ ds_grapher.save() diff --git a/etl/steps/data/grapher/antibiotics/2024-12-02/total_pathogen_bloodstream.py b/etl/steps/data/grapher/antibiotics/2024-12-02/total_pathogen_bloodstream.py new file mode 100644 index 00000000000..3738ea0149f --- /dev/null +++ b/etl/steps/data/grapher/antibiotics/2024-12-02/total_pathogen_bloodstream.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("total_pathogen_bloodstream") + + # Read table from garden dataset. + tb = ds_garden.read("total_pathogen_bloodstream", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.py b/etl/steps/data/grapher/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.py new file mode 100644 index 00000000000..81a269f0ae0 --- /dev/null +++ b/etl/steps/data/grapher/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("total_pathogen_bloodstream_amr") + + # Read table from garden dataset. + tb = ds_garden.read("total_pathogen_bloodstream_amr", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/antibiotics/2024-12-03/glass_enrolment.py b/etl/steps/data/grapher/antibiotics/2024-12-03/glass_enrolment.py new file mode 100644 index 00000000000..3f1604a2896 --- /dev/null +++ b/etl/steps/data/grapher/antibiotics/2024-12-03/glass_enrolment.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("glass_enrolment") + + # Read table from garden dataset. + tb = ds_garden.read("glass_enrolment", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. 
+ ds_grapher.save() diff --git a/etl/steps/data/grapher/antibiotics/2024-12-04/microbe_total_pathogens.py b/etl/steps/data/grapher/antibiotics/2024-12-04/microbe_total_pathogens.py new file mode 100644 index 00000000000..898f3506d2b --- /dev/null +++ b/etl/steps/data/grapher/antibiotics/2024-12-04/microbe_total_pathogens.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("microbe_total_pathogens") + + # Read table from garden dataset. + tb = ds_garden.read("microbe_total_pathogens", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/antibiotics/2024-12-04/microbe_total_pathogens_amr.py b/etl/steps/data/grapher/antibiotics/2024-12-04/microbe_total_pathogens_amr.py new file mode 100644 index 00000000000..2a5df838f2e --- /dev/null +++ b/etl/steps/data/grapher/antibiotics/2024-12-04/microbe_total_pathogens_amr.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("microbe_total_pathogens_amr") + + # Read table from garden dataset. + tb = ds_garden.read("microbe_total_pathogens_amr", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/antibiotics/2024-12-05/microbe_neonatal_total_amr.py b/etl/steps/data/grapher/antibiotics/2024-12-05/microbe_neonatal_total_amr.py new file mode 100644 index 00000000000..6a743b902c6 --- /dev/null +++ b/etl/steps/data/grapher/antibiotics/2024-12-05/microbe_neonatal_total_amr.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("microbe_neonatal_total_amr") + + # Read table from garden dataset. + tb = ds_garden.read("microbe_neonatal_total_amr", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. 
+ ds_grapher.save() diff --git a/etl/steps/data/grapher/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.py b/etl/steps/data/grapher/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.py new file mode 100644 index 00000000000..407f17d98ab --- /dev/null +++ b/etl/steps/data/grapher/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("microbe_total_deaths_by_syndrome_amr") + + # Read table from garden dataset. + tb = ds_garden.read("microbe_total_deaths_by_syndrome_amr", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-06-28/ai_corporate_investment.meta.yml b/etl/steps/data/grapher/artificial_intelligence/2024-06-28/ai_corporate_investment.meta.yml index d1491908586..cf5ceb40082 100644 --- a/etl/steps/data/grapher/artificial_intelligence/2024-06-28/ai_corporate_investment.meta.yml +++ b/etl/steps/data/grapher/artificial_intelligence/2024-06-28/ai_corporate_investment.meta.yml @@ -9,19 +9,19 @@ definitions: note: This data is expressed in constant 2021 US$. Inflation adjustment is based on the US Consumer Price Index (CPI). description_processing: |- - - Reporting a time series of AI investments in nominal prices (i.e., without adjusting for inflation) means it makes little sense to compare observations across time; it is therefore not very useful. To make comparisons across time possible, one has to take into account that prices change (e.g., there is inflation). - - It is not obvious how to adjust this time series for inflation, and we debated it at some length within our team. - - It would be straightforward to adjust the time series for price changes if we knew the prices of the specific goods and services that these investments purchased. This would make it possible to calculate a volume measure of AI investments, and it would tell us how much these investments bought. But such a metric is not available. While a comprehensive price index is not available, we know that the cost for some crucial AI technology has fallen rapidly in price. - - In the absence of a comprehensive price index that captures the price of AI-specific goods and services, one has to rely on one of the available metrics for the price of a bundle of goods and services. In the end we decided to use the US Consumer Price Index (CPI). - - The US CPI does not provide us with a volume measure of AI goods and services, but it does capture the opportunity costs of these investments. The inflation adjustment of this time series of AI investments therefore lets us understand the size of these investments relative to whatever else these sums of money could have purchased. + - Reporting a time series of AI investments in nominal prices would make it difficult to compare observations across time. To make these comparisons possible, one has to take into account that prices change (inflation). 
+ - It is not obvious how to adjust this time series for inflation, and our team discussed the best solutions at our disposal. + - It would be straightforward to adjust the time series for price changes if we knew the prices of the specific goods and services purchased through these investments. This would make it possible to calculate a volume measure of AI investments and tell us how much these investments bought. But such a metric is not available. While a comprehensive price index is not available, we know that the cost of some crucial AI technology has fallen rapidly in price. + - In the absence of a comprehensive price index that captures the price of AI-specific goods and services, one has to rely on one of the available metrics for the price of a bundle of goods and services. Ultimately, we decided to use the US Consumer Price Index (CPI). + - The US CPI does not provide us with a volume measure of AI goods and services, but it does capture the opportunity costs of these investments. The inflation adjustment of this time series of AI investments, therefore, lets us understand the size of these investments relative to whatever else these sums of money could have purchased. description_key: + - The data likely underestimates total global AI investment, as it only captures certain types of private equity transactions, excluding other significant channels and categories of AI-related spending. + - This data focuses on traditional corporate finance deals, but the source does not fully disclose its methodology and what's included or excluded. This means it may not fully capture important areas of AI investment, such as those from publicly traded companies, corporate internal R&D, government funding, public sector initiatives, data center infrastructure, hardware production, semiconductor manufacturing, and expenses for research and talent. + - One-time events, such as large acquisitions, can distort yearly figures, while broader economic factors like interest rates and market sentiment can influence investment trends independently of AI-specific developments. - A merger is a corporate strategy involving two companies joining together to form a new company. An acquisition is a corporate strategy involving one company buying another company. - - Private investment in AI companies in each year that received an investment of more than $1.5 million (not adjusted for inflation). + - Private investment is defined as investment in AI companies of more than $1.5 million (in current US dollars). - A public offering is the sale of equity shares or other financial instruments to the public in order to raise capital. - A minority stake is an ownership interest of less than 50% of the total shares of a company. - - The categories shown suggest a focus on traditional corporate finance deals, but without a detailed methodology, we can't be certain about what's included or excluded. This means it may not fully capture important areas of AI investment, such as those from public companies (e.g., NVIDIA, TSMC), corporate internal R&D, government funding, public sector initiatives, data center infrastructure, hardware production, semiconductor manufacturing, and expenses for research and talent. - - One-time events like large acquisitions can skew yearly figures, and broader economic factors like interest rates or market sentiment can also affect AI investment trends independently of AI-specific developments. 
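The CPI adjustment described in the description_processing blocks above boils down to rescaling each nominal value by the ratio of the 2021 index level to the index level of the observation year. A minimal sketch of that deflation step, with made-up CPI and investment numbers (only the logic is meant to match the step):

```python
import pandas as pd

# Illustrative annual US CPI levels and nominal investments (made-up numbers,
# not the official series used in the step).
cpi = pd.Series({2019: 255.7, 2020: 258.8, 2021: 271.0})
nominal_usd = pd.Series({2019: 50.0, 2020: 60.0, 2021: 75.0})  # billion current US$

# Constant 2021 US$: rescale each year by the ratio CPI_2021 / CPI_year.
constant_2021_usd = nominal_usd * cpi.loc[2021] / cpi
```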
- - The dataset likely underestimates the total global AI investment, as it only captures certain types of private equity transactions, excluding other significant channels and categories of AI-related spending. description_short: This data is expressed in US dollars, adjusted for inflation. unit: 'constant 2021 US$' @@ -38,5 +38,3 @@ tables: variables: world: title: Global corporate investment in AI - - diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-06-28/ai_investment_generative_companies.meta.yml b/etl/steps/data/grapher/artificial_intelligence/2024-06-28/ai_investment_generative_companies.meta.yml index d6866c816d2..5ce51b53dd1 100644 --- a/etl/steps/data/grapher/artificial_intelligence/2024-06-28/ai_investment_generative_companies.meta.yml +++ b/etl/steps/data/grapher/artificial_intelligence/2024-06-28/ai_investment_generative_companies.meta.yml @@ -21,15 +21,15 @@ tables: short_unit: '$' description_short: Generative AI refers to AI systems that can create new content, such as images, text, or music, based on patterns learned from existing data. description_processing: |- - - Reporting a time series of AI investments in nominal prices (i.e., without adjusting for inflation) means it makes little sense to compare observations across time; it is therefore not very useful. To make comparisons across time possible, one has to take into account that prices change (e.g., there is inflation). - - It is not obvious how to adjust this time series for inflation, and we debated it at some length within our team. - - It would be straightforward to adjust the time series for price changes if we knew the prices of the specific goods and services that these investments purchased. This would make it possible to calculate a volume measure of AI investments, and it would tell us how much these investments bought. But such a metric is not available. While a comprehensive price index is not available, we know that the cost for some crucial AI technology has fallen rapidly in price. - - In the absence of a comprehensive price index that captures the price of AI-specific goods and services, one has to rely on one of the available metrics for the price of a bundle of goods and services. In the end we decided to use the US Consumer Price Index (CPI). - - The US CPI does not provide us with a volume measure of AI goods and services, but it does capture the opportunity costs of these investments. The inflation adjustment of this time series of AI investments therefore lets us understand the size of these investments relative to whatever else these sums of money could have purchased. + - Reporting a time series of AI investments in nominal prices would make it difficult to compare observations across time. To make these comparisons possible, one has to take into account that prices change (inflation). + - It is not obvious how to adjust this time series for inflation, and our team discussed the best solutions at our disposal. + - It would be straightforward to adjust the time series for price changes if we knew the prices of the specific goods and services purchased through these investments. This would make it possible to calculate a volume measure of AI investments and tell us how much these investments bought. But such a metric is not available. While a comprehensive price index is not available, we know that the cost of some crucial AI technology has fallen rapidly in price. 
+ - In the absence of a comprehensive price index that captures the price of AI-specific goods and services, one has to rely on one of the available metrics for the price of a bundle of goods and services. Ultimately, we decided to use the US Consumer Price Index (CPI). + - The US CPI does not provide us with a volume measure of AI goods and services, but it does capture the opportunity costs of these investments. The inflation adjustment of this time series of AI investments, therefore, lets us understand the size of these investments relative to whatever else these sums of money could have purchased. description_key: - - One-time events like large acquisitions can skew yearly figures, and broader economic factors like interest rates or market sentiment can also affect AI investment trends independently of AI-specific developments. - - The dataset’s methodology doesn’t specify which types of AI investments are included, so it may overlook important areas of AI investment, such as those from public companies (e.g., NVIDIA, TSMC), corporate internal R&D, government funding, public sector initiatives, data center infrastructure, hardware production, semiconductor manufacturing, and expenses for research and talent. - - The dataset likely underestimates the total global AI investment, as it only captures certain types of private equity transactions, excluding other significant channels and categories of AI-related spending. + - The data likely underestimates total global AI investment, as it only captures certain types of private equity transactions, excluding other significant channels and categories of AI-related spending. + - The source does not fully disclose its methodology and what's included or excluded. This means it may not fully capture important areas of AI investment, such as those from publicly traded companies, corporate internal R&D, government funding, public sector initiatives, data center infrastructure, hardware production, semiconductor manufacturing, and expenses for research and talent. + - One-time events, such as large acquisitions, can distort yearly figures, while broader economic factors like interest rates and market sentiment can influence investment trends independently of AI-specific developments. presentation: grapher_config: diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-06-28/ai_private_investment.meta.yml b/etl/steps/data/grapher/artificial_intelligence/2024-06-28/ai_private_investment.meta.yml index 717ac9b8266..e99162560b3 100644 --- a/etl/steps/data/grapher/artificial_intelligence/2024-06-28/ai_private_investment.meta.yml +++ b/etl/steps/data/grapher/artificial_intelligence/2024-06-28/ai_private_investment.meta.yml @@ -9,17 +9,17 @@ definitions: note: This data is expressed in constant 2021 US$. Inflation adjustment is based on the US Consumer Price Index (CPI). description_processing: |- - - Reporting a time series of AI investments in nominal prices (i.e., without adjusting for inflation) means it makes little sense to compare observations across time; it is therefore not very useful. To make comparisons across time possible, one has to take into account that prices change (e.g., there is inflation). - - It is not obvious how to adjust this time series for inflation, and we debated it at some length within our team. - - It would be straightforward to adjust the time series for price changes if we knew the prices of the specific goods and services that these investments purchased. 
This would make it possible to calculate a volume measure of AI investments, and it would tell us how much these investments bought. But such a metric is not available. While a comprehensive price index is not available, we know that the cost for some crucial AI technology has fallen rapidly in price. - - In the absence of a comprehensive price index that captures the price of AI-specific goods and services, one has to rely on one of the available metrics for the price of a bundle of goods and services. In the end we decided to use the US Consumer Price Index (CPI). - - The US CPI does not provide us with a volume measure of AI goods and services, but it does capture the opportunity costs of these investments. The inflation adjustment of this time series of AI investments therefore lets us understand the size of these investments relative to whatever else these sums of money could have purchased. + - Reporting a time series of AI investments in nominal prices would make it difficult to compare observations across time. To make these comparisons possible, one has to take into account that prices change (inflation). + - It is not obvious how to adjust this time series for inflation, and our team discussed the best solutions at our disposal. + - It would be straightforward to adjust the time series for price changes if we knew the prices of the specific goods and services purchased through these investments. This would make it possible to calculate a volume measure of AI investments and tell us how much these investments bought. But such a metric is not available. While a comprehensive price index is not available, we know that the cost of some crucial AI technology has fallen rapidly in price. + - In the absence of a comprehensive price index that captures the price of AI-specific goods and services, one has to rely on one of the available metrics for the price of a bundle of goods and services. Ultimately, we decided to use the US Consumer Price Index (CPI). + - The US CPI does not provide us with a volume measure of AI goods and services, but it does capture the opportunity costs of these investments. The inflation adjustment of this time series of AI investments, therefore, lets us understand the size of these investments relative to whatever else these sums of money could have purchased. description_short: Includes companies that received more than $1.5 million in investment (not adjusted for inflation). This data is expressed in US dollars, adjusted for inflation. description_key: - - One-time events like large acquisitions can skew yearly figures, and broader economic factors like interest rates or market sentiment can also affect AI investment trends independently of AI-specific developments. - - The dataset’s methodology doesn’t specify which types of AI investments are included, so it may overlook important areas of AI investment, such as those from public companies (e.g., NVIDIA, TSMC), corporate internal R&D, government funding, public sector initiatives, data center infrastructure, hardware production, semiconductor manufacturing, and expenses for research and talent. - - The dataset likely underestimates the total global AI investment, as it only captures certain types of private equity transactions, excluding other significant channels and categories of AI-related spending. + - The data likely underestimates total global AI investment, as it only captures certain types of private equity transactions, excluding other significant channels and categories of AI-related spending. 
+ - The source does not fully disclose its methodology and what's included or excluded. This means it may not fully capture important areas of AI investment, such as those from publicly traded companies, corporate internal R&D, government funding, public sector initiatives, data center infrastructure, hardware production, semiconductor manufacturing, and expenses for research and talent. + - One-time events, such as large acquisitions, can distort yearly figures, while broader economic factors like interest rates and market sentiment can influence investment trends independently of AI-specific developments. unit: 'constant 2021 US$' short_unit: '$' @@ -41,5 +41,3 @@ tables: title: Private investment in AI in the United States european_union_and_united_kingdom: title: Private investment in AI in the European Union and United Kingdom - -
diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-10-01/epoch.py b/etl/steps/data/grapher/artificial_intelligence/2024-10-01/epoch.py index 435e1f6c9d2..1fca1ec6b8b 100644 --- a/etl/steps/data/grapher/artificial_intelligence/2024-10-01/epoch.py +++ b/etl/steps/data/grapher/artificial_intelligence/2024-10-01/epoch.py @@ -81,7 +81,7 @@ def find_max_label_and_concat(tb, column, label): max_value = -float("inf") rows_to_keep = [] - for _, row in tb.iterrows(): + for _, row in tb.dropna(subset=[column]).iterrows(): if row[column] > max_value: max_value = row[column] rows_to_keep.append(row)
diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch.meta.yml b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch.meta.yml new file mode 100644 index 00000000000..af50f790b40 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch.meta.yml @@ -0,0 +1,18 @@ +definitions: + common: + unit: '' + short_unit: '' + display: + zeroDay: '1949-01-01' + yearIsDay: true + +tables: + epoch: + variables: + max_compute: + title: Maximum compute + max_data: + title: Maximum data + max_parameters: + title: Maximum parameters +
diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch.py b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch.py new file mode 100644 index 00000000000..e8c91465a15 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch.py @@ -0,0 +1,102 @@ +"""Load a garden dataset and create a grapher dataset.""" + +import owid.catalog.processing as pr +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch") + + # Read table from garden dataset. + tb = ds_garden["epoch"].reset_index() + # + # Process data. + # + # Extract year from 'publication_date' and create a new 'year' column + tb["year"] = tb["publication_date"].dt.year + + # For visualization purposes I am adding the rows with the maximum values of compute, data, and parameters in each year to the table as a separate "model". I don't want to do this in garden as it'd affect other datasets that depend on this one. + columns = { + "training_computation_petaflop": "compute", + "training_dataset_size__datapoints": "data", + "parameters": "parameters", + } + # Find maximum values for a given column (compute, data, params) per year, label them, and add summary rows. + for column, label in columns.items(): + tb = find_max_label_and_concat(tb, column, label) + + # Update metadata + for col in ["max_compute", "max_parameters", "max_data"]: + tb[col].metadata.origins = tb["model"].metadata.origins + + # Drop year as we don't need it anymore + tb = tb.drop("year", axis=1) + + # Rename for plotting model name as country in grapher + tb = tb.rename(columns={"model": "country", "days_since_1949": "year"}) + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], default_metadata=ds_garden.metadata, check_variables_metadata=True + ) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() + + +def find_max_label_and_concat(tb, column, label): + """ + Find maximum values for a given column per year, label them, and add summary rows. + + This function: + 1. Identifies rows with maximum values for the specified column in each year. + 2. Labels these maximum value rows in a new column using their original model names. + 3. Creates new summary rows for these maximum values. + 4. Adds these new summary rows to the original table. + + Note: + - Creates a new column named f"max_{label}" to indicate maximum values. + - Preserves original data and model names. + - Adds new summary rows with "model" set to f"Maximum {label}". + """ + tb = tb.sort_values(by=["year"])  # Ensure the DataFrame is sorted by year + max_value = -float("inf") + rows_to_keep = [] + + for _, row in tb.iterrows(): + if not pd.isna(row[column]) and row[column] > max_value: + max_value = row[column] + rows_to_keep.append(row) + + tb_filtered = Table(rows_to_keep) + + idx = tb_filtered[[column, "year"]].fillna(0).groupby("year")[column].idxmax() + + tb_filtered[f"max_{label}"] = "Other" + tb_filtered.loc[idx, f"max_{label}"] = f"Maximum {label}" + + max_rows = tb_filtered.loc[idx].copy() + max_rows["model"] = f"Maximum {label}" + + tb = pr.concat([tb, max_rows], ignore_index=True) + + return tb
diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation.py b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation.py new file mode 100644 index 00000000000..6582a86db80 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation.py @@ -0,0 +1,41 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_aggregates_affiliation") + + # Read table from garden dataset. + tb = ds_garden["epoch_aggregates_affiliation"] + + # + # Process data. + # + # Rename for plotting research affiliation as country in grapher + tb = tb.rename_index_names( + { + "organization_categorization": "country", + } + ) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset.
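For intuition on `find_max_label_and_concat` above: rows are kept only if they set a new running maximum (the data is sorted by year first), and the per-year maxima are then appended as a synthetic "Maximum {label}" entity. A stripped-down sketch of the running-maximum filter on a plain DataFrame (toy values; the real step operates on owid `Table` objects):

```python
import pandas as pd

tb = pd.DataFrame(
    {
        "model": ["A", "B", "C", "D"],
        "year": [2020, 2020, 2021, 2021],
        "compute": [1.0, 5.0, 3.0, 8.0],
    }
).sort_values("year")

# Keep rows that set a new running maximum, mirroring the loop in the step.
max_value = -float("inf")
rows = []
for _, row in tb.iterrows():
    if not pd.isna(row["compute"]) and row["compute"] > max_value:
        max_value = row["compute"]
        rows.append(row)

frontier = pd.DataFrame(rows)  # models A, B and D; C never exceeded B's 5.0
```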
+ ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_aggregates_countries.py b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_aggregates_countries.py new file mode 100644 index 00000000000..658d7982804 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_aggregates_countries.py @@ -0,0 +1,30 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_aggregates_countries") + + # Read table from garden dataset. + tb_garden = ds_garden["epoch_aggregates_countries"] + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_aggregates_domain.py b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_aggregates_domain.py new file mode 100644 index 00000000000..fb2fa66d43b --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_aggregates_domain.py @@ -0,0 +1,39 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_aggregates_domain") + + # Read table from garden dataset. + tb = ds_garden["epoch_aggregates_domain"] + # + # Process data. + # + # Rename for plotting model domain as country in grapher + tb = tb.rename_index_names( + { + "domain": "country", + } + ) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_aggregates_organizations.py b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_aggregates_organizations.py new file mode 100644 index 00000000000..f479f165881 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_aggregates_organizations.py @@ -0,0 +1,38 @@ +"""Load a garden dataset and create a grapher dataset.""" +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_aggregates_organizations") + + # Read table from garden dataset. + tb = ds_garden["epoch_aggregates_organizations"] + # + # Process data. + # + # Rename for plotting model domain as country in grapher + tb = tb.rename_index_names( + { + "organization": "country", + } + ) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. 
+ ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive.py b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive.py new file mode 100644 index 00000000000..f6df4df4e55 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive.py @@ -0,0 +1,33 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_compute_intensive") + + # Read table from garden dataset. + tb = ds_garden["epoch_compute_intensive"] + + # + # Process data. + # + # Rename for plotting model name as country in grapher + tb = tb.rename_index_names({"model": "country", "days_since_1949": "year"}) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries.py b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries.py new file mode 100644 index 00000000000..ef0aea55b10 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries.py @@ -0,0 +1,30 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_compute_intensive_countries") + + # Read table from garden dataset. + tb_garden = ds_garden["epoch_compute_intensive_countries"] + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain.py b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain.py new file mode 100644 index 00000000000..efb5fea33ce --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain.py @@ -0,0 +1,39 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_compute_intensive_domain") + + # Read table from garden dataset. + tb = ds_garden["epoch_compute_intensive_domain"] + # + # Process data. 
+ # + # Rename for plotting model domain as country in grapher + tb = tb.rename_index_names( + { + "domain": "country", + } + ) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive_organizations.py b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive_organizations.py new file mode 100644 index 00000000000..9478c5e5e42 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive_organizations.py @@ -0,0 +1,38 @@ +"""Load a garden dataset and create a grapher dataset.""" +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_compute_intensive_organizations") + + # Read table from garden dataset. + tb = ds_garden["epoch_compute_intensive_organizations"] + # + # Process data. + # + # Rename for plotting model domain as country in grapher + tb = tb.rename_index_names( + { + "organization": "country", + } + ) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_regressions.py b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_regressions.py new file mode 100644 index 00000000000..8c21dfbbc5a --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_regressions.py @@ -0,0 +1,33 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_regressions") + + # Read table from garden dataset. + tb = ds_garden["epoch_regressions"] + tb = tb.rename_index_names({"model": "country", "days_since_1949": "year"}) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], default_metadata=ds_garden.metadata, check_variables_metadata=True + ) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. 
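A note on the `zeroDay: '1949-01-01'` / `yearIsDay: true` convention in the epoch meta.yml files: grapher's "year" axis here is really a day count, which is why these steps rename `days_since_1949` to `year`. A sketch of how such a day count could be derived from a date column (hypothetical frame; the actual computation happens upstream in the garden step):

```python
import pandas as pd

df = pd.DataFrame({"publication_date": pd.to_datetime(["2020-05-28", "2023-03-14"])})

# Days elapsed since the zeroDay declared in the grapher display config.
zero_day = pd.Timestamp("1949-01-01")
df["days_since_1949"] = (df["publication_date"] - zero_day).dt.days
```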
+ ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch.meta.yml b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch.meta.yml new file mode 100644 index 00000000000..af50f790b40 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch.meta.yml @@ -0,0 +1,18 @@ +definitions: + common: + unit: '' + short_unit: '' + display: + zeroDay: '1949-01-01' + yearIsDay: true + +tables: + epoch: + variables: + max_compute: + title: Maximum compute + max_data: + title: Maximum data + max_parameters: + title: Maximum parameters + diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch.py new file mode 100644 index 00000000000..6db83e94816 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch.py @@ -0,0 +1,102 @@ +"""Load a garden dataset and create a grapher dataset.""" + +import owid.catalog.processing as pr +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch") + + # Read table from garden dataset. + tb = ds_garden["epoch"].reset_index() + # + # Process data. + # + # Extract year from 'publication_date' and create a new 'year' column + tb["year"] = tb["publication_date"].dt.year + + # For visualization purposes I am adding the rows with the maximum values of compute, data, and parameters in each year to the table as a separate "model". I don't want to do this in garden as it'd affect other datasets that depend on this one. + columns = { + "training_computation_petaflop": "compute", + "training_dataset_size__datapoints": "data", + "parameters": "parameters", + } + # Find maximum values for a given column (compute, data, params) per year, label them, and add summary rows. + for column, label in columns.items(): + tb = find_max_label_and_concat(tb, column, label) + + # Update metadata + for col in ["max_compute", "max_parameters", "max_data"]: + tb[col].metadata.origins = tb["model"].metadata.origins + + # Drop year as we don't need it anymore + tb = tb.drop("year", axis=1) + + # Rename for plotting model name as country in grapher + tb = tb.rename(columns={"model": "country", "days_since_1949": "year"}) + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], default_metadata=ds_garden.metadata, check_variables_metadata=True + ) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() + + +def find_max_label_and_concat(tb, column, label): + """ + Find maximum values for a given column per year, label them, and add summary rows. + + This function: + 1. Identifies rows with maximum values for the specified column in each year. + 2. Labels these maximum value rows in a new column using their original model names. + 3. Creates new summary rows for these maximum values. + 4. Adds these new summary rows to the original table. + + Note: + - Creates a new column named f"max_{label}" to indicate maximum values. + - Preserves original data and model names. + - Adds new summary rows with "model" set to f"Maximum {label}". 
+ """ + tb = tb.sort_values(by=["year"]) # Ensure the DataFrame is sorted by year + max_value = -float("inf") + rows_to_keep = [] + + for _, row in tb.iterrows(): + if not pd.isna(row[column]) and row[column] > max_value: + max_value = row[column] + rows_to_keep.append(row) + + tb_filtered = Table(rows_to_keep) + + idx = tb_filtered[[column, "year"]].fillna(0).groupby("year")[column].idxmax() + + tb_filtered[f"max_{label}"] = "Other" + tb_filtered.loc[idx, f"max_{label}"] = f"Maximum {label}" + + max_rows = tb_filtered.loc[idx].copy() + max_rows["model"] = f"Maximum {label}" + + tb = pr.concat([tb, max_rows], ignore_index=True) + + return tb diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.py new file mode 100644 index 00000000000..6582a86db80 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.py @@ -0,0 +1,41 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_aggregates_affiliation") + + # Read table from garden dataset. + tb = ds_garden["epoch_aggregates_affiliation"] + + # + # Process data. + # + # Rename for plotting research affiliation as country in grapher + tb = tb.rename_index_names( + { + "organization_categorization": "country", + } + ) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_countries.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_countries.py new file mode 100644 index 00000000000..658d7982804 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_countries.py @@ -0,0 +1,30 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_aggregates_countries") + + # Read table from garden dataset. + tb_garden = ds_garden["epoch_aggregates_countries"] + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. 
+ ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_domain.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_domain.py new file mode 100644 index 00000000000..fb2fa66d43b --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_domain.py @@ -0,0 +1,39 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_aggregates_domain") + + # Read table from garden dataset. + tb = ds_garden["epoch_aggregates_domain"] + # + # Process data. + # + # Rename for plotting model domain as country in grapher + tb = tb.rename_index_names( + { + "domain": "country", + } + ) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_organizations.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_organizations.py new file mode 100644 index 00000000000..f479f165881 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_organizations.py @@ -0,0 +1,38 @@ +"""Load a garden dataset and create a grapher dataset.""" +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_aggregates_organizations") + + # Read table from garden dataset. + tb = ds_garden["epoch_aggregates_organizations"] + # + # Process data. + # + # Rename for plotting model domain as country in grapher + tb = tb.rename_index_names( + { + "organization": "country", + } + ) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive.py new file mode 100644 index 00000000000..f6df4df4e55 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive.py @@ -0,0 +1,33 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_compute_intensive") + + # Read table from garden dataset. + tb = ds_garden["epoch_compute_intensive"] + + # + # Process data. 
+ # + # Rename for plotting model name as country in grapher + tb = tb.rename_index_names({"model": "country", "days_since_1949": "year"}) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.py new file mode 100644 index 00000000000..ef0aea55b10 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.py @@ -0,0 +1,30 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_compute_intensive_countries") + + # Read table from garden dataset. + tb_garden = ds_garden["epoch_compute_intensive_countries"] + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.py new file mode 100644 index 00000000000..efb5fea33ce --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.py @@ -0,0 +1,39 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_compute_intensive_domain") + + # Read table from garden dataset. + tb = ds_garden["epoch_compute_intensive_domain"] + # + # Process data. + # + # Rename for plotting model domain as country in grapher + tb = tb.rename_index_names( + { + "domain": "country", + } + ) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_organizations.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_organizations.py new file mode 100644 index 00000000000..9478c5e5e42 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_organizations.py @@ -0,0 +1,38 @@ +"""Load a garden dataset and create a grapher dataset.""" +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_compute_intensive_organizations") + + # Read table from garden dataset. + tb = ds_garden["epoch_compute_intensive_organizations"] + # + # Process data. + # + # Rename for plotting model domain as country in grapher + tb = tb.rename_index_names( + { + "organization": "country", + } + ) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_regressions.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_regressions.py new file mode 100644 index 00000000000..8c21dfbbc5a --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_regressions.py @@ -0,0 +1,33 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_regressions") + + # Read table from garden dataset. + tb = ds_garden["epoch_regressions"] + tb = tb.rename_index_names({"model": "country", "days_since_1949": "year"}) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], default_metadata=ds_garden.metadata, check_variables_metadata=True + ) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/climate/2024-07-23/sea_ice_anomalies_by_month.py b/etl/steps/data/grapher/climate/2024-07-23/sea_ice_anomalies_by_month.py index 9f04bb5edfc..b16879ec60b 100644 --- a/etl/steps/data/grapher/climate/2024-07-23/sea_ice_anomalies_by_month.py +++ b/etl/steps/data/grapher/climate/2024-07-23/sea_ice_anomalies_by_month.py @@ -110,7 +110,7 @@ def run(dest_dir: str) -> None: # # Load garden dataset. ds_garden = paths.load_dataset("sea_ice_index") - tb = ds_garden.read_table("sea_ice_index") + tb = ds_garden.read("sea_ice_index") # # Process data. diff --git a/etl/steps/data/grapher/climate/2024-07-23/sea_ice_extent_by_decade.py b/etl/steps/data/grapher/climate/2024-07-23/sea_ice_extent_by_decade.py index ebb8e2f063f..b7334ca78a4 100644 --- a/etl/steps/data/grapher/climate/2024-07-23/sea_ice_extent_by_decade.py +++ b/etl/steps/data/grapher/climate/2024-07-23/sea_ice_extent_by_decade.py @@ -148,7 +148,7 @@ def run(dest_dir: str) -> None: # # Load garden dataset. ds_garden = paths.load_dataset("sea_ice_index") - tb = ds_garden.read_table("sea_ice_index") + tb = ds_garden.read("sea_ice_index") # # Process data. diff --git a/etl/steps/data/grapher/climate/2024-07-23/sea_ice_extent_by_year.py b/etl/steps/data/grapher/climate/2024-07-23/sea_ice_extent_by_year.py index 940f0eb32f8..c9b2e0b2ae6 100644 --- a/etl/steps/data/grapher/climate/2024-07-23/sea_ice_extent_by_year.py +++ b/etl/steps/data/grapher/climate/2024-07-23/sea_ice_extent_by_year.py @@ -121,7 +121,7 @@ def run(dest_dir: str) -> None: # # Load garden dataset. 
ds_garden = paths.load_dataset("sea_ice_index") - tb = ds_garden.read_table("sea_ice_index") + tb = ds_garden.read("sea_ice_index") # # Process data. diff --git a/etl/steps/data/grapher/climate/2024-09-30/sea_ice_anomalies_by_month.py b/etl/steps/data/grapher/climate/2024-09-30/sea_ice_anomalies_by_month.py index 9f04bb5edfc..b16879ec60b 100644 --- a/etl/steps/data/grapher/climate/2024-09-30/sea_ice_anomalies_by_month.py +++ b/etl/steps/data/grapher/climate/2024-09-30/sea_ice_anomalies_by_month.py @@ -110,7 +110,7 @@ def run(dest_dir: str) -> None: # # Load garden dataset. ds_garden = paths.load_dataset("sea_ice_index") - tb = ds_garden.read_table("sea_ice_index") + tb = ds_garden.read("sea_ice_index") # # Process data. diff --git a/etl/steps/data/grapher/climate/2024-09-30/sea_ice_extent_by_decade.py b/etl/steps/data/grapher/climate/2024-09-30/sea_ice_extent_by_decade.py index ebb8e2f063f..b7334ca78a4 100644 --- a/etl/steps/data/grapher/climate/2024-09-30/sea_ice_extent_by_decade.py +++ b/etl/steps/data/grapher/climate/2024-09-30/sea_ice_extent_by_decade.py @@ -148,7 +148,7 @@ def run(dest_dir: str) -> None: # # Load garden dataset. ds_garden = paths.load_dataset("sea_ice_index") - tb = ds_garden.read_table("sea_ice_index") + tb = ds_garden.read("sea_ice_index") # # Process data. diff --git a/etl/steps/data/grapher/climate/2024-09-30/sea_ice_extent_by_year.py b/etl/steps/data/grapher/climate/2024-09-30/sea_ice_extent_by_year.py index 940f0eb32f8..c9b2e0b2ae6 100644 --- a/etl/steps/data/grapher/climate/2024-09-30/sea_ice_extent_by_year.py +++ b/etl/steps/data/grapher/climate/2024-09-30/sea_ice_extent_by_year.py @@ -121,7 +121,7 @@ def run(dest_dir: str) -> None: # # Load garden dataset. ds_garden = paths.load_dataset("sea_ice_index") - tb = ds_garden.read_table("sea_ice_index") + tb = ds_garden.read("sea_ice_index") # # Process data. diff --git a/etl/steps/data/grapher/climate/2024-11-18/climate_change_impacts_annual.py b/etl/steps/data/grapher/climate/2024-11-18/climate_change_impacts_annual.py new file mode 100644 index 00000000000..d2ce85e4a2d --- /dev/null +++ b/etl/steps/data/grapher/climate/2024-11-18/climate_change_impacts_annual.py @@ -0,0 +1,34 @@ +"""Load a garden dataset and create a grapher dataset. + +""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its annual table. + ds_garden = paths.load_dataset("climate_change_impacts") + tb_annual = ds_garden["climate_change_impacts_annual"].reset_index() + + # + # Process data. + # + # Create a country column (required by grapher). + tb_annual = tb_annual.rename(columns={"location": "country"}, errors="raise") + + # Set an appropriate index and sort conveniently. + tb_annual = tb_annual.set_index(["country", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new grapher dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb_annual], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/grapher/climate/2024-11-18/climate_change_impacts_monthly.py b/etl/steps/data/grapher/climate/2024-11-18/climate_change_impacts_monthly.py new file mode 100644 index 00000000000..c69428bae1b --- /dev/null +++ b/etl/steps/data/grapher/climate/2024-11-18/climate_change_impacts_monthly.py @@ -0,0 +1,37 @@ +"""Load a garden dataset and create a grapher dataset. 
+ +""" + +from etl.grapher_helpers import adapt_table_with_dates_to_grapher +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its monthly table. + ds_garden = paths.load_dataset("climate_change_impacts") + tb = ds_garden["climate_change_impacts_monthly"].reset_index() + + # + # Process data. + # + # Create a country column (required by grapher). + tb = tb.rename(columns={"location": "country"}, errors="raise") + + # Adapt table with dates to grapher requirements. + tb = adapt_table_with_dates_to_grapher(tb) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new grapher dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/grapher/climate/2024-11-18/sea_ice_anomalies_by_month.py b/etl/steps/data/grapher/climate/2024-11-18/sea_ice_anomalies_by_month.py new file mode 100644 index 00000000000..9c1bcfce4d5 --- /dev/null +++ b/etl/steps/data/grapher/climate/2024-11-18/sea_ice_anomalies_by_month.py @@ -0,0 +1,171 @@ +"""Load a garden dataset and create a grapher dataset.""" + + +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Minimum year to consider. +# This is chosen because the minimum year informed is 1978 (with only 2 months informed). +# NOTE: We could include 1979. But, for consistency between yearly and decadal data, we ignore this year. +YEAR_MIN = 1980 + +# For each month's sea ice extent, subtract a certain baseline sea ice extent, calculated as an average value (for that month) between two reference years (defined above as REFERENCE_YEAR_MIN and REFERENCE_YEAR_MAX). +# NOTE: Both min and max years are included. +REFERENCE_YEAR_MIN = 1981 +REFERENCE_YEAR_MAX = 2010 + + +def improve_metadata(tb: Table) -> Table: + tb = tb.copy() + + # Rename table. + tb.metadata.title = "Sea ice anomaly in the northern and southern hemispheres" + for column in tb.drop(columns=["country", "year"]).columns: + location = column.split("sea_ice_extent_")[-1].title() + title = f"Sea ice anomaly in the {location} by month" + description_short_yearly = f"Each point represents the monthly average sea ice extent relative to a baseline, which is the average sea ice extent for the same month over the {REFERENCE_YEAR_MAX-REFERENCE_YEAR_MIN+1}-year period from {REFERENCE_YEAR_MIN} to {REFERENCE_YEAR_MAX}." + footnote = ( + "All years have data for all 12 months, except 1987 and 1988 (each missing one month) and the current year." + ) + + # Name of data column (there is only one). + tb[column].metadata.title = title + tb[column].metadata.description_short = description_short_yearly + tb[column].metadata.presentation.title_public = title + # Set color for each entity. 
+ tb[column].metadata.presentation.grapher_config = { + "selectedEntityNames": [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + ], + # "selectedEntityColors": colors, + "originUrl": "https://ourworldindata.org/climate-change", + "note": footnote, + # "hideAnnotationFieldsInTitle": {"time": True}, + "entityType": "month", + "entityTypePlural": "months", + } + + return tb + + +def sanity_check_inputs(tb: Table) -> None: + error = "Expected 1978 to be the first year in the data. Data may have changed. Consider editing YEAR_MIN" + assert tb["year"].min() == 1978, error + + # All years should have 12 months except: + # * The very first year in the data (1978). + # * Years 1987 and 1988, that have 11 months (because 1987-12 and 1988-01 are missing). + # * The very last year in the data (since it's the ongoing year). + error = "Expected 12 months per year." + assert ( + tb[~tb["year"].isin([tb["year"].min(), 1987, 1988, tb["year"].max()])] + .groupby(["location", "year"]) + .count()["sea_ice_extent"] + == 12 + ).all(), error + # Each month-year should appear only once in the data. + error = "Repeated months." + assert (tb.groupby(["location", "year", "month"]).count()["sea_ice_extent"] == 1).all(), error + # Each month-decade should appear 10 times (one per year in the decade), except: + # * The very first decade (1970s), since it starts in 1978. This decade will be ignored in the decadal data. + # * January and December 1980s, that appear 9 times (because 1987-12 and 1988-01 are missing). + # * The very last decade (since it's the ongoing decade). + error = "Expected 10 instances of each month per decade (except in specific cases)." + exceptions = tb[ + (tb["decade"] == tb["decade"].min()) + | (tb["decade"] == tb["decade"].max()) + | ((tb["decade"] == 1980) & (tb["month"].isin([1, 12]))) + ].index + assert (tb.drop(exceptions).groupby(["location", "decade", "month"]).count()["sea_ice_extent"] == 10).all(), error + assert ( + tb[(tb["decade"] == 1980) & (tb["month"].isin([1, 12]))] + .groupby(["location", "decade", "month"]) + .count()["sea_ice_extent"] + == 9 + ).all(), error + assert ( + tb[(tb["decade"] == tb["decade"].max())].groupby(["location", "decade", "month"]).count()["sea_ice_extent"] + <= 10 + ).all(), error + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("sea_ice_index") + tb = ds_garden.read("sea_ice_index", safe_types=False) + + # + # Process data. + # + # Rename locations conveniently. + tb = tb.astype({"location": "string"}) + tb.loc[tb["location"] == "Northern Hemisphere", "location"] = "Arctic" + tb.loc[tb["location"] == "Southern Hemisphere", "location"] = "Antarctic" + assert set(tb["location"]) == {"Arctic", "Antarctic"}, "Unexpected locations." + + # Create columns for month, year, and decade. + tb["year"] = tb["date"].dt.year + tb["month"] = tb["date"].dt.month + tb["month_name"] = tb["date"].dt.strftime("%B") + tb["decade"] = (tb["year"] // 10) * 10 + + # Sanity checks. + sanity_check_inputs(tb=tb) + + # Select years after a certain minimum (see explanation above, where YEAR_MIN is defined) and a certain location. 
+ tb = ( + tb[(tb["year"] >= YEAR_MIN)] + .sort_values(["year", "month"], ascending=(False, True)) + .drop(columns=["date", "month", "decade"], errors="raise") + .reset_index(drop=True) + ) + + # For each month's sea ice extent, subtract a certain baseline sea ice extent, calculated as an average value (for that month) between two reference years (defined above as REFERENCE_YEAR_MIN and REFERENCE_YEAR_MAX) + tb_reference = ( + tb[(tb["year"] >= REFERENCE_YEAR_MIN) & (tb["year"] <= REFERENCE_YEAR_MAX)] + .groupby(["location", "month_name"], as_index=False) + .agg({"sea_ice_extent": "mean"}) + .rename(columns={"sea_ice_extent": "sea_ice_extent_reference"}, errors="raise") + ) + tb = tb.merge(tb_reference, on=["location", "month_name"], how="left") + tb["sea_ice_extent"] -= tb["sea_ice_extent_reference"] + tb = tb.drop(columns=["sea_ice_extent_reference"], errors="raise") + + # Create one column for each hemisphere. + tb = tb.pivot( + index=["year", "month_name"], columns=["location"], values=["sea_ice_extent"], join_column_levels_with="_" + ).underscore() + + # Adapt column names to grapher. + tb = tb.rename(columns={"month_name": "country"}, errors="raise") + + # Improve metadata. + tb = improve_metadata(tb=tb) + + # Improve format. + tb = tb.format() + + # + # Save outputs. + # + # Create a new grapher dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/grapher/climate/2024-11-18/sea_ice_extent_by_decade.py b/etl/steps/data/grapher/climate/2024-11-18/sea_ice_extent_by_decade.py new file mode 100644 index 00000000000..c0580da642e --- /dev/null +++ b/etl/steps/data/grapher/climate/2024-11-18/sea_ice_extent_by_decade.py @@ -0,0 +1,203 @@ +"""Load a garden dataset and create a grapher dataset.""" + +import re + +import owid.catalog.processing as pr +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Minimum year to consider. +# This is chosen because the minimum year informed is 1978 (with only 2 months informed). +# NOTE: We could include 1979. But, for consistency between yearly and decadal data, we ignore this year. +YEAR_MIN = 1980 + + +def create_yearly_table(tb: Table) -> Table: + tb_yearly = tb.copy() + + tb_yearly = tb_yearly[tb_yearly["year"] == tb_yearly["year"].max()].reset_index(drop=True) + tb_yearly = tb_yearly.drop(columns=["decade"], errors="raise").rename( + columns={"year": "country", "month": "year"}, errors="raise" + ) + + return tb_yearly + + +def create_decadal_table(tb: Table) -> Table: + tb_decadal = tb.copy() + + # Calculate the sea ice extent of each month, averaged over the same 10 months of each decade. + # For example, January 1990 will be the average sea ice extent of the 10 months of January between 1990 and 1999. + tb_decadal["decade"] = tb_decadal["decade"].astype("string") + "s" + tb_decadal = tb_decadal.groupby(["month", "decade"], observed=True, as_index=False).agg( + {"sea_ice_extent_arctic": "mean", "sea_ice_extent_antarctic": "mean"} + ) + tb_decadal = tb_decadal.rename(columns={"decade": "country", "month": "year"}, errors="raise") + + return tb_decadal + + +def improve_metadata(tb: Table) -> Table: + tb = tb.astype({"country": "string"}).copy() + + # Gather years in the data, and assign colors to them. 
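The baseline subtraction in sea_ice_anomalies_by_month.py above (average the REFERENCE_YEAR_MIN to REFERENCE_YEAR_MAX rows per location and month, merge the result back, subtract) is the core of the anomaly calculation. The same idea in a compact, runnable form on toy data:

import pandas as pd

tb = pd.DataFrame(
    {
        "location": ["Arctic"] * 4,
        "month_name": ["January", "January", "February", "February"],
        "year": [1985, 2000, 1985, 2000],
        "sea_ice_extent": [15.0, 14.0, 16.0, 15.5],
    }
)

# Average extent per (location, month) over the reference window.
reference = (
    tb[tb["year"].between(1981, 2010)]
    .groupby(["location", "month_name"], as_index=False)["sea_ice_extent"]
    .mean()
    .rename(columns={"sea_ice_extent": "sea_ice_extent_reference"})
)

# Merging the baseline back and subtracting turns absolute extent into an anomaly.
tb = tb.merge(reference, on=["location", "month_name"], how="left")
tb["sea_ice_extent"] -= tb["sea_ice_extent_reference"]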
+ colors = {} + columns = [str(year) for year in set(tb["country"])] + years = [int(re.findall(r"\d{4}", column)[0]) for column in columns] + for year, column in zip(years, columns): + if 1980 <= year < 1990: + # Light blue. + color = "#CCE5FF" + elif 1990 <= year < 2000: + # Medium light blue. + color = "#99CCFF" + elif 2000 <= year < 2010: + # Medium blue. + color = "#6699FF" + elif 2010 <= year < 2020: + # Darker blue. + color = "#3366FF" + elif year == max(years): + # Black. + color = "#000000" + else: + # Red. + color = "#F89B9B" + colors[column] = color + + # Rename table. + tb.metadata.title = "Sea ice extent in the northern and southern hemispheres by decade" + + for column in tb.drop(columns=["country", "year"]).columns: + location = column.split("sea_ice_extent_")[-1].title() + title = f"Monthly sea ice extent in the {location}, decadal average" + description_short = ( + "Each point represents the monthly average sea ice extent, averaged across all years within the decade." + ) + subtitle = ( + description_short + + " The current decade is highlighted in red, with the current year shown in black for comparison." + ) + footnote = "The horizontal axis shows months from January (1) to December (12). All years have data for all 12 months, except 1987 and 1988 (each missing one month) and the current year." + + tb[column].metadata.title = title + tb[column].metadata.description_short = description_short + tb[column].metadata.presentation.title_public = title + tb[column].metadata.presentation.grapher_config = { + "subtitle": subtitle, + "note": footnote, + "selectedEntityNames": columns, + "selectedEntityColors": colors, + "originUrl": "https://ourworldindata.org/climate-change", + "hideAnnotationFieldsInTitle": {"time": True}, + "entityType": "year", + "entityTypePlural": "years", + } + + return tb + + +def sanity_check_inputs(tb: Table) -> None: + error = "Expected 1978 to be the first year in the data. Data may have changed. Consider editing YEAR_MIN" + assert tb["year"].min() == 1978, error + + # All years should have 12 months except: + # * The very first year in the data (1978). + # * Years 1987 and 1988, that have 11 months (because 1987-12 and 1988-01 are missing). + # * The very last year in the data (since it's the ongoing year). + error = "Expected 12 months per year." + assert ( + tb[~tb["year"].isin([tb["year"].min(), 1987, 1988, tb["year"].max()])] + .groupby(["location", "year"]) + .count()["sea_ice_extent"] + == 12 + ).all(), error + # Each month-year should appear only once in the data. + error = "Repeated months." + assert (tb.groupby(["location", "year", "month"]).count()["sea_ice_extent"] == 1).all(), error + # Each month-decade should appear 10 times (one per year in the decade), except: + # * The very first decade (1970s), since it starts in 1978. This decade will be ignored in the decadal data. + # * January and December 1980s, that appear 9 times (because 1987-12 and 1988-01 are missing). + # * The very last decade (since it's the ongoing decade). + error = "Expected 10 instances of each month per decade (except in specific cases)." 
+ exceptions = tb[ + (tb["decade"] == tb["decade"].min()) + | (tb["decade"] == tb["decade"].max()) + | ((tb["decade"] == 1980) & (tb["month"].isin([1, 12]))) + ].index + assert (tb.drop(exceptions).groupby(["location", "decade", "month"]).count()["sea_ice_extent"] == 10).all(), error + assert ( + tb[(tb["decade"] == 1980) & (tb["month"].isin([1, 12]))] + .groupby(["location", "decade", "month"]) + .count()["sea_ice_extent"] + == 9 + ).all(), error + assert ( + tb[(tb["decade"] == tb["decade"].max())].groupby(["location", "decade", "month"]).count()["sea_ice_extent"] + <= 10 + ).all(), error + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("sea_ice_index") + tb = ds_garden.read("sea_ice_index", safe_types=False) + + # + # Process data. + # + # Rename locations conveniently. + tb = tb.astype({"location": "string"}) + tb.loc[tb["location"] == "Northern Hemisphere", "location"] = "Arctic" + tb.loc[tb["location"] == "Southern Hemisphere", "location"] = "Antarctic" + assert set(tb["location"]) == {"Arctic", "Antarctic"}, "Unexpected locations." + + # Create columns for month, year, and decade. + tb["year"] = tb["date"].dt.year + tb["month"] = tb["date"].dt.month + tb["decade"] = (tb["year"] // 10) * 10 + + # Sanity checks. + sanity_check_inputs(tb=tb) + + # Select years after a certain minimum (see explanation above, where YEAR_MIN is defined) and a certain location. + tb = ( + tb[(tb["year"] >= YEAR_MIN)] + .sort_values(["year", "month"], ascending=(False, True)) + .drop(columns=["date"], errors="raise") + .reset_index(drop=True) + ) + + # Create one column for each hemisphere. + tb = tb.pivot( + index=["year", "decade", "month"], columns=["location"], values=["sea_ice_extent"], join_column_levels_with="_" + ).underscore() + + # Create yearly table, adapted to grapher. + tb_yearly = create_yearly_table(tb=tb) + + # Create decadal table, adapted to grapher. + tb_decadal = create_decadal_table(tb=tb) + + # Combine both tables (take decadal data prior to 2020, and individual years from 2020 on). + tb_combined = pr.concat([tb_decadal, tb_yearly], ignore_index=True) + + # Improve metadata. + tb_combined = improve_metadata(tb=tb_combined) + + # Improve format. + tb_combined = tb_combined.format(sort_rows=False) + + # + # Save outputs. + # + # Create a new grapher dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb_combined], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/grapher/climate/2024-11-18/sea_ice_extent_by_year.py b/etl/steps/data/grapher/climate/2024-11-18/sea_ice_extent_by_year.py new file mode 100644 index 00000000000..34899676ced --- /dev/null +++ b/etl/steps/data/grapher/climate/2024-11-18/sea_ice_extent_by_year.py @@ -0,0 +1,170 @@ +"""Load a garden dataset and create a grapher dataset.""" + +import re + +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Minimum year to consider. +# This is chosen because the minimum year informed is 1978 (with only 2 months informed). +# NOTE: We could include 1979. But, for consistency between yearly and decadal data, we ignore this year. +YEAR_MIN = 1980 + + +def improve_metadata(tb: Table) -> Table: + tb = tb.copy() + + # Gather years in the data, and assign colors to them. 
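Both improve_metadata functions bucket year labels into decade-based shades with the same if/elif ladder, which follows below. Read as a lookup function, the scheme is (same hex values as the step; the helper name is ours):

def decade_color(year: int, latest_year: int) -> str:
    # Decade blues first (matching the ladder's order), then black for the latest
    # year, and red for the remaining years of the current decade.
    palette = {1980: "#CCE5FF", 1990: "#99CCFF", 2000: "#6699FF", 2010: "#3366FF"}
    decade = (year // 10) * 10
    if decade in palette:
        return palette[decade]
    if year == latest_year:
        return "#000000"
    return "#F89B9B"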
+ colors = {} + columns = [str(year) for year in sorted(set(tb["country"]), reverse=True)] + years = [int(re.findall(r"\d{4}", column)[0]) for column in columns] + for year, column in zip(years, columns): + if 1980 <= year < 1990: + # Light blue. + color = "#CCE5FF" + elif 1990 <= year < 2000: + # Medium light blue. + color = "#99CCFF" + elif 2000 <= year < 2010: + # Medium blue. + color = "#6699FF" + elif 2010 <= year < 2020: + # Darker blue. + color = "#3366FF" + elif year == max(years): + # Black. + color = "#000000" + else: + # Red. + color = "#F89B9B" + colors[column] = color + + # Rename table. + tb.metadata.title = "Sea ice extent in the northern and southern hemispheres by year" + for column in tb.drop(columns=["country", "year"]).columns: + location = column.split("sea_ice_extent_")[-1].title() + title = f"Monthly sea ice extent in the {location}" + description_short = "Each point represents the monthly average sea ice extent." + subtitle = ( + description_short + + " Years in the current decade are highlighted in red, with the current year highlighted in black." + ) + footnote = ( + "All years have data for all 12 months, except 1987 and 1988 (each missing one month) and the current year." + ) + + tb[column].metadata.title = title + tb[column].metadata.description_short = description_short + tb[column].metadata.presentation.title_public = title + tb[column].metadata.presentation.grapher_config = { + "subtitle": subtitle, + "note": footnote, + "selectedEntityNames": columns, + "selectedEntityColors": colors, + "originUrl": "https://ourworldindata.org/climate-change", + "hideAnnotationFieldsInTitle": {"time": True}, + "entityType": "year", + "entityTypePlural": "years", + } + + return tb + + +def sanity_check_inputs(tb: Table) -> None: + error = "Expected 1978 to be the first year in the data. Data may have changed. Consider editing YEAR_MIN" + assert tb["year"].min() == 1978, error + + # All years should have 12 months except: + # * The very first year in the data (1978). + # * Years 1987 and 1988, that have 11 months (because 1987-12 and 1988-01 are missing). + # * The very last year in the data (since it's the ongoing year). + error = "Expected 12 months per year." + assert ( + tb[~tb["year"].isin([tb["year"].min(), 1987, 1988, tb["year"].max()])] + .groupby(["location", "year"]) + .count()["sea_ice_extent"] + == 12 + ).all(), error + # Each month-year should appear only once in the data. + error = "Repeated months." + assert (tb.groupby(["location", "year", "month"]).count()["sea_ice_extent"] == 1).all(), error + # Each month-decade should appear 10 times (one per year in the decade), except: + # * The very first decade (1970s), since it starts in 1978. This decade will be ignored in the decadal data. + # * January and December 1980s, that appear 9 times (because 1987-12 and 1988-01 are missing). + # * The very last decade (since it's the ongoing decade). + error = "Expected 10 instances of each month per decade (except in specific cases)." 
+ exceptions = tb[ + (tb["decade"] == tb["decade"].min()) + | (tb["decade"] == tb["decade"].max()) + | ((tb["decade"] == 1980) & (tb["month"].isin([1, 12]))) + ].index + assert (tb.drop(exceptions).groupby(["location", "decade", "month"]).count()["sea_ice_extent"] == 10).all(), error + assert ( + tb[(tb["decade"] == 1980) & (tb["month"].isin([1, 12]))] + .groupby(["location", "decade", "month"]) + .count()["sea_ice_extent"] + == 9 + ).all(), error + assert ( + tb[(tb["decade"] == tb["decade"].max())].groupby(["location", "decade", "month"]).count()["sea_ice_extent"] + <= 10 + ).all(), error + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("sea_ice_index") + tb = ds_garden.read("sea_ice_index", safe_types=False) + + # + # Process data. + # + # Rename locations conveniently. + tb = tb.astype({"location": "string"}) + tb.loc[tb["location"] == "Northern Hemisphere", "location"] = "Arctic" + tb.loc[tb["location"] == "Southern Hemisphere", "location"] = "Antarctic" + assert set(tb["location"]) == {"Arctic", "Antarctic"}, "Unexpected locations." + + # Create columns for month, year, and decade. + tb["year"] = tb["date"].dt.year + tb["month"] = tb["date"].dt.month + tb["decade"] = (tb["year"] // 10) * 10 + + # Sanity checks. + sanity_check_inputs(tb=tb) + + # Select years after a certain minimum (see explanation above, where YEAR_MIN is defined) and a certain location. + tb = ( + tb[(tb["year"] >= YEAR_MIN)] + .sort_values(["year", "month"], ascending=(False, True)) + .drop(columns=["date"], errors="raise") + .reset_index(drop=True) + ) + + # Create one column for each hemisphere. + tb = tb.pivot( + index=["year", "month"], columns=["location"], values=["sea_ice_extent"], join_column_levels_with="_" + ).underscore() + + # Create yearly table, adapted column names to grapher. + tb = tb.rename(columns={"year": "country", "month": "year"}, errors="raise") + + # Improve metadata. + tb = improve_metadata(tb=tb) + + # Improve format. + tb = tb.format() + + # + # Save outputs. + # + # Create a new grapher dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/grapher/climate/2024-11-19/total_precipitation_annual.meta.yml b/etl/steps/data/grapher/climate/2024-11-19/total_precipitation_annual.meta.yml new file mode 100644 index 00000000000..ac3e8664b4d --- /dev/null +++ b/etl/steps/data/grapher/climate/2024-11-19/total_precipitation_annual.meta.yml @@ -0,0 +1,44 @@ +definitions: + common: + unit: millimeters + short_unit: mm + presentation: + topic_tags: + - Climate Change + display: + numDecimalPlaces: 0 + description_from_producer: + This parameter is the accumulated liquid and frozen water, comprising rain and snow, that falls to the Earth's surface. It is the sum of large-scale precipitation and convective precipitation. Large-scale precipitation is generated by the cloud scheme in the ECMWF Integrated Forecasting System (IFS). The cloud scheme represents the formation and dissipation of clouds and large-scale precipitation due to changes in atmospheric quantities (such as pressure, temperature and moisture) predicted directly by the IFS at spatial scales of the grid box or larger. Convective precipitation is generated by the convection scheme in the IFS, which represents convection at spatial scales smaller than the grid box. 
This parameter does not include fog, dew or the precipitation that evaporates in the atmosphere before it lands at the surface of the Earth. This parameter is accumulated over a particular time period which depends on the data extracted. For the monthly averaged reanalysis and the monthly averaged ensemble members, the accumulation period is 1 day. For the monthly averaged reanalysis by hour of day, the accumulation period is 1 hour and for the monthly averaged ensemble members by hour of day, the accumulation period is 3 hours. The units of this parameter are depth in metres of water equivalent. It is the depth the water would have if it were spread evenly over the grid box. Care should be taken when comparing model parameters with observations, because observations are often local to a particular point in space and time, rather than representing averages over a model grid box. + processing_level: major + common_processing: |- + - Initially, the dataset is provided with specific coordinates in terms of longitude and latitude. To tailor this data to each country, we use geographical boundaries as defined by the World Bank. The method involves trimming the precipitation dataset to match the exact geographical shape of each country. To correct for potential distortions caused by projecting the Earth's curved surface onto a flat map, we apply a latitude-based weighting. This step is essential for maintaining accuracy, particularly in high-latitude regions where distortion is more pronounced. The result of this process is a latitude-weighted average precipitation for each nation. + - It’s important to note, however, that due to the resolution constraints of the Copernicus dataset, this methodology might not be as effective for countries with very small landmasses. In such cases, the process may not yield reliable data. + - The derived precipitation for each country is calculated based on administrative borders, encompassing all land surface types within these areas. As a result, precipitation over oceans and seas is not included in these averages, keeping the data focused on terrestrial environments. + - Global precipitation averages and anomalies, however, are calculated over both land and ocean surfaces. + precipitation_anomaly: |- + - The precipitation anomaly is calculated by comparing the average precipitation of a specific time period (e.g., a particular year or month) to the average surface precipitation of the same period from 1991 to 2020. + - When calculating anomalies for each country, the total precipitation of a given year or month is compared to the 1991-2020 average precipitation for that specific country. + - The reason for using the 1991-2020 period as the reference mean is that it is the standard reference period used by our data source, the Copernicus Climate Change Service. This period is also adopted by the UK Met Office. This approach ensures consistency in identifying climate variations over time. + + +dataset: + title: Annual precipitation and anomalies by country + update_period_days: 180 +tables: + total_precipitation: + variables: + total_precipitation: + title: Annual precipitation + description_short: Total annual precipitation—rain and snow—calculated as the sum of daily averages, reported as the depth of water falling to Earth's surface, excluding fog and dew. 
+ description_processing: |- + {definitions.common_processing} + {definitions.precipitation_anomaly} + + + precipitation_anomaly: + title: Annual precipitation anomaly + description_short: The difference in a specific year's total precipitation—rain and snow—from the 1991–2020 average, measured in millimeters, excluding fog and dew. + description_processing: |- + {definitions.common_processing} + {definitions.precipitation_anomaly} + diff --git a/etl/steps/data/grapher/climate/2024-11-19/total_precipitation_annual.py b/etl/steps/data/grapher/climate/2024-11-19/total_precipitation_annual.py new file mode 100644 index 00000000000..ca01c8c5837 --- /dev/null +++ b/etl/steps/data/grapher/climate/2024-11-19/total_precipitation_annual.py @@ -0,0 +1,48 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +# Year with incomplete data +INCOMPLETE_YEAR = 2024 + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("total_precipitation") + tb = ds_garden["total_precipitation"].reset_index() + + # + # Process data. + # + + # Get the year + tb["year"] = tb["time"].astype(str).str[0:4] + + # Group by year and sum the specified columns + tb = ( + tb.groupby(["year", "country"]) + .agg({"total_precipitation": "sum", "precipitation_anomaly": "sum"}) + .reset_index() + ) + + # Remove rows where the year is 2024 as it's incomplete + tb["year"] = tb["year"].astype(int) + tb = tb[tb["year"] != INCOMPLETE_YEAR] + + tb = tb.format(["year", "country"]) + + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], default_metadata=ds_garden.metadata, check_variables_metadata=True + ) + + ds_grapher.save() diff --git a/etl/steps/data/grapher/climate/latest/wildfires_by_year.meta.yml b/etl/steps/data/grapher/climate/latest/wildfires_by_year.meta.yml index b43c1e15c20..4ee055aa2bd 100644 --- a/etl/steps/data/grapher/climate/latest/wildfires_by_year.meta.yml +++ b/etl/steps/data/grapher/climate/latest/wildfires_by_year.meta.yml @@ -1,5 +1,17 @@ -# Learn more about the available fields: -# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Climate Change + processing_level: minor + display: + numDecimalPlaces: 0 + desc_wildfires: &desc_wildfires + - Wildfires are detected through the use of satellite imagery obtained from MODIS (Moderate Resolution Imaging Spectroradiometer) and VIIRS (Visible Infrared Imaging Radiometer Suite). These satellite systems are capable of identifying thermal anomalies and alterations in landscape patterns, which are indicative of burning. + - The data provider is presently engaged in a global accuracy assessment and acknowledges that it might be underestimating the genuine impact of wildfires, primarily due to constraints imposed by the spatial resolution of the sensors it employs. + desc_update: The 2024 data is incomplete and was last updated {date_accessed}.
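The {date_accessed} placeholder in desc_update above is filled at build time: the wildfires_by_year.py hunk further down passes yaml_params={"date_accessed": last_date_accessed(tb)} to create_dataset. The substitution amounts to ordinary named formatting, roughly:

# Rough sketch of the idea behind yaml_params: the metadata text carries a named
# placeholder that the step fills in when the dataset is created.
desc_update = "The 2024 data is incomplete and was last updated {date_accessed}."
print(desc_update.format(date_accessed="2024-11-19"))  # date purely illustrative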
+ tables: weekly_wildfires: # Learn more about the available fields: @@ -37,9 +49,24 @@ tables: area_ha_per_wildfire: title: Annual area burnt per wildfire + unit: hectares + short_unit: ha + description_short: The average area burnt per [wildfire](#dod:wildfires), in hectares. {definitions.desc_update} + description_processing: The area burnt per wildfire is calculated by dividing the area burnt by wildfires by the number of fires. + description_key: *desc_wildfires co2_ha_per_area: - title: Annual carbon dioxide emissions per area burnt + title: Carbon dioxide emissions per hectare burnt + unit: tonnes + short_unit: t + description_short: Carbon dioxide emissions per hectare burnt by [wildfires](#dod:wildfires), in tonnes. {definitions.desc_update} + description_processing: Carbon dioxide emissions per hectare are calculated by dividing the carbon dioxide emissions by the area burnt by wildfires. + description_key: *desc_wildfires pm2_5_ha_per_area: - title: Annual PM2.5 emissions per area burnt \ No newline at end of file + title: PM2.5 emissions per hectare burnt + unit: tonnes + short_unit: t + description_short: PM2.5 emissions per hectare burnt by [wildfires](#dod:wildfires), in tonnes. {definitions.desc_update} + description_processing: PM2.5 emissions per hectare are calculated by dividing the PM2.5 emissions by the area burnt by wildfires. + description_key: *desc_wildfires diff --git a/etl/steps/data/grapher/climate/latest/wildfires_by_year.py b/etl/steps/data/grapher/climate/latest/wildfires_by_year.py index e5b6a77fadd..d15412f0821 100644 --- a/etl/steps/data/grapher/climate/latest/wildfires_by_year.py +++ b/etl/steps/data/grapher/climate/latest/wildfires_by_year.py @@ -1,8 +1,9 @@ """Load a garden dataset and create a grapher dataset.""" +import numpy as np import owid.catalog.processing as pr -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, last_date_accessed # Get paths and naming conventions for current step. paths = PathFinder(__file__) @@ -22,20 +23,7 @@ def run(dest_dir: str) -> None: # Get the year tb["year"] = tb["date"].astype(str).str[0:4] - tb = tb[ - [ - "country", - "year", - "area_ha", - "events", - "pm2_5", - "co2", - "share_area_ha", - "area_ha_per_wildfire", - "co2_ha_per_area", - "pm2_5_ha_per_area", - ] - ] + tb = tb[["country", "year", "area_ha", "events", "pm2_5", "co2", "share_area_ha"]] # Aggregate the data by year and country (ignore missing values when summing the columns) tb_annual_sum = tb.groupby(["country", "year"]).sum(min_count=1).reset_index() @@ -49,13 +37,30 @@ def run(dest_dir: str) -> None: ) for col in ["area_ha", "events", "pm2_5", "co2", "share_area_ha"]: tb_cumulative = tb_cumulative.rename(columns={col: col + "_cumulative"}) + tb = pr.merge(tb_annual_sum, tb_cumulative, on=["year", "country"]) + # Area per wildfire + tb["area_ha_per_wildfire"] = tb["area_ha"] / tb["events"] + + tb["co2_ha_per_area"] = tb["co2"] / tb["area_ha"] + tb["pm2_5_ha_per_area"] = tb["pm2_5"] / tb["area_ha"] + + tb[["co2_ha_per_area", "pm2_5_ha_per_area"]] = tb[["co2_ha_per_area", "pm2_5_ha_per_area"]].replace( + [float("inf"), -float("inf")], np.nan + ) + tb = tb.set_index(["country", "year"], verify_integrity=True) # Save outputs. # # Create a new grapher dataset with the same metadata as the garden dataset.
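A note on the division steps above: dividing by area_ha yields inf wherever a country-year has zero burnt area, which is why the hunk replaces infinities with NaN before setting the index. A minimal illustration:

import numpy as np
import pandas as pd

tb = pd.DataFrame({"co2": [100.0, 50.0], "area_ha": [20.0, 0.0]})

# A zero denominator produces inf rather than raising an error.
tb["co2_ha_per_area"] = tb["co2"] / tb["area_ha"]  # [5.0, inf]

# Replacing +/-inf with NaN keeps the column usable downstream.
tb["co2_ha_per_area"] = tb["co2_ha_per_area"].replace([np.inf, -np.inf], np.nan)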
- ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + ds_grapher = create_dataset( + dest_dir, + tables=[tb], + default_metadata=ds_garden.metadata, + yaml_params={"date_accessed": last_date_accessed(tb)}, + ) + ds_grapher.metadata.title = "Seasonal wildfire trends by year" ds_grapher.save() diff --git a/etl/steps/data/grapher/climate_watch/2024-11-21/emissions_by_sector.py b/etl/steps/data/grapher/climate_watch/2024-11-21/emissions_by_sector.py new file mode 100644 index 00000000000..d6caf97b9e1 --- /dev/null +++ b/etl/steps/data/grapher/climate_watch/2024-11-21/emissions_by_sector.py @@ -0,0 +1,51 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Convert million tonnes to tonnes. +MT_TO_T = 1e6 + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("emissions_by_sector") + + # + # Process data. + # + # Process each table in the dataset. + tables = [] + for table_name in ds_garden.table_names: + tb = ds_garden[table_name].copy() + + # Drop unnecessary columns. + tb = tb.drop(columns=["population"], errors="raise") + + # For convenience, change units from "million tonnes" to "tonnes" and multiply all variables by a million. + # Doing this, grapher will know when to use the word "million" and when to use "billion". + for column in tb.columns: + if tb[column].metadata.unit == "million tonnes": + tb[column].metadata.unit = "tonnes" + tb[column].metadata.short_unit = "t" + tb[column] *= MT_TO_T + tb[column].metadata.description_short = tb[column].metadata.description_short.replace( + "million tonnes", "tonnes" + ) + + # Add current table to the list. + tables.append(tb) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, default_metadata=ds_garden.metadata, tables=tables, check_variables_metadata=True + ) + ds_grapher.save() diff --git a/etl/steps/data/grapher/countries/2024-08-27/gleditsch.py b/etl/steps/data/grapher/countries/2024-08-27/gleditsch.py index 6c5628bf200..b57e28447ba 100644 --- a/etl/steps/data/grapher/countries/2024-08-27/gleditsch.py +++ b/etl/steps/data/grapher/countries/2024-08-27/gleditsch.py @@ -32,8 +32,8 @@ def run(dest_dir: str) -> None: column_index = ["year", "country"] tb_countries = expand_time_column( tb_countries, - ["country"], - "year", + dimension_col=["country"], + time_col="year", method="full_range", fillna_method="zero", ) diff --git a/etl/steps/data/grapher/covid/2024-11-05/github_stats.py b/etl/steps/data/grapher/covid/2024-11-05/github_stats.py new file mode 100644 index 00000000000..bac832dbb18 --- /dev/null +++ b/etl/steps/data/grapher/covid/2024-11-05/github_stats.py @@ -0,0 +1,40 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("github_stats") + + # + # Process data. + # + tb_usr_contrib = ds_garden.read("user_contributions") + tb_contrib = ds_garden.read("contributions") + + # Add entity + tb_usr_contrib["country"] = "World" + tb_contrib["country"] = "World" + + # + # Save outputs. 
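The emissions_by_sector step above rescales every "million tonnes" column to tonnes and rewrites the unit metadata in the same loop, so values and declared units cannot drift apart. The core pattern, sketched with a plain dict standing in for owid.catalog column metadata:

MT_TO_T = 1e6

# A plain dict stands in for the owid.catalog Table and its column metadata here.
columns = {"co2": {"unit": "million tonnes", "values": [1.2, 3.4]}}

for name, col in columns.items():
    if col["unit"] == "million tonnes":
        # Rescale the values and relabel the unit in the same pass.
        col["values"] = [v * MT_TO_T for v in col["values"]]
        col["unit"] = "tonnes"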
+ # + tables = [ + tb_contrib.format(["country", "date", "interval"]), + tb_usr_contrib.format(["country", "date", "interval"]), + ] + + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/covid/latest/countries_reporting.py b/etl/steps/data/grapher/covid/latest/countries_reporting.py new file mode 100644 index 00000000000..8d9d1c11080 --- /dev/null +++ b/etl/steps/data/grapher/covid/latest/countries_reporting.py @@ -0,0 +1,29 @@ +"""Load a garden dataset and create a grapher dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("countries_reporting") + + # Read table from garden dataset. + tables = list(ds_garden) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/covid/latest/vaccinations_us.py b/etl/steps/data/grapher/covid/latest/vaccinations_us.py index dbb0ad8fde2..3c1d0eaee0a 100644 --- a/etl/steps/data/grapher/covid/latest/vaccinations_us.py +++ b/etl/steps/data/grapher/covid/latest/vaccinations_us.py @@ -1,7 +1,5 @@ """Load a garden dataset and create a grapher dataset.""" -from shared import to_grapher_date - from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. @@ -21,9 +19,6 @@ def run(dest_dir: str) -> None: # # Process data. # - # Grapher date - tb = to_grapher_date(tb, "2021-01-01") - # Rename state -> country for grapher tb = tb.rename_index_names( { diff --git a/etl/steps/data/grapher/demography/2023-03-31/population.py b/etl/steps/data/grapher/demography/2023-03-31/population.py index 142566154a4..f29d8d6ddd4 100644 --- a/etl/steps/data/grapher/demography/2023-03-31/population.py +++ b/etl/steps/data/grapher/demography/2023-03-31/population.py @@ -3,7 +3,6 @@ from copy import deepcopy from typing import Any, List -import numpy as np from owid.catalog import Table from etl.helpers import PathFinder, create_dataset @@ -172,10 +171,6 @@ def _create_metric_version_from_mask( Table Table with the new column. 
""" - # Get dtype - dtype = table[metric].dtype - if np.issubdtype(table[metric].dtype, np.integer): - dtype = "Int64" metric_new = f"{metric}_{metric_suffix}" table.loc[mask, metric_new] = deepcopy(table.loc[mask, metric]) table[metric_new].metadata = deepcopy(table[metric].metadata) @@ -190,4 +185,8 @@ def _create_metric_version_from_mask( display_name = table[metric_new].metadata.title table[metric_new].metadata.display["name"] = f"{display_name} {display_name_suffix}" table[metric_new].metadata.description = description + # Get dtype + dtype = table[metric].dtype + if "int" in str(dtype).lower(): + dtype = "Int64" return table.astype({metric_new: dtype}) diff --git a/etl/steps/data/grapher/demography/2024-11-26/multiple_births.py b/etl/steps/data/grapher/demography/2024-11-26/multiple_births.py new file mode 100644 index 00000000000..4940b624e48 --- /dev/null +++ b/etl/steps/data/grapher/demography/2024-11-26/multiple_births.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("multiple_births") + + # Read table from garden dataset. + tb = ds_garden.read("multiple_births", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/demography/2024-12-02/survivor_percentiles.py b/etl/steps/data/grapher/demography/2024-12-02/survivor_percentiles.py new file mode 100644 index 00000000000..1e319eaee4c --- /dev/null +++ b/etl/steps/data/grapher/demography/2024-12-02/survivor_percentiles.py @@ -0,0 +1,35 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("survivor_percentiles") + + # Read table from garden dataset. + + # + # Process data. + # + tables = list(ds_garden) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, + tables=tables, + check_variables_metadata=True, + default_metadata=ds_garden.metadata, + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/demography/2024-12-03/birth_rate.py b/etl/steps/data/grapher/demography/2024-12-03/birth_rate.py new file mode 100644 index 00000000000..dc16db838f9 --- /dev/null +++ b/etl/steps/data/grapher/demography/2024-12-03/birth_rate.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("birth_rate") + + # Read table from garden dataset. + tb = ds_garden.read("birth_rate", reset_index=False) + + # + # Save outputs. 
+ # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/demography/2024-12-03/broken_limits_le.py b/etl/steps/data/grapher/demography/2024-12-03/broken_limits_le.py new file mode 100644 index 00000000000..5c223bd8430 --- /dev/null +++ b/etl/steps/data/grapher/demography/2024-12-03/broken_limits_le.py @@ -0,0 +1,32 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("broken_limits_le") + + # Read table from garden dataset. + tb = ds_garden["broken_limits_le"] + + # + # Process data. + # + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/demography/2024-12-03/fertility_rate.py b/etl/steps/data/grapher/demography/2024-12-03/fertility_rate.py new file mode 100644 index 00000000000..fd49f86a998 --- /dev/null +++ b/etl/steps/data/grapher/demography/2024-12-03/fertility_rate.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("fertility_rate") + + # Read table from garden dataset. + tb = ds_garden.read("fertility_rate", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/demography/2024-12-03/gini_le.py b/etl/steps/data/grapher/demography/2024-12-03/gini_le.py new file mode 100644 index 00000000000..27d2649971e --- /dev/null +++ b/etl/steps/data/grapher/demography/2024-12-03/gini_le.py @@ -0,0 +1,32 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("gini_le") + + # Read table from garden dataset. + tables = list(ds_garden) + + # + # Process data. + # + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. 
+ ds_grapher.save() diff --git a/etl/steps/data/grapher/demography/2024-12-03/life_expectancy.py b/etl/steps/data/grapher/demography/2024-12-03/life_expectancy.py new file mode 100644 index 00000000000..1e23557767e --- /dev/null +++ b/etl/steps/data/grapher/demography/2024-12-03/life_expectancy.py @@ -0,0 +1,32 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("life_expectancy") + + # Read table from garden dataset. + tables = list(ds_garden) + + # + # Process data. + # + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/demography/2024-12-03/life_tables.py b/etl/steps/data/grapher/demography/2024-12-03/life_tables.py new file mode 100644 index 00000000000..53efc672ef1 --- /dev/null +++ b/etl/steps/data/grapher/demography/2024-12-03/life_tables.py @@ -0,0 +1,69 @@ +"""There is some work to filter only those indicators and dimensions that are relevant for the grapher. + +That is, we may just want a subset of the indicators, and few single-age groups. +""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) +# Relevant indicators +INDICATORS_RELEVANT = [ + "central_death_rate", + "life_expectancy", + "probability_of_death", +] +INDICATORS_RELEVANT_REL = [ + "life_expectancy_fm_diff", + "life_expectancy_fm_ratio", + "central_death_rate_mf_ratio", +] +# Single-age groups to preserve +AGES_SINGLE = [ + 0, + 10, + 15, + 25, + 45, + 65, + 80, +] +AGES_SINGLE = list(map(str, AGES_SINGLE)) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("life_tables") + + # Read table from garden dataset. + tb = ds_garden.read("life_tables") + tb_diff = ds_garden.read("diff_ratios") + + # + # Process data. + # + ## Only keep particular ages + tb = tb.loc[tb["age"].isin(AGES_SINGLE)] + tb_diff = tb_diff.loc[tb_diff["age"].isin(AGES_SINGLE)] + + ## Set index back + tb = tb.format(["country", "year", "sex", "age", "type"]) + tb_diff = tb_diff.format(["country", "year", "age", "type"]) + + ## Only keep subset of columns + tb = tb[INDICATORS_RELEVANT] + tb_diff = tb_diff[INDICATORS_RELEVANT_REL] + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb, tb_diff], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/demography/2024-12-03/phi_gender_le.py b/etl/steps/data/grapher/demography/2024-12-03/phi_gender_le.py new file mode 100644 index 00000000000..04c5c3ceccf --- /dev/null +++ b/etl/steps/data/grapher/demography/2024-12-03/phi_gender_le.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
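The dtype handling in the demography population step a few hunks up (casting back to "Int64" after a masked copy) exists because masked assignment leaves NaN in the unmasked rows, which silently upcasts a numpy integer column to float. A compact demonstration of the problem and the fix:

import pandas as pd

table = pd.DataFrame({"population": [100, 200, 300]})
mask = table["population"] > 150

# Rows outside the mask become NaN, so the new column is float64, not int64.
table.loc[mask, "population_projection"] = table.loc[mask, "population"]

# Casting to pandas' nullable Int64 restores integer semantics alongside <NA>.
table = table.astype({"population_projection": "Int64"})
print(table["population_projection"].tolist())  # [<NA>, 200, 300]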
+paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("phi_gender_le") + + # Read table from garden dataset. + tables = list(ds_garden) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/emissions/2024-11-21/national_contributions.py b/etl/steps/data/grapher/emissions/2024-11-21/national_contributions.py new file mode 100644 index 00000000000..a8bf5f2bebf --- /dev/null +++ b/etl/steps/data/grapher/emissions/2024-11-21/national_contributions.py @@ -0,0 +1,22 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its main table. + ds_garden = paths.load_dataset("national_contributions") + tb_garden = ds_garden["national_contributions"] + + # + # Save outputs. + # + # Create a new grapher dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb_garden], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/grapher/energy/2024-11-01/photovoltaic_cost_and_capacity.py b/etl/steps/data/grapher/energy/2024-11-01/photovoltaic_cost_and_capacity.py new file mode 100644 index 00000000000..15d3fb825d1 --- /dev/null +++ b/etl/steps/data/grapher/energy/2024-11-01/photovoltaic_cost_and_capacity.py @@ -0,0 +1,21 @@ +"""Load garden dataset of photovoltaic cost and capacity and create a grapher dataset. + +""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # Load table from garden dataset. + ds_garden = paths.load_dataset("photovoltaic_cost_and_capacity") + tb_garden = ds_garden["photovoltaic_cost_and_capacity"] + + # Remove unnecessary columns. + tb_garden = tb_garden.drop(columns=["cost_source", "cumulative_capacity_source"], errors="raise") + + # Create a new grapher dataset. + dataset = create_dataset(dest_dir=dest_dir, tables=[tb_garden], check_variables_metadata=True) + dataset.save() diff --git a/etl/steps/data/grapher/energy/2024-11-15/photovoltaic_cost_and_capacity.py b/etl/steps/data/grapher/energy/2024-11-15/photovoltaic_cost_and_capacity.py new file mode 100644 index 00000000000..15d3fb825d1 --- /dev/null +++ b/etl/steps/data/grapher/energy/2024-11-15/photovoltaic_cost_and_capacity.py @@ -0,0 +1,21 @@ +"""Load garden dataset of photovoltaic cost and capacity and create a grapher dataset. + +""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # Load table from garden dataset. + ds_garden = paths.load_dataset("photovoltaic_cost_and_capacity") + tb_garden = ds_garden["photovoltaic_cost_and_capacity"] + + # Remove unnecessary columns. + tb_garden = tb_garden.drop(columns=["cost_source", "cumulative_capacity_source"], errors="raise") + + # Create a new grapher dataset. 
+ dataset = create_dataset(dest_dir=dest_dir, tables=[tb_garden], check_variables_metadata=True) + dataset.save() diff --git a/etl/steps/data/grapher/energy/2024-11-20/energy_prices.py b/etl/steps/data/grapher/energy/2024-11-20/energy_prices.py new file mode 100644 index 00000000000..237959d850e --- /dev/null +++ b/etl/steps/data/grapher/energy/2024-11-20/energy_prices.py @@ -0,0 +1,24 @@ +"""Load garden dataset and create a grapher dataset. + +""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # Load tables from garden dataset. + ds_garden = paths.load_dataset("energy_prices") + tb_annual = ds_garden.read("energy_prices_annual", reset_index=False) + tb_monthly = ds_garden.read("energy_prices_monthly", reset_index=False) + + # Create a new grapher dataset. + dataset = create_dataset( + dest_dir=dest_dir, + tables=[tb_annual, tb_monthly], + check_variables_metadata=True, + default_metadata=ds_garden.metadata, + ) + dataset.save() diff --git a/etl/steps/data/grapher/ess/2023-08-02/ess_trust.py b/etl/steps/data/grapher/ess/2023-08-02/ess_trust.py deleted file mode 100644 index fbcf6019387..00000000000 --- a/etl/steps/data/grapher/ess/2023-08-02/ess_trust.py +++ /dev/null @@ -1,36 +0,0 @@ -"""Load a garden dataset and create a grapher dataset.""" - - -from etl.helpers import PathFinder, create_dataset, grapher_checks - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load garden dataset. - ds_garden = paths.load_dataset("ess_trust") - - # Read table from garden dataset. - tb = ds_garden["ess_trust"] - - # - # Process data. - # - - # - # Save outputs. - # - # Create a new grapher dataset with the same metadata as the garden dataset. - ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) - - # - # Checks. - # - grapher_checks(ds_grapher) - - # Save changes in the new grapher dataset. - ds_grapher.save() diff --git a/etl/steps/data/grapher/fasttrack/2022-11-01/lighting_efficiency_uk.meta.yml b/etl/steps/data/grapher/fasttrack/2022-11-01/lighting_efficiency_uk.meta.yml index af141edea71..eff3194f072 100644 --- a/etl/steps/data/grapher/fasttrack/2022-11-01/lighting_efficiency_uk.meta.yml +++ b/etl/steps/data/grapher/fasttrack/2022-11-01/lighting_efficiency_uk.meta.yml @@ -1,40 +1,18 @@ dataset: - namespace: fasttrack - version: '2022-11-01' - short_name: lighting_efficiency_uk title: Lighting effiency and shares in the UK description: '' - sources: - - name: 'Fouquet & Pearson (2006). Seven centuries of energy services: The price - and use of light in the United Kingdom (1300-2000).' - published_by: 'Fouquet, R., & Pearson, P. J. (2006). Seven centuries of energy - services: The price and use of light in the United Kingdom (1300-2000). The - energy journal, 27(1).' - url: https://www.jstor.org/stable/23296980 + licenses: + - {} tables: lighting_efficiency_uk: variables: share_of_lighting_uk: title: share_of_lighting_uk - short_unit: '%' unit: '%' + short_unit: '%' description: The share of lighting in the UK that was provided by each source. - sources: - - name: 'Fouquet & Pearson (2006). Seven centuries of energy services: The - price and use of light in the United Kingdom (1300-2000).' - published_by: 'Fouquet, R., & Pearson, P. J. (2006). Seven centuries of - energy services: The price and use of light in the United Kingdom (1300-2000). 
- The energy journal, 27(1).' - url: https://www.jstor.org/stable/23296980 efficiency_lighting_uk: title: efficiency_lighting_uk unit: lumen-hours per kWh - description: The efficiency of lighting measures the output of light per unit - of energy. It's measured in lumen-hours per kilowatt-hour (kWh). - sources: - - name: 'Fouquet & Pearson (2006). Seven centuries of energy services: The - price and use of light in the United Kingdom (1300-2000).' - published_by: 'Fouquet, R., & Pearson, P. J. (2006). Seven centuries of - energy services: The price and use of light in the United Kingdom (1300-2000). - The energy journal, 27(1).' - url: https://www.jstor.org/stable/23296980 + description: |- + The efficiency of lighting measures the output of light per unit of energy. It's measured in lumen-hours per kilowatt-hour (kWh). diff --git a/etl/steps/data/grapher/fasttrack/2022-11-01/lighting_efficiency_uk.py b/etl/steps/data/grapher/fasttrack/2022-11-01/lighting_efficiency_uk.py index db8277a1af8..019afbe6fc1 100644 --- a/etl/steps/data/grapher/fasttrack/2022-11-01/lighting_efficiency_uk.py +++ b/etl/steps/data/grapher/fasttrack/2022-11-01/lighting_efficiency_uk.py @@ -1,21 +1,40 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot -N = PathFinder(__file__) +paths = PathFinder(__file__) def run(dest_dir: str) -> None: # load snapshot - data = pd.read_csv(Snapshot("fasttrack/2022-11-01/lighting_efficiency_uk.csv").path) + snap = Snapshot("fasttrack/2022-11-01/lighting_efficiency_uk.csv") - # create empty dataframe and table - ds = catalog.Dataset.create_empty(dest_dir) - tb = catalog.Table(data, short_name=N.short_name) + # load data + tb = snap.read_csv() + + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save - ds.add(tb) - ds.update_metadata(N.metadata_path) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/2023-03-27/global_warming_contributions.meta.yml b/etl/steps/data/grapher/fasttrack/2023-03-27/global_warming_contributions.meta.yml new file mode 100644 index 00000000000..d937637adbd --- /dev/null +++ b/etl/steps/data/grapher/fasttrack/2023-03-27/global_warming_contributions.meta.yml @@ -0,0 +1,255 @@ +dataset: + title: Global warming contributions + description: |- + Jones et al. (2023) quantify national and regional contributions to the increase of global mean surface temperature over the last few centuries. As they detail: the "dataset describing the global warming response to national emissions CO2, CH4 and N2O from fossil and land use sources during 1851-2021. 
+ + National CO2 emissions data are collated from the Global Carbon Project (Andrew and Peters, 2022; Friedlingstein et al., 2022). + + National CH4 and N2O emissions data are collated from PRIMAP-hist (HISTTP) (Gütschow et al., 2022). + + We construct a time series of cumulative CO2-equivalent emissions for each country, gas, and emissions source (fossil or land use). Emissions of CH4 and N2O emissions are related to cumulative CO2-equivalent emissions using the Global Warming Potential (GWP*) approach, with best-estimates of the coefficients taken from the IPCC AR6 (Forster et al., 2021). + + Warming in response to cumulative CO2-equivalent emissions is estimated using the transient climate response to cumulative carbon emissions (TCRE) approach, with best-estimate value of TCRE taken from the IPCC AR6 (Forster et al., 2021, Canadell et al., 2021). 'Warming' is specifically the change in global mean surface temperature (GMST)." + licenses: + - {} +tables: + global_warming_contributions: + variables: + annual_fossil_co2: + title: Annual fossil CO2 emissions + unit: tonnes + short_unit: t + annual_land_co2: + title: Annual CO2 emissions from agriculture and land use + unit: tonnes + short_unit: t + annual_co2: + title: Annual CO2 emissions + unit: tonnes + short_unit: t + annual_fossil_ch4: + title: annual_fossil_ch4 + unit: tonnes + short_unit: t + annual_land_ch4: + title: annual_land_ch4 + unit: tonnes + short_unit: t + annual_ch4: + title: annual_ch4 + unit: tonnes + short_unit: t + annual_fossil_n2o: + title: annual_fossil_n2o + unit: tonnes + short_unit: t + annual_land_n2o: + title: annual_land_n2o + unit: tonnes + short_unit: t + annual_n2o: + title: annual_n2o + unit: tonnes + short_unit: t + annual_fossil_ch4_co2eq: + title: Annual methane emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description: |- + Methane emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC conversion factors. Jones et al. (2023) give methane emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 29.8 for fossil sources, and 27.2 for agricultural and land use sources (as per the IPCC AR6 report). + annual_land_ch4_co2eq: + title: Annual methane emissions from agriculture and land use + unit: tonnes + short_unit: t + description: |- + Methane emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC conversion factors. Jones et al. (2023) give methane emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 29.8 for fossil sources, and 27.2 for agricultural and land use sources (as per the IPCC AR6 report). + annual_ch4_co2eq: + title: Annual methane emissions + unit: tonnes + short_unit: t + description: |- + Methane emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC conversion factors. Jones et al. (2023) give methane emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 29.8 for fossil sources, and 27.2 for agricultural and land use sources (as per the IPCC AR6 report). 
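The descriptions above repeat one conversion rule: CH4 and N2O tonnes become CO2-equivalents via the IPCC AR6 GWP100 factors (29.8 for fossil methane, 27.2 for agricultural and land-use methane, 273 for nitrous oxide). As arithmetic, a sketch of the described calculation (our helper, not the producer's code):

# GWP100 factors quoted in the descriptions above (IPCC AR6).
GWP_CH4_FOSSIL = 29.8
GWP_CH4_LAND = 27.2
GWP_N2O = 273.0


def co2eq_tonnes(ch4_fossil_t: float, ch4_land_t: float, n2o_t: float) -> float:
    # Convert each gas to CO2-equivalents and sum.
    return ch4_fossil_t * GWP_CH4_FOSSIL + ch4_land_t * GWP_CH4_LAND + n2o_t * GWP_N2O


print(co2eq_tonnes(1.0, 0.0, 0.0))  # 1 t of fossil methane is 29.8 t CO2-eq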
+ annual_fossil_n2o_co2eq: + title: Annual nitrous oxide emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description: |- + Nitrous oxide emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 (as per the IPCC AR6 report). + annual_land_n2o_co2eq: + title: Annual nitrous oxide emissions from agriculture and land use + unit: tonnes + short_unit: t + description: |- + Nitrous oxide emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 (as per the IPCC AR6 report). + annual_n2o_co2eq: + title: Annual nitrous oxide emissions + unit: tonnes + short_unit: t + description: |- + Nitrous oxide emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 (as per the IPCC AR6 report). + annual_ghg_co2eq: + title: Annual greenhouse gas emissions + unit: tonnes + short_unit: t + description: |- + Greenhouse gas emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give methane and nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 for nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane from agricultural and land use sources (as per the IPCC AR6 report). + annual_fossil_co2eq: + title: Annual greenhouse gas emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description: |- + Greenhouse gas emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give methane and nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 for nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane from agricultural and land use sources (as per the IPCC AR6 report). + annual_land_co2eq: + title: Annual greenhouse gas emissions from agriculture and land use + unit: tonnes + short_unit: t + description: |- + Greenhouse gas emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give methane and nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 for nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane from agricultural and land use sources (as per the IPCC AR6 report). 
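The share_global_* variables that follow express each country's emissions as a percentage of the global total. A minimal sketch of that division, assuming a World row serves as the denominator (toy numbers, not the real Jones et al. table):

import pandas as pd

tb = pd.DataFrame({"country": ["A", "B", "World"], "annual_ch4_co2eq": [20.0, 30.0, 50.0]})

# Each country's emissions as a percentage of the World row.
world_total = tb.loc[tb["country"] == "World", "annual_ch4_co2eq"].iloc[0]
tb["share_global_ch4"] = 100 * tb["annual_ch4_co2eq"] / world_total  # [40.0, 60.0, 100.0]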
+ share_global_ch4: + title: Share of global methane emissions + unit: '%' + short_unit: '%' + description: |- + Methane emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC conversion factors. Jones et al. (2023) give methane emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 29.8 for fossil sources, and 27.2 for agricultural and land use sources (as per the IPCC AR6 report). + share_global_n2o: + title: Share of global nitrous oxide emissions + unit: '%' + short_unit: '%' + description: |- + Nitrous oxide emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 (as per the IPCC AR6 report). + share_global_ghg: + title: Share of global greenhouse gas emissions + unit: '%' + short_unit: '%' + description: |- + Greenhouse gas emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give methane and nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 for nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane from agricultural and land use sources (as per the IPCC AR6 report). + cumulative_fossil_co2: + title: cumulative_fossil_co2 + unit: tonnes + short_unit: t + cumulative_land_co2: + title: cumulative_land_co2 + unit: tonnes + short_unit: t + cumulative_co2: + title: cumulative_co2 + unit: tonnes + short_unit: t + cumulative_fossil_ch4: + title: cumulative_fossil_ch4 + unit: tonnes + short_unit: t + cumulative_land_ch4: + title: cumulative_land_ch4 + unit: tonnes + short_unit: t + cumulative_ch4: + title: cumulative_ch4 + unit: tonnes + short_unit: t + cumulative_fossil_n2o: + title: cumulative_fossil_n2o + unit: tonnes + short_unit: t + cumulative_land_n2o: + title: cumulative_land_n2o + unit: tonnes + short_unit: t + cumulative_n2o: + title: cumulative_n2o + unit: tonnes + short_unit: t + cumulative_fossil_ghg: + title: cumulative_fossil_ghg + unit: tonnes + short_unit: t + cumulative_land_ghg: + title: cumulative_land_ghg + unit: tonnes + short_unit: t + cumulative_ghg: + title: Cumulative greenhouse gas emissions + unit: tonnes + short_unit: t + description: |- + Greenhouse gas emissions are calculated by Our World in Data based on emissions data from Jones et al. (2023) and IPCC AR6 conversion factors. Jones et al. (2023) give methane and nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 for nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane from agricultural and land use sources (as per the IPCC AR6 report). 
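The cumulative_* and temp_* variables defined next chain two steps described in the dataset notes: a running sum of CO2-equivalent emissions per country and source, and a warming estimate proportional to that cumulative total via the TCRE. The sketch below is a hypothetical illustration of that chain on made-up data; the TCRE constant used (about 0.45 °C per 1000 Gt CO2, the AR6 central estimate) is an assumption supplied for the example rather than a value taken from this changeset, and the production step may differ in detail.

```python
import pandas as pd

# Assumed TCRE for illustration only: ~0.45 °C per 1000 GtCO2 (AR6 central estimate),
# i.e. 0.45 / 1e12 °C per tonne of CO2-equivalent. Not a value from this diff.
TCRE_PER_TONNE_CO2EQ = 0.45 / 1e12

df = pd.DataFrame(
    {
        "country": ["A", "A", "B", "B"],
        "year": [2019, 2020, 2019, 2020],
        "annual_ghg_co2eq": [1.0e9, 1.1e9, 2.0e9, 2.2e9],  # tonnes CO2-eq
    }
).sort_values(["country", "year"])

# Running sum of CO2-equivalent emissions per country (cumulative_ghg).
df["cumulative_ghg"] = df.groupby("country")["annual_ghg_co2eq"].cumsum()

# Warming contribution (temp_ghg): change in GMST proportional to cumulative emissions.
df["temp_ghg"] = df["cumulative_ghg"] * TCRE_PER_TONNE_CO2EQ

# Share of the warming contribution within each year (share_global_warming),
# here computed against the sum over the toy sample rather than a true World total.
df["share_global_warming"] = 100 * df["temp_ghg"] / df.groupby("year")["temp_ghg"].transform("sum")
```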
+ temp_fossil_co2: + title: temp_fossil_co2 + unit: °C + short_unit: °C + temp_land_co2: + title: temp_land_co2 + unit: °C + short_unit: °C + temp_co2: + title: Change in global mean surface temperature from CO2 emissions + unit: °C + short_unit: °C + description: |- + This measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide. The warming effects of each gas are calculated based on cumulative CO2-equivalent emissions using the Global Warming Potential (GWP*) approach. + temp_fossil_ch4: + title: temp_fossil_ch4 + unit: °C + short_unit: °C + temp_land_ch4: + title: temp_land_ch4 + unit: °C + short_unit: °C + temp_ch4: + title: Change in global mean surface temperature from methane emissions + unit: °C + short_unit: °C + description: |- + This measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of methane. The warming effects of each gas are calculated based on cumulative CO2-equivalent emissions using the Global Warming Potential (GWP*) approach. + temp_fossil_n2o: + title: temp_fossil_n2o + unit: °C + short_unit: °C + temp_land_n2o: + title: temp_land_n2o + unit: °C + short_unit: °C + temp_n2o: + title: Change in global mean surface temperature from nitrous oxide emissions + unit: °C + short_unit: °C + description: |- + This measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative nitrous oxide emissions. The warming effects of each gas are calculated based on cumulative CO2-equivalent emissions using the Global Warming Potential (GWP*) approach. + temp_fossil_ghg: + title: temp_fossil_ghg + unit: °C + short_unit: °C + temp_land_ghg: + title: temp_land_ghg + unit: °C + short_unit: °C + temp_ghg: + title: Change in global mean surface temperature from greenhouse gas emissions + unit: °C + short_unit: °C + description: |- + This measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide, methane and nitrous oxide. The warming effects of each gas are calculated based on cumulative CO2-equivalent emissions using the Global Warming Potential (GWP*) approach. + share_global_warming: + title: Share of contribution to global warming + unit: '%' + short_unit: '%' + description: |- + This measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide, methane and nitrous oxide. The warming effects of each gas are calculated based on cumulative CO2-equivalent emissions using the Global Warming Potential (GWP*) approach. + annual_ch4_per_capita: + title: Methane emissions per person + unit: tonnes + short_unit: t + description: |- + Methane emissions per person are calculated by Our World in Data based on emissions data from Jones et al. (2023) and population data from HYDE and the UN World Population Prospects. Jones et al. (2023) give methane emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 29.8 for fossil sources, and 27.2 for agricultural and land use sources (as per the IPCC AR6 report). + annual_n2o_per_capita: + title: Nitrous oxide emissions per person + unit: tonnes + short_unit: t + description: |- + Nitrous oxide emissions per person are calculated by Our World in Data based on emissions data from Jones et al. 
(2023) and population data from HYDE and the UN World Population Prospects. Jones et al. (2023) give nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 (as per the IPCC AR6 report). + annual_ghg_co2eq_per_capita: + title: Greenhouse gas emissions per person + unit: tonnes + short_unit: t + description: |- + Greenhouse gas emissions per person are calculated by Our World in Data based on emissions data from Jones et al. (2023) and population data from HYDE and the UN World Population Prospects. Jones et al. (2023) give methane and nitrous oxide emissions in standard metric tonnes per year. We have converted these emissions to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 for nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane from agricultural and land use sources (as per the IPCC AR6 report). diff --git a/etl/steps/data/grapher/fasttrack/2023-03-27/global_warming_contributions.py b/etl/steps/data/grapher/fasttrack/2023-03-27/global_warming_contributions.py new file mode 100644 index 00000000000..cc079edc61d --- /dev/null +++ b/etl/steps/data/grapher/fasttrack/2023-03-27/global_warming_contributions.py @@ -0,0 +1,40 @@ +import pandas as pd + +from etl.helpers import PathFinder, create_dataset, get_metadata_path +from etl.snapshot import Snapshot + +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # load snapshot + snap = Snapshot("fasttrack/2023-03-27/global_warming_contributions.csv") + + # load data + tb = snap.read_csv() + + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) + + # add table, update metadata from *.meta.yml and save + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/2023-04-30/paratz.meta.yml b/etl/steps/data/grapher/fasttrack/2023-04-30/paratz.meta.yml index fa6dbc58bf9..1f55ca2acad 100644 --- a/etl/steps/data/grapher/fasttrack/2023-04-30/paratz.meta.yml +++ b/etl/steps/data/grapher/fasttrack/2023-04-30/paratz.meta.yml @@ -1,21 +1,16 @@ dataset: - sources: - - name: Paratz et al., (2023) - published_by: Heart Rhythm Journal - publication_year: 2023 - date_accessed: 2023-07-13 - url: https://www.heartrhythmjournal.com/article/S1547-5271(23)00027-9/fulltext + title: A systematic review of global autopsy rates in all-cause mortality and young sudden death, Paratz et al (2023) + description: |- + The data for this indicator is taken from: Paratz ED, Rowe SJ, Stub D, Pflaumer A, La Gerche A. A systematic review of global autopsy rates in all-cause mortality and young sudden death. Heart Rhythm. 2023 Apr;20(4):607-613. doi: 10.1016/j.hrthm.2023.01.008. + + The data is collated from a number of published papers and databases. 
The year shown reflects the date given in the database or the year of the publication. For Spain and Australia the data is only representative of a region of each country, Catalonia and Victoria, respectively. + licenses: + - {} tables: paratz: variables: autopsy_rate: title: Autopsy rate - short_unit: '%' unit: '%' + short_unit: '%' description: Autopsy rates reported in all-cause death. - sources: - - name: Paratz et al., (2023) - published_by: Heart Rhythm Journal - publication_year: 2023 - date_accessed: 2023-07-13 - url: https://www.heartrhythmjournal.com/article/S1547-5271(23)00027-9/fulltext diff --git a/etl/steps/data/grapher/fasttrack/2023-04-30/paratz.py b/etl/steps/data/grapher/fasttrack/2023-04-30/paratz.py index 349d0e134fd..114719618eb 100644 --- a/etl/steps/data/grapher/fasttrack/2023-04-30/paratz.py +++ b/etl/steps/data/grapher/fasttrack/2023-04-30/paratz.py @@ -1,10 +1,9 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot -P = PathFinder(__file__) +paths = PathFinder(__file__) def run(dest_dir: str) -> None: @@ -12,11 +11,30 @@ def run(dest_dir: str) -> None: snap = Snapshot("fasttrack/2023-04-30/paratz.csv") # load data - data = pd.read_csv(snap.path) + tb = snap.read_csv() - # create empty dataframe and table - tb = catalog.Table(data, short_name=P.short_name) + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/2023-05-03/apms_2014.meta.yml b/etl/steps/data/grapher/fasttrack/2023-05-03/apms_2014.meta.yml index b4c0e24ead3..21de647880e 100644 --- a/etl/steps/data/grapher/fasttrack/2023-05-03/apms_2014.meta.yml +++ b/etl/steps/data/grapher/fasttrack/2023-05-03/apms_2014.meta.yml @@ -1,24 +1,10 @@ dataset: - namespace: fasttrack - version: '2023-05-03' - short_name: apms_2014 title: Current depression in England by age and gender (APMS, 2014) - description: This is a dataset of the prevalence of current depression in the general - population in England, living in private households. Households were sampled randomly - and individuals were interviewed using the revised Clinical Interview Schedule - (CIS-R), which is a diagnostic structured interview format to determine whether - people had common mental disorders in the past week. In this dataset, presence - of a current episode of major depression was determined. - sources: - - name: Adult Psychiatric Morbidity Survey 2014, England (2016) - published_by: '"McManus S, Bebbington P, Jenkins R, Brugha T. (eds.) (2016) Mental - health and wellbeing in England: Adult Psychiatric Morbidity Survey 2014. 
Leeds: - NHS Digital"' - description: Surveys of individuals in randomly-selected private households - in England - publication_year: 2016 - date_accessed: 2022-12-01 - url: https://www.gov.uk/government/statistics/adult-psychiatric-morbidity-survey-mental-health-and-wellbeing-england-2014 + description: |- + This is a dataset of the prevalence of current depression in the general population in England, living in private households. Households were sampled randomly and individuals were interviewed using the revised Clinical Interview Schedule (CIS-R), which is a diagnostic structured interview format to determine whether people had common mental disorders in the past week. In this dataset, presence of a current episode of major depression was determined. + licenses: + - name: Open Government Licence v3.0 + url: https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/ tables: apms_2014: variables: diff --git a/etl/steps/data/grapher/fasttrack/2023-05-03/apms_2014.py b/etl/steps/data/grapher/fasttrack/2023-05-03/apms_2014.py index 07dbcf3e545..a1bcca99f8d 100644 --- a/etl/steps/data/grapher/fasttrack/2023-05-03/apms_2014.py +++ b/etl/steps/data/grapher/fasttrack/2023-05-03/apms_2014.py @@ -1,19 +1,40 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot -P = PathFinder(__file__) +paths = PathFinder(__file__) def run(dest_dir: str) -> None: # load snapshot - data = pd.read_csv(Snapshot("fasttrack/2023-05-03/apms_2014.csv").path) + snap = Snapshot("fasttrack/2023-05-03/apms_2014.csv") - # create empty dataframe and table - tb = catalog.Table(data, short_name=P.short_name) + # load data + tb = snap.read_csv() + + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb]) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/2023-05-31/cholera.meta.yml b/etl/steps/data/grapher/fasttrack/2023-05-31/cholera.meta.yml index 7b2b37923d5..0be2698a83e 100644 --- a/etl/steps/data/grapher/fasttrack/2023-05-31/cholera.meta.yml +++ b/etl/steps/data/grapher/fasttrack/2023-05-31/cholera.meta.yml @@ -1,7 +1,4 @@ dataset: - namespace: fasttrack - version: '2023-05-31' - short_name: cholera title: Cholera reported cases, deaths and case fatality rate (WHO, 2023) description: |- The data is created by combining multiple WHO Weekly Epidemiological Reports for cholera reported cases, deaths and case fatality rate. 
@@ -17,37 +14,22 @@ dataset: 2020: https://web.archive.org/web/20230326231135/http://apps.who.int/iris/bitstream/handle/10665/345271/WER9637-445-454-eng-fre.pdf?sequence=1&isAllowed=y 2021: https://web.archive.org/web/20230526223955/https://apps.who.int/iris/bitstream/handle/10665/362858/WER9737-453-464-eng-fre.pdf?sequence=1&isAllowed=y - sources: - - name: World Health Organization (2023) - published_by: World Health Organization - date_accessed: 2023-05-31 + licenses: + - {} tables: cholera: variables: cholera_reported_cases: title: Cholera reported cases unit: reported cases - description: Confirmed cholera cases, including those confirmed clinically, - epidemiologically, or by laboratory investigation. - sources: - - name: World Health Organization (2023) - published_by: World Health Organization - date_accessed: 2023-05-31 + description: Confirmed cholera cases, including those confirmed clinically, epidemiologically, or by laboratory investigation. cholera_case_fatality_rate: title: Cholera case fatality rate - short_unit: '%' unit: '%' - description: WHO calculates case fatality rates based on the numbers of cases - and deaths as reported by national authorities (Ministries of Health). - sources: - - name: World Health Organization (2023) - published_by: World Health Organization - date_accessed: 2023-05-31 + short_unit: '%' + description: |- + WHO calculates case fatality rates based on the numbers of cases and deaths as reported by national authorities (Ministries of Health). cholera_deaths: title: Cholera deaths unit: deaths description: Number of deaths from cholera reported to WHO - sources: - - name: World Health Organization (2023) - published_by: World Health Organization - date_accessed: 2023-05-31 diff --git a/etl/steps/data/grapher/fasttrack/2023-05-31/cholera.py b/etl/steps/data/grapher/fasttrack/2023-05-31/cholera.py index 9282b916edd..740dcc7f72e 100644 --- a/etl/steps/data/grapher/fasttrack/2023-05-31/cholera.py +++ b/etl/steps/data/grapher/fasttrack/2023-05-31/cholera.py @@ -1,19 +1,40 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot -P = PathFinder(__file__) +paths = PathFinder(__file__) def run(dest_dir: str) -> None: # load snapshot - data = pd.read_csv(Snapshot("fasttrack/2023-05-31/cholera.csv").path) + snap = Snapshot("fasttrack/2023-05-31/cholera.csv") - # create empty dataframe and table - tb = catalog.Table(data, short_name=P.short_name) + # load data + tb = snap.read_csv() + + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb]) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git 
a/etl/steps/data/grapher/fasttrack/2023-06-16/guinea_worm.meta.yml b/etl/steps/data/grapher/fasttrack/2023-06-16/guinea_worm.meta.yml index fbd897b7e74..fc674303d0e 100644 --- a/etl/steps/data/grapher/fasttrack/2023-06-16/guinea_worm.meta.yml +++ b/etl/steps/data/grapher/fasttrack/2023-06-16/guinea_worm.meta.yml @@ -1,15 +1,8 @@ dataset: - namespace: fasttrack - version: '2023-06-16' - short_name: guinea_worm title: Guinea Worm Cases - Carter Center (2023) description: The number of cases of guinea worm disease worldwide since 1989 - sources: - - name: The Carter Center (2023) - published_by: The Carter Center (2023) - publication_year: 2023 - date_accessed: 2023-06-16 - url: https://www.cartercenter.org/resources/pdfs/news/health_publications/guinea_worm/guinea-worm-cases-by-year-from-1989.pdf + licenses: + - {} tables: guinea_worm: variables: @@ -17,9 +10,3 @@ tables: title: Guinea Worm Cases unit: cases description: The number of human cases of guinea worm disease. - sources: - - name: The Carter Center (2023) - published_by: The Carter Center (2023) - publication_year: 2023 - date_accessed: 2023-06-16 - url: https://www.cartercenter.org/resources/pdfs/news/health_publications/guinea_worm/guinea-worm-cases-by-year-from-1989.pdf diff --git a/etl/steps/data/grapher/fasttrack/2023-06-16/guinea_worm.py b/etl/steps/data/grapher/fasttrack/2023-06-16/guinea_worm.py index 7d0e4f07318..d247a84999a 100644 --- a/etl/steps/data/grapher/fasttrack/2023-06-16/guinea_worm.py +++ b/etl/steps/data/grapher/fasttrack/2023-06-16/guinea_worm.py @@ -1,19 +1,40 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot -P = PathFinder(__file__) +paths = PathFinder(__file__) def run(dest_dir: str) -> None: # load snapshot - data = pd.read_csv(Snapshot("fasttrack/2023-06-16/guinea_worm.csv").path) + snap = Snapshot("fasttrack/2023-06-16/guinea_worm.csv") - # create empty dataframe and table - tb = catalog.Table(data, short_name=P.short_name) + # load data + tb = snap.read_csv() + + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])]) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/2023-06-19/world_population_comparison.meta.yml b/etl/steps/data/grapher/fasttrack/2023-06-19/world_population_comparison.meta.yml index 5802a16bd35..60c67e53309 100644 --- a/etl/steps/data/grapher/fasttrack/2023-06-19/world_population_comparison.meta.yml +++ b/etl/steps/data/grapher/fasttrack/2023-06-19/world_population_comparison.meta.yml @@ -1,7 +1,24 @@ dataset: - sources: - - name: Multiple sources compiled by Our World in Data 
(2019) - published_by: Multiple sources compiled by Our World in Data (2019) + title: Historical world population comparison (various sources) + description: |- + Among others, these are the original sources: + + McEvedy, Colin and Richard Jones, 1978, “Atlas of World Population History,” Facts on File, New York, pp. 342-351. + + Biraben, Jean-Noel, 1980, An Essay Concerning Mankind’s Evolution, Population, Selected Papers, December, table 2. + + Durand, John D., 1974, “Historical Estimates of World Population: An Evaluation,” University of Pennsylvania, Population Center, Analytical and Technical Reports, Number 10, table 2. + + Haub, Carl, 1995, “How Many People Have Ever Lived on Earth?” Population Today, February, p. 5. + + Thomlinson, Ralph, 1975, “Demographic Problems, Controversy Over Population Control,” Second Edition, Table 1. + + United Nations, 1999, The World at Six Billion, Table 1, “World Population From” Year 0 to Stabilization, p. 5. + U.S. Census Bureau (USCB), 2012, Total Midyear Population for the World: 1950-2050. + + Michael Kremer (1993) “Population Growth and Technological Change: One Million B.C. to 1990”, Quarterly Journal of Economics, August 1993, pp. 681-716. + licenses: + - {} tables: world_population_comparison: variables: diff --git a/etl/steps/data/grapher/fasttrack/2023-06-19/world_population_comparison.py b/etl/steps/data/grapher/fasttrack/2023-06-19/world_population_comparison.py index 83d3987374f..de557839e00 100644 --- a/etl/steps/data/grapher/fasttrack/2023-06-19/world_population_comparison.py +++ b/etl/steps/data/grapher/fasttrack/2023-06-19/world_population_comparison.py @@ -1,10 +1,9 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot -P = PathFinder(__file__) +paths = PathFinder(__file__) def run(dest_dir: str) -> None: @@ -12,11 +11,30 @@ def run(dest_dir: str) -> None: snap = Snapshot("fasttrack/2023-06-19/world_population_comparison.csv") # load data - data = pd.read_csv(snap.path) + tb = snap.read_csv() - # create empty dataframe and table - tb = catalog.Table(data, short_name=P.short_name) + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/2023-08-07/pain_hours_days_hen_systems.meta.yml b/etl/steps/data/grapher/fasttrack/2023-08-07/pain_hours_days_hen_systems.meta.yml index 69e507508a6..42c4bfa247b 100644 --- a/etl/steps/data/grapher/fasttrack/2023-08-07/pain_hours_days_hen_systems.meta.yml +++ b/etl/steps/data/grapher/fasttrack/2023-08-07/pain_hours_days_hen_systems.meta.yml @@ -1,119 +1,68 @@ dataset: - sources: - - name: Welfare Footprint based on Schuck-Paim
and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ + title: Pain hours and days of hen systems (Welfare Footprint) + description: '' + licenses: + - {} tables: pain_hours_days_hen_systems: variables: total_hours_in_pain: title: total_hours_in_pain - short_unit: hours unit: hours - description: The total number of hours an average hen will spend in pain, regardless of the intensity. + short_unit: hours display: numDecimalPlaces: 0.0 - sources: - - name: Welfare Footprint based on Schuck-Paim and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ + description: The total number of hours an average hen will spend in pain, regardless of the intensity. excrutiating_hours_in_pain: title: excrutiating_hours_in_pain - short_unit: hours unit: hours - description: The number of hours an average hen will spend in excrutiating pain. + short_unit: hours display: numDecimalPlaces: 2.0 - sources: - - name: Welfare Footprint based on Schuck-Paim and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ + description: The number of hours an average hen will spend in excruciating pain. disabling_pain_in_hours: title: disabling_pain_in_hours - short_unit: hours unit: hours - description: The number of hours an average hen will spend in disabling pain. + short_unit: hours display: numDecimalPlaces: 0.0 - sources: - - name: Welfare Footprint based on Schuck-Paim and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ + description: The number of hours an average hen will spend in disabling pain. hurtful_pain_in_hours: title: hurtful_pain_in_hours - short_unit: hours unit: hours - description: The number of hours an average hen will spend in hurtful pain. + short_unit: hours display: numDecimalPlaces: 0.0 - sources: - - name: Welfare Footprint based on Schuck-Paim and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ + description: The number of hours an average hen will spend in hurtful pain. annoying_pain_in_hours: title: annoying_pain_in_hours - short_unit: hours unit: hours - description: The number of hours an average hen will spend in annoying pain. + short_unit: hours display: numDecimalPlaces: 0.0 - sources: - - name: Welfare Footprint based on Schuck-Paim and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ + description: The number of hours an average hen will spend in annoying pain. total_days_in_pain: title: total_days_in_pain - short_unit: days unit: days + short_unit: days description: The total number of days an average hen will spend in pain, regardless of the intensity. - sources: - - name: Welfare Footprint based on Schuck-Paim and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ excrutating_pain_in_days: title: excrutating_pain_in_days - short_unit: days unit: days + short_unit: days description: The number of days an average hen will spend in excruciating pain.
- sources: - - name: Welfare Footprint based on Schuck-Paim and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ disabling_pain_in_days: title: disabling_pain_in_days - short_unit: days unit: days + short_unit: days description: The number of days an average hen will spend in disabling pain. - sources: - - name: Welfare Footprint based on Schuck-Paim and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ hurtful_pain_in_days: title: hurtful_pain_in_days - short_unit: days unit: days + short_unit: days description: The number of days an average hen will spend in hurtful pain. - sources: - - name: Welfare Footprint based on Schuck-Paim and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ annoying_pain_in_days: title: annoying_pain_in_days - short_unit: days unit: days + short_unit: days description: The number of days an average hen will spend in annoying pain. - sources: - - name: Welfare Footprint based on Schuck-Paim and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ diff --git a/etl/steps/data/grapher/fasttrack/2023-08-07/pain_hours_days_hen_systems.py b/etl/steps/data/grapher/fasttrack/2023-08-07/pain_hours_days_hen_systems.py index a8b5904420f..7f2af7320fd 100644 --- a/etl/steps/data/grapher/fasttrack/2023-08-07/pain_hours_days_hen_systems.py +++ b/etl/steps/data/grapher/fasttrack/2023-08-07/pain_hours_days_hen_systems.py @@ -1,10 +1,9 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot -P = PathFinder(__file__) +paths = PathFinder(__file__) def run(dest_dir: str) -> None: @@ -12,11 +11,30 @@ def run(dest_dir: str) -> None: snap = Snapshot("fasttrack/2023-08-07/pain_hours_days_hen_systems.csv") # load data - data = pd.read_csv(snap.path) + tb = snap.read_csv() - # create empty dataframe and table - tb = catalog.Table(data, short_name=P.short_name) + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/2023-08-21/survey_livestock_oklahoma.meta.yml b/etl/steps/data/grapher/fasttrack/2023-08-21/survey_livestock_oklahoma.meta.yml index f9e0daeeaec..e24c3af8a4a 100644 --- 
a/etl/steps/data/grapher/fasttrack/2023-08-21/survey_livestock_oklahoma.meta.yml +++ b/etl/steps/data/grapher/fasttrack/2023-08-21/survey_livestock_oklahoma.meta.yml @@ -2,54 +2,35 @@ dataset: title: Survey attitudes to livestock farming (Oklahoma University) description: '' licenses: - - {} - sources: - - name: Food Demand Survey, Oklahoma State University - url: https://web.archive.org/web/20190806000018/http://agecon.okstate.edu/files/january%202018.pdf - publication_year: '2018' - published_by: Oklahoma State University, Department of Agricultural Economics + - {} tables: survey_livestock_oklahoma: variables: strongly_agree: - title: strongly_agree + title: Strongly agree unit: '' short_unit: '%' - display: - name: Strongly agree agree: - title: agree + title: Agree unit: '' short_unit: '%' - display: - name: Agree somewhat_agree: - title: somewhat_agree + title: Somewhat agree unit: '' short_unit: '%' - display: - name: Somewhat agree no_opinion: - title: no_opinion + title: No opinion unit: '' short_unit: '%' - display: - name: No opinion somewhat_disagree: - title: somewhat_disagree + title: Somewhat disagree unit: '' short_unit: '%' - display: - name: Somewhat disagree disagree: - title: disagree + title: Disagree unit: '' short_unit: '%' - display: - name: Disagree strongly_disagree: - title: strongly_disagree + title: Strongly disagree unit: '' short_unit: '%' - display: - name: Strongly disagree diff --git a/etl/steps/data/grapher/fasttrack/2023-08-21/survey_livestock_oklahoma.py b/etl/steps/data/grapher/fasttrack/2023-08-21/survey_livestock_oklahoma.py index a958614e9b8..1723efe0e28 100644 --- a/etl/steps/data/grapher/fasttrack/2023-08-21/survey_livestock_oklahoma.py +++ b/etl/steps/data/grapher/fasttrack/2023-08-21/survey_livestock_oklahoma.py @@ -1,10 +1,9 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot -P = PathFinder(__file__) +paths = PathFinder(__file__) def run(dest_dir: str) -> None: @@ -12,11 +11,30 @@ def run(dest_dir: str) -> None: snap = Snapshot("fasttrack/2023-08-21/survey_livestock_oklahoma.csv") # load data - data = pd.read_csv(snap.path) + tb = snap.read_csv() - # create empty dataframe and table - tb = catalog.Table(data, short_name=P.short_name) + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/2023-09-29/un_space_objects.meta.yml b/etl/steps/data/grapher/fasttrack/2023-09-29/un_space_objects.meta.yml index 8fe8448f45e..fef74cf4a55 100644 --- 
a/etl/steps/data/grapher/fasttrack/2023-09-29/un_space_objects.meta.yml b/etl/steps/data/grapher/fasttrack/2023-09-29/un_space_objects.meta.yml @@ -1,34 +1,17 @@ dataset: title: Online Index of Objects Launched into Outer Space (UN, 2023-09-29) - description: >- - This data is compiled from the Online Index of Objects Launched into Outer Space, maintained by the United Nations Office - for Outer Space Affairs. + description: |- + This data is compiled from the Online Index of Objects Launched into Outer Space, maintained by the United Nations Office for Outer Space Affairs. + Since 1962, the United Nations has maintained a Register of Objects Launched into Outer Space. Originally established as a mechanism to aid the United Nations Committee on the Peaceful Uses of Outer Space in its discussions on the political, legal, and technical issues concerning outer space, the evolution of international space law resulted in space object registration becoming a means of identifying which States bear international responsibility and liability for space objects. - Since 1962, the United Nations has maintained a Register of Objects Launched into Outer Space. Originally established - as a mechanism to aid the United Nations Committee on the Peaceful Uses of Outer Space in its discussions on the political, - legal, and technical issues concerning outer space, the evolution of international space law resulted in space object - registration becoming a means of identifying which States' bear international responsibility and liability for space objects. - - - The source indicates that around 87% of all satellites, probes, landers, crewed spacecraft, and space station flight elements - launched into Earth orbit or beyond have been registered with the Secretary-General. - + The source indicates that around 87% of all satellites, probes, landers, crewed spacecraft, and space station flight elements launched into Earth orbit or beyond have been registered with the Secretary-General. In the data shown on our charts: - - when an object is launched by a country on behalf of another one, it is attributed to the latter; - - - when a launch is made jointly by several countries, it is recorded in each of these countries' time series, but only - once in the 'World' series. + - when a launch is made jointly by several countries, it is recorded in each of these countries' time series, but only once in the 'World' series.
licenses: - {} - sources: - - name: United Nations Office for Outer Space Affairs, Online Index of Objects Launched into Outer Space (2023) - url: https://www.unoosa.org/oosa/osoindex/search-ng.jspx - date_accessed: '2023-09-29' - publication_year: '2023' - published_by: Online Index of Objects Launched into Outer Space (2023) tables: un_space_objects: variables: diff --git a/etl/steps/data/grapher/fasttrack/2023-09-29/un_space_objects.py b/etl/steps/data/grapher/fasttrack/2023-09-29/un_space_objects.py index 908f5a6e699..52ea7d8ffb2 100644 --- a/etl/steps/data/grapher/fasttrack/2023-09-29/un_space_objects.py +++ b/etl/steps/data/grapher/fasttrack/2023-09-29/un_space_objects.py @@ -1,3 +1,5 @@ +import pandas as pd + from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot @@ -11,8 +13,20 @@ def run(dest_dir: str) -> None: # load data tb = snap.read_csv() + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) + # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) # override metadata if necessary meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") @@ -20,3 +34,7 @@ def run(dest_dir: str) -> None: ds.update_metadata(meta_path) ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/2023-10-05/great_pacific_garbage_lebreton.meta.yml b/etl/steps/data/grapher/fasttrack/2023-10-05/great_pacific_garbage_lebreton.meta.yml index 3b6733fc4fb..c46186cfae7 100644 --- a/etl/steps/data/grapher/fasttrack/2023-10-05/great_pacific_garbage_lebreton.meta.yml +++ b/etl/steps/data/grapher/fasttrack/2023-10-05/great_pacific_garbage_lebreton.meta.yml @@ -3,27 +3,21 @@ dataset: description: '' licenses: - {} - sources: - - name: Plastics in Great Pacific Garbage Patch (Lebreton et al. 2022) - url: https://www.nature.com/articles/s41598-022-16529-0 - publication_year: '2022' - published_by: Lebreton et al. (2022). Industrialised fishing nations largely contribute to floating plastic pollution - in the North Pacific subtropical gyre. Nature Scientific Reports. tables: great_pacific_garbage_lebreton: variables: share_number_plastic_items: title: share_number_plastic_items - description: The share of hard plastic items, larger than 5 centimetres, found in the Great Pacific Garbage Patch. unit: '%' short_unit: '%' display: numDecimalPlaces: 0 + description: The share of hard plastic items, larger than 5 centimetres, found in the Great Pacific Garbage Patch. share_mass_plastic_items: title: share_mass_plastic_items - description: The share of the mass of hard plastic items, larger than 5 centimetres, found in the Great Pacific Garbage - Patch. unit: '%' short_unit: '%' display: numDecimalPlaces: 2 + description: The share of the mass of hard plastic items, larger than 5 centimetres, found in the Great Pacific Garbage + Patch. 
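Every regenerated fasttrack step in this changeset now shares the same boilerplate: columns arriving with a dim_ prefix are renamed into plain dimension columns, and the uses_dates helper decides whether the table should be indexed by (country, year) or (country, date). A small standalone demonstration of that branching logic, on made-up tables, might look like this:

```python
import pandas as pd


def uses_dates(s: pd.Series) -> bool:
    # Same check as in the steps above: True only if every value parses as YYYY-MM-DD.
    return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all()


# A yearly table: plain years do not parse as full dates, so the index stays (country, year).
yearly = pd.DataFrame({"country": ["France"], "year": [2020], "value": [1.0]})
assert not uses_dates(yearly["year"])

# A daily table: every value parses as a date, so the step renames year -> date.
daily = pd.DataFrame({"country": ["France"], "year": ["2020-01-31"], "value": [1.0]})
assert uses_dates(daily["year"])

# Dimension columns arrive with a dim_ prefix and are stripped before formatting,
# e.g. dim_sex -> sex, which then joins country and year in the index.
tb = pd.DataFrame({"country": ["France"], "year": [2020], "dim_sex": ["female"], "value": [1.0]})
dims = [c for c in tb.columns if c.startswith("dim_")]
tb = tb.rename(columns={d: d[4:] for d in dims})
```

Duplicating this generated boilerplate keeps each step self-contained, at the cost of the repetition visible across the diffs above.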
diff --git a/etl/steps/data/grapher/fasttrack/2023-10-05/great_pacific_garbage_lebreton.py b/etl/steps/data/grapher/fasttrack/2023-10-05/great_pacific_garbage_lebreton.py index d3d980cab94..fe4e2084af2 100644 --- a/etl/steps/data/grapher/fasttrack/2023-10-05/great_pacific_garbage_lebreton.py +++ b/etl/steps/data/grapher/fasttrack/2023-10-05/great_pacific_garbage_lebreton.py @@ -1,4 +1,6 @@ -from etl.helpers import PathFinder, create_dataset +import pandas as pd + +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot paths = PathFinder(__file__) @@ -11,6 +13,28 @@ def run(dest_dir: str) -> None: # load data tb = snap.read_csv() + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) + # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/latest/antimicrobial_usage_livestock.meta.yml b/etl/steps/data/grapher/fasttrack/latest/antimicrobial_usage_livestock.meta.yml index 4651cb6da5d..b10007cd0eb 100644 --- a/etl/steps/data/grapher/fasttrack/latest/antimicrobial_usage_livestock.meta.yml +++ b/etl/steps/data/grapher/fasttrack/latest/antimicrobial_usage_livestock.meta.yml @@ -1,11 +1,9 @@ dataset: - sources: - - name: Mulchandani et al. (2023) - published_by: 'Mulchandani, R., Wang, Y., Gilbert, M., & Van Boeckel, T. P. (2023). Global trends in antimicrobial use - in food-producing animals: 2020 to 2030. PLOS Global Public Health, 3(2), e0001305. https://doi.org/10.1371/journal.pgph.0001305' - publication_year: 2023 - date_accessed: 2023-07-25 - url: https://journals.plos.org/globalpublichealth/article?id=10.1371/journal.pgph.0001305 + title: Antimicrobial usage in livestock + description: |- + This dataset estimates the usage of antimicrobials in livestock (cattle, sheep, chicken, and pigs) by country. Data on antimicrobials comes from government reports, surveillance systems and national surveys. In addition, the authors estimate the biomass of livestock in the country, to adjust for differences in antimicrobial usage by animal size. Biomass data comes from the Food and Agriculture Organization (FAO). 'The PCU represents the total number of animals in a country (alive or slaughtered), multiplied by the average weight of the animal at the time of treatment. Therefore, the PCU is a standardization metric that accounts for differences in animal weight, and number of production cycles per year between countries.' Therefore, mg/PCU refers to the usage of antimicrobials per animal population-corrected unit. 
+ licenses: + - {} tables: antimicrobial_usage_livestock: variables: diff --git a/etl/steps/data/grapher/fasttrack/latest/antimicrobial_usage_livestock.py b/etl/steps/data/grapher/fasttrack/latest/antimicrobial_usage_livestock.py index 5d9877bfb65..390bab01951 100644 --- a/etl/steps/data/grapher/fasttrack/latest/antimicrobial_usage_livestock.py +++ b/etl/steps/data/grapher/fasttrack/latest/antimicrobial_usage_livestock.py @@ -1,10 +1,9 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot -P = PathFinder(__file__) +paths = PathFinder(__file__) def run(dest_dir: str) -> None: @@ -12,11 +11,30 @@ def run(dest_dir: str) -> None: snap = Snapshot("fasttrack/latest/antimicrobial_usage_livestock.csv") # load data - data = pd.read_csv(snap.path) + tb = snap.read_csv() - # create empty dataframe and table - tb = catalog.Table(data, short_name=P.short_name) + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/latest/cumulative_lives_saved_vaccination_shattock.py b/etl/steps/data/grapher/fasttrack/latest/cumulative_lives_saved_vaccination_shattock.py index d3b822d1f10..ed50f459b1a 100644 --- a/etl/steps/data/grapher/fasttrack/latest/cumulative_lives_saved_vaccination_shattock.py +++ b/etl/steps/data/grapher/fasttrack/latest/cumulative_lives_saved_vaccination_shattock.py @@ -1,3 +1,5 @@ +import pandas as pd + from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot @@ -11,8 +13,20 @@ def run(dest_dir: str) -> None: # load data tb = snap.read_csv() + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) + # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) # override metadata if necessary meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") @@ -20,3 +34,7 @@ def run(dest_dir: str) -> None: ds.update_metadata(meta_path) ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git 
a/etl/steps/data/grapher/fasttrack/latest/democracy_freedom_house.meta.yml b/etl/steps/data/grapher/fasttrack/latest/democracy_freedom_house.meta.yml index 046439e36eb..6e9511619ea 100644 --- a/etl/steps/data/grapher/fasttrack/latest/democracy_freedom_house.meta.yml +++ b/etl/steps/data/grapher/fasttrack/latest/democracy_freedom_house.meta.yml @@ -1,171 +1,141 @@ dataset: title: Democracy - Freedom House (2023) - description: >- + description: |- This dataset provides information on political regimes, using data from Freedom House's Freedom in the World (2023). - You can read a description of the data in this post: https://ourworldindata.org/democracies-measurement - You can download the code and complete dataset, including supplementary variables, from GitHub: https://github.com/owid/notebooks/tree/main/BastianHerre/democracy licenses: - - {} - sources: - - name: Freedom House (2023) - url: https://freedomhouse.org/report/freedom-world - date_accessed: '2023-09-05' - published_by: Freedom House (2023). Freedom in the World. + - {} tables: democracy_freedom_house: variables: regime_fh: title: regime_fh - description: >- - The variable identifies the political regime of a country using the classification by Freedom House's Freedom in - the World. It distinguishes between free countries (score 2), partly free countries (score 1), and not free countries - (score 0). - + unit: '' + display: + name: Political regime + description: |- + The variable identifies the political regime of a country using the classification by Freedom House's Freedom in the World. It distinguishes between free countries (score 2), partly free countries (score 1), and not free countries (score 0). It matches Freedom House's variable scorest. - - Free countries are understood as countries in which citizens have many political rights (free and fair elections, - political pluralism and participation, functioning government) and civil liberties (freedoms of expression and association, - rule of law, personal autonomy). - + Free countries are understood as countries in which citizens have many political rights (free and fair elections, political pluralism and participation, functioning government) and civil liberties (freedoms of expression and association, rule of law, personal autonomy). Partly free countries are countries in which citizens have some political rights and civil liberties. - Not free countries are countries in which citizens have few political rights and civil liberties. - unit: '' - display: - name: Political regime polrights_fh: title: Political rights - description: >- - The variable identifies the extent of free and fair elections, political pluralism and participation, and a functioning - government, based on Freedom House's Freedom in the World. - + unit: '' + description: |- + The variable identifies the extent of free and fair elections, political pluralism and participation, and a functioning government, based on Freedom House's Freedom in the World. It matches Freedom House's variable scorepr. - Lower ratings indicate more rights. - unit: '' civlibs_fh: title: Civil liberties - description: >- - The variable identifies the extent of freedom of expression and association, the rule of law, and personal autonomy, - based on Freedom House's Freedom in the World. - + unit: '' + description: |- + The variable identifies the extent of freedom of expression and association, the rule of law, and personal autonomy, based on Freedom House's Freedom in the World. It matches Freedom House's variable scorecl.
- Lower ratings indicate more liberties. - unit: '' electdem_fh: title: Political regime - description: >- - The variable identifies the political regime of a country using the second classification by Freedom House's Freedom - in the World. It identifies whether a country is an electoral democracy (score 1) or not (score 0). - - - Electoral democracies are understood as political systems in which citizens have the right to choose chief executive - and legislature in broadly free and fair elections and have substantial other political rights and civil liberties. unit: '' + description: |- + The variable identifies the political regime of a country using the second classification by Freedom House's Freedom in the World. It identifies whether a country is an electoral democracy (score 1) or not (score 0). + + Electoral democracies are understood as political systems in which citizens have the right to choose chief executive and legislature in broadly free and fair elections and have substantial other political rights and civil liberties. electprocess_fh: title: Democratic electoral institutions - description: >- - The variable identifies the extent to which the chief executive and legislature are chosen in free and fair elections - under universal suffrage, based on Freedom House's Freedom in the World. - + unit: '' + description: |- + The variable identifies the extent to which the chief executive and legislature are chosen in free and fair elections under universal suffrage, based on Freedom House's Freedom in the World. Higher scores indicate a freer electoral process. - unit: '' polrights_score_fh: title: polrights_score_fh - description: >- - The variable identifies the fine-grained extent of free and fair elections, political pluralism and participation, - and a functioning government, based on Freedom House's Freedom in the World. - - - Higher scores indicate more rights. unit: '' display: name: Political rights + description: |- + The variable identifies the fine-grained extent of free and fair elections, political pluralism and participation, and a functioning government, based on Freedom House's Freedom in the World. + + Higher scores indicate more rights. civlibs_score_fh: title: civlibs_score_fh - description: >- - The variable identifies the fine-grained extent of freedom of expression and association, the rule of law, and personal - autonomy, based on Freedom House's Freedom in the World. - - - Higher scores indicate more liberties. unit: '' display: name: Civil liberties + description: |- + The variable identifies the fine-grained extent of freedom of expression and association, the rule of law, and personal autonomy, based on Freedom House's Freedom in the World. + + Higher scores indicate more liberties. country_fh: title: country_fh - description: The variable identifies whether Freedom House consider the entity a country or territory. unit: '' + description: The variable identifies whether Freedom House considers the entity a country or territory. number_notfree_fh: title: number_notfree_fh - description: The variable identifies the number of not-free countries based on Freedom House's Freedom in the World. unit: '' + description: The variable identifies the number of not-free countries based on Freedom House's Freedom in the World. number_partlyfree_fh: title: number_partlyfree_fh - description: The variable identifies the number of partly-free countries based on Freedom House's Freedom in the World.
unit: '' + description: The variable identifies the number of partly-free countries based on Freedom House's Freedom in the World. number_free_fh: title: number_free_fh - description: The variable identifies the number of free countries based on Freedom House's Freedom in the World. unit: '' + description: The variable identifies the number of free countries based on Freedom House's Freedom in the World. number_nonelectdem_fh: title: number_nonelectdem_fh - description: The variable identifies the number of non-democracies based on Freedom House's Freedom in the World. unit: '' + description: The variable identifies the number of non-democracies based on Freedom House's Freedom in the World. number_electdem_fh: title: number_electdem_fh - description: The variable identifies the number of electoral democracies based on Freedom House's Freedom in the World. unit: '' + description: The variable identifies the number of electoral democracies based on Freedom House's Freedom in the World. pop_notfree_fh: title: pop_notfree_fh - description: The variable identifies the number of people living in not free countries, based on Freedom House's Freedom - in the World. unit: '' + description: |- + The variable identifies the number of people living in not free countries, based on Freedom House's Freedom in the World. pop_partlyfree_fh: title: pop_partlyfree_fh - description: The variable identifies the number of people living in partly free countries, based on Freedom House's - Freedom in the World. unit: '' + description: |- + The variable identifies the number of people living in partly free countries, based on Freedom House's Freedom in the World. pop_free_fh: title: pop_free_fh + unit: '' description: The variable identifies the number of people living in free countries, based on Freedom House's Freedom in the World. - unit: '' pop_missreg_fh: title: pop_missreg_fh - description: The variable identifies the number of people living in countries without regime data, based on Freedom - House's Freedom in the World. unit: '' + description: |- + The variable identifies the number of people living in countries without regime data, based on Freedom House's Freedom in the World. pop_nonelectdem_fh: title: pop_nonelectdem_fh + unit: '' description: The variable identifies the number of people living in non-democracies, based on Freedom House's Freedom in the World. - unit: '' pop_electdem_fh: title: pop_electdem_fh - description: The variable identifies the number of people living in electoral democracies, based on Freedom House's - Freedom in the World. unit: '' + description: |- + The variable identifies the number of people living in electoral democracies, based on Freedom House's Freedom in the World. pop_missdem_fh: title: pop_missdem_fh - description: The variable identifies the number of people living in countries without democracy data, based on Freedom - House's Freedom in the World. unit: '' + description: |- + The variable identifies the number of people living in countries without democracy data, based on Freedom House's Freedom in the World. region: title: region - description: The variable identifies the region a country or territory is in. unit: '' + description: The variable identifies the region a country or territory is in. 
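Note on the script changes that follow: each fasttrack grapher step below is refactored the same way. The old pattern read the snapshot with pd.read_csv and wrapped it in catalog.Table; the new pattern reads the table directly via snap.read_csv(), strips the dim_ prefix from any dimension columns, indexes on country and date (rather than country and year) when the year column actually holds ISO dates, and applies an optional *.meta.override.yml on top of the default metadata. A minimal sketch of the column handling, using plain pandas and a hypothetical dim_sex column for illustration; the real steps call tb.format(...) on a catalog Table, and set_index stands in for it here:

import pandas as pd


def uses_dates(s: pd.Series) -> bool:
    # True only when every value parses as an ISO date (YYYY-MM-DD),
    # mirroring the helper added in the steps below.
    return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all()


# Hypothetical fasttrack-style frame; "dim_sex" stands in for any dimension column.
df = pd.DataFrame(
    {
        "country": ["France", "France"],
        "year": ["2020-01-01", "2020-02-01"],
        "dim_sex": ["female", "female"],
        "value": [1.0, 2.0],
    }
)

# Dimension columns arrive with a "dim_" prefix; strip it before indexing.
dims = [c for c in df.columns if c.startswith("dim_")]
dims_without_prefix = [c[4:] for c in dims]
df = df.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)})

# Daily series are re-keyed on "date"; annual series keep "year".
if uses_dates(df["year"]):
    df = df.rename(columns={"year": "date"}).set_index(["country", "date"] + dims_without_prefix)
else:
    df = df.set_index(["country", "year"] + dims_without_prefix)

Because uses_dates and the dim_ handling are duplicated verbatim across these steps, they look like candidates for a shared helper in etl.helpers, though that is outside this diff.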
diff --git a/etl/steps/data/grapher/fasttrack/latest/democracy_freedom_house.py b/etl/steps/data/grapher/fasttrack/latest/democracy_freedom_house.py index d72eedbccc5..eff617c8ff4 100644 --- a/etl/steps/data/grapher/fasttrack/latest/democracy_freedom_house.py +++ b/etl/steps/data/grapher/fasttrack/latest/democracy_freedom_house.py @@ -1,7 +1,6 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot paths = PathFinder(__file__) @@ -12,11 +11,30 @@ def run(dest_dir: str) -> None: snap = Snapshot("fasttrack/latest/democracy_freedom_house.csv") # load data - data = pd.read_csv(snap.path) + tb = snap.read_csv() - # create empty dataframe and table - tb = catalog.Table(data, short_name=paths.short_name) + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/latest/gbd_2019_mental_health_country_coverage.meta.yml b/etl/steps/data/grapher/fasttrack/latest/gbd_2019_mental_health_country_coverage.meta.yml index 384f41c717e..777a5a9a304 100644 --- a/etl/steps/data/grapher/fasttrack/latest/gbd_2019_mental_health_country_coverage.meta.yml +++ b/etl/steps/data/grapher/fasttrack/latest/gbd_2019_mental_health_country_coverage.meta.yml @@ -1,25 +1,11 @@ dataset: - namespace: fasttrack - version: latest - short_name: gbd_2019_mental_health_country_coverage title: Countries with mental health data in GBD 2019 description: |- Dataset showing the number of countries with primary data on the prevalence of mental illnesses. These were found after a systematic review, grey literature search and expert consultation, to identify studies with data on the prevalence of each mental illness. 'The GBD inclusion criteria stipulated that: (1) the diagnostic criteria must be from 1980 onward; (2) “caseness” must be based on clinical threshold as established by the DSM, ICD, Chinese Classification of Mental Disorders (CCMD), or diagnosed by a clinician using established tools; (3) sufficient information must be provided on study method and sample characteristics to assess the quality of the study; and (4) study samples must be representative of the general population (i.e., case studies, veterans, or refugee samples were excluded). No limitation was set on the language of publication.' - sources: - - name: IHME GBD (2019) - published_by: 'Vos, T., Lim, S. S., Abbafati, C., Abbas, K. 
M., Abbasi, M., Abbasifard, - M., Abbasi-Kangevari, M., Abbastabar, H., Abd-Allah, F., Abdelalim, A., Abdollahi, - M., Abdollahpour, I., Abolhassani, H., Aboyans, V., Abrams, E. M., Abreu, L. - G., Abrigo, M. R. M., Abu-Raddad, L. J., Abushouk, A. I., … Murray, C. J. L. - (2020). Global burden of 369 diseases and injuries in 204 countries and territories, - 1990–2019: A systematic analysis for the Global Burden of Disease Study 2019. - The Lancet, 396(10258), 1204–1222.' - description: Country-level prevalence data - publication_year: 2020 - date_accessed: 2023-05-05 - url: https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(20)30925-9/fulltext + licenses: + - url: https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(20)30925-9/fulltext tables: gbd_2019_mental_health_country_coverage: variables: diff --git a/etl/steps/data/grapher/fasttrack/latest/gbd_2019_mental_health_country_coverage.py b/etl/steps/data/grapher/fasttrack/latest/gbd_2019_mental_health_country_coverage.py index f50cd5e8481..b43cc1b4135 100644 --- a/etl/steps/data/grapher/fasttrack/latest/gbd_2019_mental_health_country_coverage.py +++ b/etl/steps/data/grapher/fasttrack/latest/gbd_2019_mental_health_country_coverage.py @@ -1,19 +1,40 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot -P = PathFinder(__file__) +paths = PathFinder(__file__) def run(dest_dir: str) -> None: # load snapshot - data = pd.read_csv(Snapshot("fasttrack/latest/gbd_2019_mental_health_country_coverage.csv").path) + snap = Snapshot("fasttrack/latest/gbd_2019_mental_health_country_coverage.csv") - # create empty dataframe and table - tb = catalog.Table(data, short_name=P.short_name) + # load data + tb = snap.read_csv() + + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb]) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/latest/global_maternal_offspring_loss.meta.yml b/etl/steps/data/grapher/fasttrack/latest/global_maternal_offspring_loss.meta.yml index 60e45e07800..10c7516ebcb 100644 --- a/etl/steps/data/grapher/fasttrack/latest/global_maternal_offspring_loss.meta.yml +++ b/etl/steps/data/grapher/fasttrack/latest/global_maternal_offspring_loss.meta.yml @@ -1,28 +1,17 @@ dataset: title: Global maternal offspring loss - Smith-Greenaway et al. 2021 - description: This dataset shows survey data and estimates of maternal offspring loss across countries. This includes mothers - who have lost an infant, child under 5 years old, or offspring. These are given as a rate per 1000 women in the age group. 
- Underlying data comes from large-scale surveys (such as the Demographic and Health Surveys and Multiple Indicator Cluster - Surveys) conducted in many low- and middle-income countries. For countries lacking data, these are estimated using an - indirect approach that combines formal kinship models and life-table methods in an additional 81 countries. + description: |- + This dataset shows survey data and estimates of maternal offspring loss across countries. This includes mothers who have lost an infant, child under 5 years old, or offspring. These are given as a rate per 1000 women in the age group. Underlying data comes from large-scale surveys (such as the Demographic and Health Surveys and Multiple Indicator Cluster Surveys) conducted in many low- and middle-income countries. For countries lacking data, these are estimated using an indirect approach that combines formal kinship models and life-table methods in an additional 81 countries. Citation: Smith-Greenaway, E., Alburez-Gutierrez, D., Trinitapoli, J., & Zagheni, E. (2021). Global burden of maternal bereavement: Indicators of the cumulative prevalence of child loss. BMJ Global Health, 6(4), e004837. https://doi.org/10.1136/bmjgh-2020-004837 licenses: - - {} - sources: - - name: Smith-Greenaway et al. (2021) - url: https://gh.bmj.com/content/6/4/e004837.abstract - date_accessed: '2023-09-05' - publication_year: '2021' - published_by: 'Global burden of maternal bereavement: indicators of the cumulative prevalence of child loss. (2021) Emily - Smith-Greenaway, Diego Alburez-Gutierrez, Jenny Trinitapoli, Emilio Zagheni.' + - {} tables: global_maternal_offspring_loss: variables: survey_source: title: Survey source - description: The survey used for underlying data – either the Demographic and Health Surveys (DHS) or Multiple Indicator - Cluster Surveys (MICS). Data was estimated for countries without a survey source, using formal kinship-models and - life tables. unit: '' + description: |- + The survey used for underlying data – either the Demographic and Health Surveys (DHS) or Multiple Indicator Cluster Surveys (MICS). Data was estimated for countries without a survey source, using formal kinship models and life tables. 
cumulative_maternal_infant_mortality_among_mothers_aged_20_44_as_a_rate_per_1000_mothers_in_the_age_group: title: Cumulative maternal infant mortality among mothers aged 20-44 as a rate per 1000 mothers in the age group unit: per 1000 mothers diff --git a/etl/steps/data/grapher/fasttrack/latest/global_maternal_offspring_loss.py b/etl/steps/data/grapher/fasttrack/latest/global_maternal_offspring_loss.py index 6cd1b71eae7..b0ff839ac7b 100644 --- a/etl/steps/data/grapher/fasttrack/latest/global_maternal_offspring_loss.py +++ b/etl/steps/data/grapher/fasttrack/latest/global_maternal_offspring_loss.py @@ -1,7 +1,6 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot paths = PathFinder(__file__) @@ -12,11 +11,30 @@ def run(dest_dir: str) -> None: snap = Snapshot("fasttrack/latest/global_maternal_offspring_loss.csv") # load data - data = pd.read_csv(snap.path) + tb = snap.read_csv() - # create empty dataframe and table - tb = catalog.Table(data, short_name=paths.short_name) + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/latest/historical_france_mortality_cause.meta.yml b/etl/steps/data/grapher/fasttrack/latest/historical_france_mortality_cause.meta.yml index 2af4f53a6b7..f785bb48b59 100644 --- a/etl/steps/data/grapher/fasttrack/latest/historical_france_mortality_cause.meta.yml +++ b/etl/steps/data/grapher/fasttrack/latest/historical_france_mortality_cause.meta.yml @@ -1,6611 +1,6513 @@ dataset: title: Database on causes of death in France from 1925 to 1999 - description: >- - Dataset on mortality rates from each cause of death category in France between 1925 and 1999. The underlying data for - this chart comes from the Institut National d'Études Démographiques, published by Jacques Vallin and France Meslé, and - covers causes of deaths nationally in France between 1925 and 1999. Causes of death were categorized into categories according - to the 9th edition of the International Classification of Diseases (ICD-9) manual. Mortality rates are given for five-year - age bands, as an annual rate out of 100,000 people in that age group. Below are the ICD codes used for each cause category: - All causes = 000*-999*, - + description: |- + Dataset on mortality rates from each cause of death category in France between 1925 and 1999. 
The underlying data for this chart comes from the Institut National d'Études Démographiques, published by Jacques Vallin and France Meslé, and covers causes of deaths nationally in France between 1925 and 1999. Causes of death were categorized into categories according to the 9th edition of the International Classification of Diseases (ICD-9) manual. Mortality rates are given for five-year age bands, as an annual rate out of 100,000 people in that age group. Below are the ICD codes used for each cause category: All causes = 000*-999*, Infectious and parasitic diseases = 001*-139*, - Neoplasms = 140*-239*, - Endocrine nutritional and metabolic diseases and immunity disorders = 240*-279*, - Diseases of the blood and blood-forming organs = 280*-289*, - Mental disorders = 290*-319*, - Diseases of the nervous system = 320*-359*, - Diseases of the sense organs = 360*-389*, - Diseases of the circulatory system = 390*-459*, - Diseases of the respiratory system = 460*-519*, - Diseases of the digestive system = 520*-579*, - Diseases of the genitourinary system = 580*-629*, - Complications of pregnancy childbirth and the puerperium = 630*-679*, - Diseases of the skin and subcutaneous tissue = 680*-709*, - Diseases of the musculoskeletal system and connective tissue = 710*-739*, - Congenital anomalies = 740*-759*, - Certain conditions originating in the perinatal period = 760*-779*, - Symptoms signs and ill-defined conditions = 780*-799*, - External causes (injury and poisoning) = 800*-999* licenses: - - {} - sources: - - name: Institut National d'Études Démographiques - url: https://www.demographic-research.org/Volumes/Vol36/21/ - date_accessed: '2023-06-23' - publication_year: '2014' - published_by: Jacques Vallin and France Meslé + - {} tables: historical_france_mortality_cause: variables: all_causes_female_age_all_ages: title: All causes_female_age_all ages - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_under_1: title: All causes_female_age_under_1 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_1_4: title: All causes_female_age_1-4 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_5_9: title: All causes_female_age_5-9 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_10_14: title: All causes_female_age_10-14 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_15_19: title: All causes_female_age_15-19 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. 
all_causes_female_age_20_24: title: All causes_female_age_20-24 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_25_29: title: All causes_female_age_25-29 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_30_34: title: All causes_female_age_30-34 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_35_39: title: All causes_female_age_35-39 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_40_44: title: All causes_female_age_40-44 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_45_49: title: All causes_female_age_45-49 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_50_54: title: All causes_female_age_50-54 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_55_59: title: All causes_female_age_55-59 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_60_64: title: All causes_female_age_60-64 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_65_69: title: All causes_female_age_65-69 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_70_74: title: All causes_female_age_70-74 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_75_79: title: All causes_female_age_75-79 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_80_84: title: All causes_female_age_80-84 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. 
all_causes_female_age_85_89: title: All causes_female_age_85-89 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_90_94: title: All causes_female_age_90-94 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_95_99: title: All causes_female_age_95-99 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_female_age_100_and_over: title: All causes_female_age_100_and_over - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. infectious_and_parasitic_diseases_female_age_all_ages: title: Infectious and parasitic diseases_female_age_all ages - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_under_1: title: Infectious and parasitic diseases_female_age_under_1 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_1_4: title: Infectious and parasitic diseases_female_age_1-4 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_5_9: title: Infectious and parasitic diseases_female_age_5-9 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_10_14: title: Infectious and parasitic diseases_female_age_10-14 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. 
unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_15_19: title: Infectious and parasitic diseases_female_age_15-19 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_20_24: title: Infectious and parasitic diseases_female_age_20-24 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_25_29: title: Infectious and parasitic diseases_female_age_25-29 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_30_34: title: Infectious and parasitic diseases_female_age_30-34 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_35_39: title: Infectious and parasitic diseases_female_age_35-39 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_40_44: title: Infectious and parasitic diseases_female_age_40-44 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_45_49: title: Infectious and parasitic diseases_female_age_45-49 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. 
- Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_50_54: title: Infectious and parasitic diseases_female_age_50-54 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_55_59: title: Infectious and parasitic diseases_female_age_55-59 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_60_64: title: Infectious and parasitic diseases_female_age_60-64 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_65_69: title: Infectious and parasitic diseases_female_age_65-69 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_70_74: title: Infectious and parasitic diseases_female_age_70-74 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_75_79: title: Infectious and parasitic diseases_female_age_75-79 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. 
infectious_and_parasitic_diseases_female_age_80_84: title: Infectious and parasitic diseases_female_age_80-84 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_85_89: title: Infectious and parasitic diseases_female_age_85-89 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_90_94: title: Infectious and parasitic diseases_female_age_90-94 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_95_99: title: Infectious and parasitic diseases_female_age_95-99 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_female_age_100_and_over: title: Infectious and parasitic diseases_female_age_100_and_over - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. neoplasms_female_age_all_ages: title: Neoplasms_female_age_all ages + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_under_1: title: Neoplasms_female_age_under_1 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_1_4: title: Neoplasms_female_age_1-4 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_5_9: title: Neoplasms_female_age_5-9 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. 
- unit: per 100 000 neoplasms_female_age_10_14: title: Neoplasms_female_age_10-14 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_15_19: title: Neoplasms_female_age_15-19 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_20_24: title: Neoplasms_female_age_20-24 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_25_29: title: Neoplasms_female_age_25-29 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_30_34: title: Neoplasms_female_age_30-34 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_35_39: title: Neoplasms_female_age_35-39 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_40_44: title: Neoplasms_female_age_40-44 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_45_49: title: Neoplasms_female_age_45-49 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_50_54: title: Neoplasms_female_age_50-54 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_55_59: title: Neoplasms_female_age_55-59 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_60_64: title: Neoplasms_female_age_60-64 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_65_69: title: Neoplasms_female_age_65-69 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_70_74: title: Neoplasms_female_age_70-74 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_75_79: title: Neoplasms_female_age_75-79 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_80_84: title: Neoplasms_female_age_80-84 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. 
- unit: per 100 000 neoplasms_female_age_85_89: title: Neoplasms_female_age_85-89 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_90_94: title: Neoplasms_female_age_90-94 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_95_99: title: Neoplasms_female_age_95-99 + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 neoplasms_female_age_100_and_over: title: Neoplasms_female_age_100_and_over + unit: per 100 000 description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes 140*-239*. - unit: per 100 000 endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_all_ages: title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_all ages - description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 - 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD - death codes 240*-279*. unit: per 100 000 + description: |- + The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*. endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_under_1: title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_under_1 - description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 - 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD - death codes 240*-279*. unit: per 100 000 + description: |- + The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*. endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_1_4: title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_1-4 - description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 - 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD - death codes 240*-279*. unit: per 100 000 + description: |- + The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*. endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_5_9: title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_5-9 - description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 - 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD - death codes 240*-279*. 
unit: per 100 000 + description: |- + The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*. endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_10_14: title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_10-14 - description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 - 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD - death codes 240*-279*. unit: per 100 000 + description: |- + The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*. endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_15_19: title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_15-19 - description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 - 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD - death codes 240*-279*. unit: per 100 000 + description: |- + The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*. endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_20_24: title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_20-24 - description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 - 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD - death codes 240*-279*. unit: per 100 000 + description: |- + The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*. endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_25_29: title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_25-29 - description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 - 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD - death codes 240*-279*. unit: per 100 000 + description: |- + The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*. endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_30_34: title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_30-34 - description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 - 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD - death codes 240*-279*. 
unit: per 100 000 + description: |- + The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*. endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_35_39: title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_35-39 - description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 - 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD - death codes 240*-279*. unit: per 100 000 + description: |- + The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*. endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_40_44: title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_40-44 - description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 - 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD - death codes 240*-279*. unit: per 100 000 + description: |- + The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*. endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_45_49: title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_45-49 - description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 - 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD - death codes 240*-279*. unit: per 100 000 + description: |- + The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*. endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_50_54: title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_50-54 - description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 - 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD - death codes 240*-279*. unit: per 100 000 + description: |- + The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*. endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_55_59: title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_55-59 - description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 - 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD - death codes 240*-279*. 
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_60_64:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_60-64
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_65_69:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_65-69
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_70_74:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_70-74
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_75_79:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_75-79
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_80_84:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_80-84
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_85_89:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_85-89
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_90_94:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_90-94
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_95_99:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_95-99
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_female_age_100_and_over:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_female_age_100_and_over
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_all_ages:
     title: Diseases of the blood and blood-forming organs_female_age_all ages
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_under_1:
     title: Diseases of the blood and blood-forming organs_female_age_under_1
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_1_4:
     title: Diseases of the blood and blood-forming organs_female_age_1-4
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_5_9:
     title: Diseases of the blood and blood-forming organs_female_age_5-9
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_10_14:
     title: Diseases of the blood and blood-forming organs_female_age_10-14
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_15_19:
     title: Diseases of the blood and blood-forming organs_female_age_15-19
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_20_24:
     title: Diseases of the blood and blood-forming organs_female_age_20-24
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_25_29:
     title: Diseases of the blood and blood-forming organs_female_age_25-29
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_30_34:
     title: Diseases of the blood and blood-forming organs_female_age_30-34
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_35_39:
     title: Diseases of the blood and blood-forming organs_female_age_35-39
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_40_44:
     title: Diseases of the blood and blood-forming organs_female_age_40-44
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_45_49:
     title: Diseases of the blood and blood-forming organs_female_age_45-49
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_50_54:
     title: Diseases of the blood and blood-forming organs_female_age_50-54
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_55_59:
     title: Diseases of the blood and blood-forming organs_female_age_55-59
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_60_64:
     title: Diseases of the blood and blood-forming organs_female_age_60-64
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_65_69:
     title: Diseases of the blood and blood-forming organs_female_age_65-69
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_70_74:
     title: Diseases of the blood and blood-forming organs_female_age_70-74
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_75_79:
     title: Diseases of the blood and blood-forming organs_female_age_75-79
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_80_84:
     title: Diseases of the blood and blood-forming organs_female_age_80-84
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_85_89:
     title: Diseases of the blood and blood-forming organs_female_age_85-89
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_90_94:
     title: Diseases of the blood and blood-forming organs_female_age_90-94
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_95_99:
     title: Diseases of the blood and blood-forming organs_female_age_95-99
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_female_age_100_and_over:
     title: Diseases of the blood and blood-forming organs_female_age_100_and_over
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   mental_disorders_female_age_all_ages:
     title: Mental disorders_female_age_all ages
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_under_1:
     title: Mental disorders_female_age_under_1
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_1_4:
     title: Mental disorders_female_age_1-4
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_5_9:
     title: Mental disorders_female_age_5-9
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_10_14:
     title: Mental disorders_female_age_10-14
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_15_19:
     title: Mental disorders_female_age_15-19
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_20_24:
     title: Mental disorders_female_age_20-24
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_25_29:
     title: Mental disorders_female_age_25-29
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_30_34:
     title: Mental disorders_female_age_30-34
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_35_39:
     title: Mental disorders_female_age_35-39
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_40_44:
     title: Mental disorders_female_age_40-44
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_45_49:
     title: Mental disorders_female_age_45-49
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_50_54:
     title: Mental disorders_female_age_50-54
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_55_59:
     title: Mental disorders_female_age_55-59
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_60_64:
     title: Mental disorders_female_age_60-64
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_65_69:
     title: Mental disorders_female_age_65-69
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_70_74:
     title: Mental disorders_female_age_70-74
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_75_79:
     title: Mental disorders_female_age_75-79
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_80_84:
     title: Mental disorders_female_age_80-84
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_85_89:
     title: Mental disorders_female_age_85-89
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_90_94:
     title: Mental disorders_female_age_90-94
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_95_99:
     title: Mental disorders_female_age_95-99
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_female_age_100_and_over:
     title: Mental disorders_female_age_100_and_over
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   diseases_of_the_nervous_system_female_age_all_ages:
     title: Diseases of the nervous system_female_age_all ages
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_under_1:
     title: Diseases of the nervous system_female_age_under_1
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_1_4:
     title: Diseases of the nervous system_female_age_1-4
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_5_9:
     title: Diseases of the nervous system_female_age_5-9
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_10_14:
     title: Diseases of the nervous system_female_age_10-14
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_15_19:
     title: Diseases of the nervous system_female_age_15-19
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_20_24:
     title: Diseases of the nervous system_female_age_20-24
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_25_29:
     title: Diseases of the nervous system_female_age_25-29
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_30_34:
     title: Diseases of the nervous system_female_age_30-34
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_35_39:
     title: Diseases of the nervous system_female_age_35-39
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_40_44:
     title: Diseases of the nervous system_female_age_40-44
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_45_49:
     title: Diseases of the nervous system_female_age_45-49
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_50_54:
     title: Diseases of the nervous system_female_age_50-54
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_55_59:
     title: Diseases of the nervous system_female_age_55-59
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_60_64:
     title: Diseases of the nervous system_female_age_60-64
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_65_69:
     title: Diseases of the nervous system_female_age_65-69
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_70_74:
     title: Diseases of the nervous system_female_age_70-74
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_75_79:
     title: Diseases of the nervous system_female_age_75-79
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_80_84:
     title: Diseases of the nervous system_female_age_80-84
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_85_89:
     title: Diseases of the nervous system_female_age_85-89
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_90_94:
     title: Diseases of the nervous system_female_age_90-94
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_95_99:
     title: Diseases of the nervous system_female_age_95-99
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_female_age_100_and_over:
     title: Diseases of the nervous system_female_age_100_and_over
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_sense_organs_female_age_all_ages:
     title: Diseases of the sense organs_female_age_all ages
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_under_1:
     title: Diseases of the sense organs_female_age_under_1
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_1_4:
     title: Diseases of the sense organs_female_age_1-4
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_5_9:
     title: Diseases of the sense organs_female_age_5-9
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_10_14:
     title: Diseases of the sense organs_female_age_10-14
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_15_19:
     title: Diseases of the sense organs_female_age_15-19
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_20_24:
     title: Diseases of the sense organs_female_age_20-24
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_25_29:
     title: Diseases of the sense organs_female_age_25-29
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_30_34:
     title: Diseases of the sense organs_female_age_30-34
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_35_39:
     title: Diseases of the sense organs_female_age_35-39
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_40_44:
     title: Diseases of the sense organs_female_age_40-44
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_45_49:
     title: Diseases of the sense organs_female_age_45-49
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_50_54:
     title: Diseases of the sense organs_female_age_50-54
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_55_59:
     title: Diseases of the sense organs_female_age_55-59
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_60_64:
     title: Diseases of the sense organs_female_age_60-64
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_65_69:
     title: Diseases of the sense organs_female_age_65-69
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_70_74:
     title: Diseases of the sense organs_female_age_70-74
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_75_79:
     title: Diseases of the sense organs_female_age_75-79
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_80_84:
     title: Diseases of the sense organs_female_age_80-84
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_85_89:
     title: Diseases of the sense organs_female_age_85-89
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_90_94:
     title: Diseases of the sense organs_female_age_90-94
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_95_99:
     title: Diseases of the sense organs_female_age_95-99
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_female_age_100_and_over:
     title: Diseases of the sense organs_female_age_100_and_over
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_circulatory_system_female_age_all_ages:
     title: Diseases of the circulatory system_female_age_all ages
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_under_1:
     title: Diseases of the circulatory system_female_age_under_1
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_1_4:
     title: Diseases of the circulatory system_female_age_1-4
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_5_9:
     title: Diseases of the circulatory system_female_age_5-9
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_10_14:
     title: Diseases of the circulatory system_female_age_10-14
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_15_19:
     title: Diseases of the circulatory system_female_age_15-19
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_20_24:
     title: Diseases of the circulatory system_female_age_20-24
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_25_29:
     title: Diseases of the circulatory system_female_age_25-29
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_30_34:
     title: Diseases of the circulatory system_female_age_30-34
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_35_39:
     title: Diseases of the circulatory system_female_age_35-39
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_40_44:
     title: Diseases of the circulatory system_female_age_40-44
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_45_49:
     title: Diseases of the circulatory system_female_age_45-49
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_50_54:
     title: Diseases of the circulatory system_female_age_50-54
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_55_59:
     title: Diseases of the circulatory system_female_age_55-59
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_60_64:
     title: Diseases of the circulatory system_female_age_60-64
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_65_69:
     title: Diseases of the circulatory system_female_age_65-69
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_70_74:
     title: Diseases of the circulatory system_female_age_70-74
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_75_79:
     title: Diseases of the circulatory system_female_age_75-79
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_80_84:
     title: Diseases of the circulatory system_female_age_80-84
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_85_89:
     title: Diseases of the circulatory system_female_age_85-89
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_90_94:
     title: Diseases of the circulatory system_female_age_90-94
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_95_99:
     title: Diseases of the circulatory system_female_age_95-99
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_female_age_100_and_over:
     title: Diseases of the circulatory system_female_age_100_and_over
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_respiratory_system_female_age_all_ages:
     title: Diseases of the respiratory system_female_age_all ages
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_under_1:
     title: Diseases of the respiratory system_female_age_under_1
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_1_4:
     title: Diseases of the respiratory system_female_age_1-4
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_5_9:
     title: Diseases of the respiratory system_female_age_5-9
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_10_14:
     title: Diseases of the respiratory system_female_age_10-14
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_15_19:
     title: Diseases of the respiratory system_female_age_15-19
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_20_24:
     title: Diseases of the respiratory system_female_age_20-24
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_25_29:
     title: Diseases of the respiratory system_female_age_25-29
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_30_34:
     title: Diseases of the respiratory system_female_age_30-34
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_35_39:
     title: Diseases of the respiratory system_female_age_35-39
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_40_44:
     title: Diseases of the respiratory system_female_age_40-44
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_45_49:
     title: Diseases of the respiratory system_female_age_45-49
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_50_54:
     title: Diseases of the respiratory system_female_age_50-54
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_55_59:
     title: Diseases of the respiratory system_female_age_55-59
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_60_64:
     title: Diseases of the respiratory system_female_age_60-64
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_65_69:
     title: Diseases of the respiratory system_female_age_65-69
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_70_74:
     title: Diseases of the respiratory system_female_age_70-74
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_75_79:
     title: Diseases of the respiratory system_female_age_75-79
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_80_84:
     title: Diseases of the respiratory system_female_age_80-84
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_85_89:
     title: Diseases of the respiratory system_female_age_85-89
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_90_94:
     title: Diseases of the respiratory system_female_age_90-94
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_female_age_95_99:
     title: Diseases of the respiratory system_female_age_95-99
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population.
Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_female_age_100_and_over: title: Diseases of the respiratory system_female_age_100_and_over - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_digestive_system_female_age_all_ages: title: Diseases of the digestive system_female_age_all ages - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_female_age_under_1: title: Diseases of the digestive system_female_age_under_1 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_female_age_1_4: title: Diseases of the digestive system_female_age_1-4 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_female_age_5_9: title: Diseases of the digestive system_female_age_5-9 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_female_age_10_14: title: Diseases of the digestive system_female_age_10-14 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_female_age_15_19: title: Diseases of the digestive system_female_age_15-19 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. 
    diseases_of_the_digestive_system_female_age_20_24:
      title: Diseases of the digestive system_female_age_20-24
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_digestive_system_female_age_25_29:
      title: Diseases of the digestive system_female_age_25-29
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_digestive_system_female_age_30_34:
      title: Diseases of the digestive system_female_age_30-34
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_digestive_system_female_age_35_39:
      title: Diseases of the digestive system_female_age_35-39
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_digestive_system_female_age_40_44:
      title: Diseases of the digestive system_female_age_40-44
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_digestive_system_female_age_45_49:
      title: Diseases of the digestive system_female_age_45-49
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_digestive_system_female_age_50_54:
      title: Diseases of the digestive system_female_age_50-54
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_digestive_system_female_age_55_59:
      title: Diseases of the digestive system_female_age_55-59
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_digestive_system_female_age_60_64:
      title: Diseases of the digestive system_female_age_60-64
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_digestive_system_female_age_65_69:
      title: Diseases of the digestive system_female_age_65-69
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_digestive_system_female_age_70_74:
      title: Diseases of the digestive system_female_age_70-74
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_digestive_system_female_age_75_79:
      title: Diseases of the digestive system_female_age_75-79
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_digestive_system_female_age_80_84:
      title: Diseases of the digestive system_female_age_80-84
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_digestive_system_female_age_85_89:
      title: Diseases of the digestive system_female_age_85-89
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_digestive_system_female_age_90_94:
      title: Diseases of the digestive system_female_age_90-94
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_digestive_system_female_age_95_99:
      title: Diseases of the digestive system_female_age_95-99
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_digestive_system_female_age_100_and_over:
      title: Diseases of the digestive system_female_age_100_and_over
-     description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-       of the digestive system have the ICD death codes 520*-579*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
    diseases_of_the_genitourinary_system_female_age_all_ages:
      title: Diseases of the genitourinary system_female_age_all ages
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_under_1:
      title: Diseases of the genitourinary system_female_age_under_1
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_1_4:
      title: Diseases of the genitourinary system_female_age_1-4
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_5_9:
      title: Diseases of the genitourinary system_female_age_5-9
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_10_14:
      title: Diseases of the genitourinary system_female_age_10-14
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_15_19:
      title: Diseases of the genitourinary system_female_age_15-19
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_20_24:
      title: Diseases of the genitourinary system_female_age_20-24
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_25_29:
      title: Diseases of the genitourinary system_female_age_25-29
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_30_34:
      title: Diseases of the genitourinary system_female_age_30-34
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_35_39:
      title: Diseases of the genitourinary system_female_age_35-39
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_40_44:
      title: Diseases of the genitourinary system_female_age_40-44
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_45_49:
      title: Diseases of the genitourinary system_female_age_45-49
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_50_54:
      title: Diseases of the genitourinary system_female_age_50-54
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_55_59:
      title: Diseases of the genitourinary system_female_age_55-59
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_60_64:
      title: Diseases of the genitourinary system_female_age_60-64
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_65_69:
      title: Diseases of the genitourinary system_female_age_65-69
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_70_74:
      title: Diseases of the genitourinary system_female_age_70-74
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_75_79:
      title: Diseases of the genitourinary system_female_age_75-79
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_80_84:
      title: Diseases of the genitourinary system_female_age_80-84
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_85_89:
      title: Diseases of the genitourinary system_female_age_85-89
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_90_94:
      title: Diseases of the genitourinary system_female_age_90-94
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_95_99:
      title: Diseases of the genitourinary system_female_age_95-99
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    diseases_of_the_genitourinary_system_female_age_100_and_over:
      title: Diseases of the genitourinary system_female_age_100_and_over
-     description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-       of the genitourinary system have the ICD death codes 580*-629*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_all_ages:
      title: Complications of pregnancy childbirth and the puerperium_female_age_all ages
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_under_1:
      title: Complications of pregnancy childbirth and the puerperium_female_age_under_1
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_1_4:
      title: Complications of pregnancy childbirth and the puerperium_female_age_1-4
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_5_9:
      title: Complications of pregnancy childbirth and the puerperium_female_age_5-9
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_10_14:
      title: Complications of pregnancy childbirth and the puerperium_female_age_10-14
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_15_19:
      title: Complications of pregnancy childbirth and the puerperium_female_age_15-19
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_20_24:
      title: Complications of pregnancy childbirth and the puerperium_female_age_20-24
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_25_29:
      title: Complications of pregnancy childbirth and the puerperium_female_age_25-29
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_30_34:
      title: Complications of pregnancy childbirth and the puerperium_female_age_30-34
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_35_39:
      title: Complications of pregnancy childbirth and the puerperium_female_age_35-39
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_40_44:
      title: Complications of pregnancy childbirth and the puerperium_female_age_40-44
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_45_49:
      title: Complications of pregnancy childbirth and the puerperium_female_age_45-49
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_50_54:
      title: Complications of pregnancy childbirth and the puerperium_female_age_50-54
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_55_59:
      title: Complications of pregnancy childbirth and the puerperium_female_age_55-59
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_60_64:
      title: Complications of pregnancy childbirth and the puerperium_female_age_60-64
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_65_69:
      title: Complications of pregnancy childbirth and the puerperium_female_age_65-69
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_70_74:
      title: Complications of pregnancy childbirth and the puerperium_female_age_70-74
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_75_79:
      title: Complications of pregnancy childbirth and the puerperium_female_age_75-79
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_80_84:
      title: Complications of pregnancy childbirth and the puerperium_female_age_80-84
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_85_89:
      title: Complications of pregnancy childbirth and the puerperium_female_age_85-89
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_90_94:
      title: Complications of pregnancy childbirth and the puerperium_female_age_90-94
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_95_99:
      title: Complications of pregnancy childbirth and the puerperium_female_age_95-99
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    complications_of_pregnancy_childbirth_and_the_puerperium_female_age_100_and_over:
      title: Complications of pregnancy childbirth and the puerperium_female_age_100_and_over
-     description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-       in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
      unit: per 100 000
+     description: |-
+       The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_all_ages:
      title: Diseases of the skin and subcutaneous tissue_female_age_all ages
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_under_1:
      title: Diseases of the skin and subcutaneous tissue_female_age_under_1
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_1_4:
      title: Diseases of the skin and subcutaneous tissue_female_age_1-4
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_5_9:
      title: Diseases of the skin and subcutaneous tissue_female_age_5-9
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_10_14:
      title: Diseases of the skin and subcutaneous tissue_female_age_10-14
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_15_19:
      title: Diseases of the skin and subcutaneous tissue_female_age_15-19
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_20_24:
      title: Diseases of the skin and subcutaneous tissue_female_age_20-24
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_25_29:
      title: Diseases of the skin and subcutaneous tissue_female_age_25-29
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_30_34:
      title: Diseases of the skin and subcutaneous tissue_female_age_30-34
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_35_39:
      title: Diseases of the skin and subcutaneous tissue_female_age_35-39
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_40_44:
      title: Diseases of the skin and subcutaneous tissue_female_age_40-44
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_45_49:
      title: Diseases of the skin and subcutaneous tissue_female_age_45-49
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_50_54:
      title: Diseases of the skin and subcutaneous tissue_female_age_50-54
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_55_59:
      title: Diseases of the skin and subcutaneous tissue_female_age_55-59
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_60_64:
      title: Diseases of the skin and subcutaneous tissue_female_age_60-64
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_65_69:
      title: Diseases of the skin and subcutaneous tissue_female_age_65-69
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_70_74:
      title: Diseases of the skin and subcutaneous tissue_female_age_70-74
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_75_79:
      title: Diseases of the skin and subcutaneous tissue_female_age_75-79
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_80_84:
      title: Diseases of the skin and subcutaneous tissue_female_age_80-84
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_85_89:
      title: Diseases of the skin and subcutaneous tissue_female_age_85-89
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_90_94:
      title: Diseases of the skin and subcutaneous tissue_female_age_90-94
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_95_99:
      title: Diseases of the skin and subcutaneous tissue_female_age_95-99
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_skin_and_subcutaneous_tissue_female_age_100_and_over:
      title: Diseases of the skin and subcutaneous tissue_female_age_100_and_over
-     description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-       Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
    diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_all_ages:
      title: Diseases of the musculoskeletal system and connective tissue_female_age_all ages
-     description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-       in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
      unit: per 100 000
+     description: |-
+       The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_under_1:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_under_1
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_1_4:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_1-4
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_5_9:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_5-9
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_10_14:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_10-14
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_15_19:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_15-19
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_20_24:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_20-24
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_25_29:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_25-29
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_30_34:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_30-34
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_35_39:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_35-39
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_40_44:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_40-44
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_45_49:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_45-49
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_50_54:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_50-54
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_55_59:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_55-59
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_60_64:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_60-64
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_65_69:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_65-69
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_70_74:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_70-74
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_75_79:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_75-79
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_80_84:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_80-84
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_85_89:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_85-89
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_90_94:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_90-94
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_95_99:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_95-99
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       diseases_of_the_musculoskeletal_system_and_connective_tissue_female_age_100_and_over:
         title: Diseases of the musculoskeletal system and connective tissue_female_age_100_and_over
-        description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-          in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
         unit: per 100 000
+        description: |-
+          The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       congenital_anomalies_female_age_all_ages:
         title: Congenital anomalies_female_age_all ages
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_under_1:
         title: Congenital anomalies_female_age_under_1
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_1_4:
         title: Congenital anomalies_female_age_1-4
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_5_9:
         title: Congenital anomalies_female_age_5-9
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_10_14:
         title: Congenital anomalies_female_age_10-14
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_15_19:
         title: Congenital anomalies_female_age_15-19
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_20_24:
         title: Congenital anomalies_female_age_20-24
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_25_29:
         title: Congenital anomalies_female_age_25-29
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_30_34:
         title: Congenital anomalies_female_age_30-34
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_35_39:
         title: Congenital anomalies_female_age_35-39
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_40_44:
         title: Congenital anomalies_female_age_40-44
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_45_49:
         title: Congenital anomalies_female_age_45-49
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_50_54:
         title: Congenital anomalies_female_age_50-54
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_55_59:
         title: Congenital anomalies_female_age_55-59
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_60_64:
         title: Congenital anomalies_female_age_60-64
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_65_69:
         title: Congenital anomalies_female_age_65-69
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_70_74:
         title: Congenital anomalies_female_age_70-74
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_75_79:
         title: Congenital anomalies_female_age_75-79
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_80_84:
         title: Congenital anomalies_female_age_80-84
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_85_89:
         title: Congenital anomalies_female_age_85-89
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_90_94:
         title: Congenital anomalies_female_age_90-94
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_95_99:
         title: Congenital anomalies_female_age_95-99
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       congenital_anomalies_female_age_100_and_over:
         title: Congenital anomalies_female_age_100_and_over
-        description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-          have the ICD death codes 740*-759*.
         unit: per 100 000
+        description: |-
+          The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
       certain_conditions_originating_in_the_perinatal_period_female_age_all_ages:
         title: Certain conditions originating in the perinatal period_female_age_all ages
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_under_1:
         title: Certain conditions originating in the perinatal period_female_age_under_1
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_1_4:
         title: Certain conditions originating in the perinatal period_female_age_1-4
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_5_9:
         title: Certain conditions originating in the perinatal period_female_age_5-9
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_10_14:
         title: Certain conditions originating in the perinatal period_female_age_10-14
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_15_19:
         title: Certain conditions originating in the perinatal period_female_age_15-19
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_20_24:
         title: Certain conditions originating in the perinatal period_female_age_20-24
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_25_29:
         title: Certain conditions originating in the perinatal period_female_age_25-29
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_30_34:
         title: Certain conditions originating in the perinatal period_female_age_30-34
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_35_39:
         title: Certain conditions originating in the perinatal period_female_age_35-39
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_40_44:
         title: Certain conditions originating in the perinatal period_female_age_40-44
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_45_49:
         title: Certain conditions originating in the perinatal period_female_age_45-49
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_50_54:
         title: Certain conditions originating in the perinatal period_female_age_50-54
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_55_59:
         title: Certain conditions originating in the perinatal period_female_age_55-59
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_60_64:
         title: Certain conditions originating in the perinatal period_female_age_60-64
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_65_69:
         title: Certain conditions originating in the perinatal period_female_age_65-69
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_70_74:
         title: Certain conditions originating in the perinatal period_female_age_70-74
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_75_79:
         title: Certain conditions originating in the perinatal period_female_age_75-79
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_80_84:
         title: Certain conditions originating in the perinatal period_female_age_80-84
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_85_89:
         title: Certain conditions originating in the perinatal period_female_age_85-89
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_90_94:
         title: Certain conditions originating in the perinatal period_female_age_90-94
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_95_99:
         title: Certain conditions originating in the perinatal period_female_age_95-99
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       certain_conditions_originating_in_the_perinatal_period_female_age_100_and_over:
         title: Certain conditions originating in the perinatal period_female_age_100_and_over
-        description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in
-          the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
         unit: per 100 000
+        description: |-
+          The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*.
       symptoms_signs_and_ill_defined_conditions_female_age_all_ages:
         title: Symptoms signs and ill-defined conditions_female_age_all ages
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_under_1:
         title: Symptoms signs and ill-defined conditions_female_age_under_1
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_1_4:
         title: Symptoms signs and ill-defined conditions_female_age_1-4
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_5_9:
         title: Symptoms signs and ill-defined conditions_female_age_5-9
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_10_14:
         title: Symptoms signs and ill-defined conditions_female_age_10-14
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_15_19:
         title: Symptoms signs and ill-defined conditions_female_age_15-19
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_20_24:
         title: Symptoms signs and ill-defined conditions_female_age_20-24
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_25_29:
         title: Symptoms signs and ill-defined conditions_female_age_25-29
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_30_34:
         title: Symptoms signs and ill-defined conditions_female_age_30-34
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_35_39:
         title: Symptoms signs and ill-defined conditions_female_age_35-39
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_40_44:
         title: Symptoms signs and ill-defined conditions_female_age_40-44
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_45_49:
         title: Symptoms signs and ill-defined conditions_female_age_45-49
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_50_54:
         title: Symptoms signs and ill-defined conditions_female_age_50-54
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_55_59:
         title: Symptoms signs and ill-defined conditions_female_age_55-59
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_60_64:
         title: Symptoms signs and ill-defined conditions_female_age_60-64
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_65_69:
         title: Symptoms signs and ill-defined conditions_female_age_65-69
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_70_74:
         title: Symptoms signs and ill-defined conditions_female_age_70-74
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_75_79:
         title: Symptoms signs and ill-defined conditions_female_age_75-79
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_80_84:
         title: Symptoms signs and ill-defined conditions_female_age_80-84
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_85_89:
         title: Symptoms signs and ill-defined conditions_female_age_85-89
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_90_94:
         title: Symptoms signs and ill-defined conditions_female_age_90-94
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_95_99:
         title: Symptoms signs and ill-defined conditions_female_age_95-99
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       symptoms_signs_and_ill_defined_conditions_female_age_100_and_over:
         title: Symptoms signs and ill-defined conditions_female_age_100_and_over
-        description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
-          Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
         unit: per 100 000
+        description: |-
+          The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
       injury_and_poisoning_female_age_all_ages:
         title: Injury and poisoning_female_age_all ages
-        description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-          have the ICD death codes 800*-999*.
         unit: per 100 000
+        description: |-
+          The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_under_1:
     title: Injury and poisoning_female_age_under_1
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_1_4:
     title: Injury and poisoning_female_age_1-4
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_5_9:
     title: Injury and poisoning_female_age_5-9
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_10_14:
     title: Injury and poisoning_female_age_10-14
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_15_19:
     title: Injury and poisoning_female_age_15-19
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_20_24:
     title: Injury and poisoning_female_age_20-24
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_25_29:
     title: Injury and poisoning_female_age_25-29
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_30_34:
     title: Injury and poisoning_female_age_30-34
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_35_39:
     title: Injury and poisoning_female_age_35-39
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_40_44:
     title: Injury and poisoning_female_age_40-44
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_45_49:
     title: Injury and poisoning_female_age_45-49
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_50_54:
     title: Injury and poisoning_female_age_50-54
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_55_59:
     title: Injury and poisoning_female_age_55-59
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_60_64:
     title: Injury and poisoning_female_age_60-64
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_65_69:
     title: Injury and poisoning_female_age_65-69
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_70_74:
     title: Injury and poisoning_female_age_70-74
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_75_79:
     title: Injury and poisoning_female_age_75-79
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_80_84:
     title: Injury and poisoning_female_age_80-84
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_85_89:
     title: Injury and poisoning_female_age_85-89
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_90_94:
     title: Injury and poisoning_female_age_90-94
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_95_99:
     title: Injury and poisoning_female_age_95-99
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   injury_and_poisoning_female_age_100_and_over:
     title: Injury and poisoning_female_age_100_and_over
-    description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning
-      have the ICD death codes 800*-999*.
     unit: per 100 000
+    description: |-
+      The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
   all_causes_male_age_all_ages:
     title: All causes_male_age_all ages
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_under_1:
     title: All causes_male_age_under_1
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_1_4:
     title: All causes_male_age_1-4
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_5_9:
     title: All causes_male_age_5-9
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_10_14:
     title: All causes_male_age_10-14
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_15_19:
     title: All causes_male_age_15-19
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_20_24:
     title: All causes_male_age_20-24
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_25_29:
     title: All causes_male_age_25-29
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_30_34:
     title: All causes_male_age_30-34
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_35_39:
     title: All causes_male_age_35-39
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_40_44:
     title: All causes_male_age_40-44
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_45_49:
     title: All causes_male_age_45-49
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_50_54:
     title: All causes_male_age_50-54
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_55_59:
     title: All causes_male_age_55-59
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_60_64:
     title: All causes_male_age_60-64
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_65_69:
     title: All causes_male_age_65-69
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_70_74:
     title: All causes_male_age_70-74
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_75_79:
     title: All causes_male_age_75-79
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_80_84:
     title: All causes_male_age_80-84
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_85_89:
     title: All causes_male_age_85-89
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_90_94:
     title: All causes_male_age_90-94
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_95_99:
     title: All causes_male_age_95-99
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   all_causes_male_age_100_and_over:
     title: All causes_male_age_100_and_over
-    description: The mortality rate from all causes per 100 000 people in the whole population.
     unit: per 100 000
+    description: The mortality rate from all causes per 100 000 people in the whole population.
   infectious_and_parasitic_diseases_male_age_all_ages:
     title: Infectious and parasitic diseases_male_age_all ages
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_under_1:
     title: Infectious and parasitic diseases_male_age_under_1
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_1_4:
     title: Infectious and parasitic diseases_male_age_1-4
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_5_9:
     title: Infectious and parasitic diseases_male_age_5-9
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_10_14:
     title: Infectious and parasitic diseases_male_age_10-14
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_15_19:
     title: Infectious and parasitic diseases_male_age_15-19
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_20_24:
     title: Infectious and parasitic diseases_male_age_20-24
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_25_29:
     title: Infectious and parasitic diseases_male_age_25-29
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_30_34:
     title: Infectious and parasitic diseases_male_age_30-34
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_35_39:
     title: Infectious and parasitic diseases_male_age_35-39
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_40_44:
     title: Infectious and parasitic diseases_male_age_40-44
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_45_49:
     title: Infectious and parasitic diseases_male_age_45-49
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_50_54:
     title: Infectious and parasitic diseases_male_age_50-54
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_55_59:
     title: Infectious and parasitic diseases_male_age_55-59
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_60_64:
     title: Infectious and parasitic diseases_male_age_60-64
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_65_69:
     title: Infectious and parasitic diseases_male_age_65-69
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_70_74:
     title: Infectious and parasitic diseases_male_age_70-74
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_75_79:
     title: Infectious and parasitic diseases_male_age_75-79
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_80_84:
     title: Infectious and parasitic diseases_male_age_80-84
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_85_89:
     title: Infectious and parasitic diseases_male_age_85-89
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_90_94:
     title: Infectious and parasitic diseases_male_age_90-94
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_95_99:
     title: Infectious and parasitic diseases_male_age_95-99
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   infectious_and_parasitic_diseases_male_age_100_and_over:
     title: Infectious and parasitic diseases_male_age_100_and_over
-    description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-      Infectious and parasitic diseases have the ICD death codes 001*-139*.
     unit: per 100 000
+    description: |-
+      The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
   neoplasms_male_age_all_ages:
     title: Neoplasms_male_age_all ages
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_under_1:
     title: Neoplasms_male_age_under_1
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_1_4:
     title: Neoplasms_male_age_1-4
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_5_9:
     title: Neoplasms_male_age_5-9
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_10_14:
     title: Neoplasms_male_age_10-14
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_15_19:
     title: Neoplasms_male_age_15-19
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_20_24:
     title: Neoplasms_male_age_20-24
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_25_29:
     title: Neoplasms_male_age_25-29
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_30_34:
     title: Neoplasms_male_age_30-34
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_35_39:
     title: Neoplasms_male_age_35-39
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_40_44:
     title: Neoplasms_male_age_40-44
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_45_49:
     title: Neoplasms_male_age_45-49
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_50_54:
     title: Neoplasms_male_age_50-54
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_55_59:
     title: Neoplasms_male_age_55-59
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_60_64:
     title: Neoplasms_male_age_60-64
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_65_69:
     title: Neoplasms_male_age_65-69
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_70_74:
     title: Neoplasms_male_age_70-74
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_75_79:
     title: Neoplasms_male_age_75-79
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_80_84:
     title: Neoplasms_male_age_80-84
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_85_89:
     title: Neoplasms_male_age_85-89
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_90_94:
     title: Neoplasms_male_age_90-94
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_95_99:
     title: Neoplasms_male_age_95-99
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   neoplasms_male_age_100_and_over:
     title: Neoplasms_male_age_100_and_over
+    unit: per 100 000
     description: The mortality rate from cancers per 100 000 people in the population. Cancers have the ICD death codes
       140*-239*.
-    unit: per 100 000
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_all_ages:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_all ages
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_under_1:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_under_1
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_1_4:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_1-4
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_5_9:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_5-9
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_10_14:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_10-14
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_15_19:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_15-19
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_20_24:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_20-24
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_25_29:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_25-29
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_30_34:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_30-34
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_35_39:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_35-39
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_40_44:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_40-44
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_45_49:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_45-49
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_50_54:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_50-54
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_55_59:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_55-59
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_60_64:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_60-64
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_65_69:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_65-69
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_70_74:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_70-74
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_75_79:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_75-79
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_80_84:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_80-84
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_85_89:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_85-89
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_90_94:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_90-94
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_95_99:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_95-99
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_male_age_100_and_over:
     title: Endocrine nutritional and metabolic diseases and immunity disorders_male_age_100_and_over
-    description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-      000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-      death codes 240*-279*.
     unit: per 100 000
+    description: |-
+      The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_all_ages:
     title: Diseases of the blood and blood-forming organs_male_age_all ages
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_under_1:
     title: Diseases of the blood and blood-forming organs_male_age_under_1
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_1_4:
     title: Diseases of the blood and blood-forming organs_male_age_1-4
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_5_9:
     title: Diseases of the blood and blood-forming organs_male_age_5-9
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_10_14:
     title: Diseases of the blood and blood-forming organs_male_age_10-14
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_15_19:
     title: Diseases of the blood and blood-forming organs_male_age_15-19
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_20_24:
     title: Diseases of the blood and blood-forming organs_male_age_20-24
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_25_29:
     title: Diseases of the blood and blood-forming organs_male_age_25-29
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_30_34:
     title: Diseases of the blood and blood-forming organs_male_age_30-34
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_35_39:
     title: Diseases of the blood and blood-forming organs_male_age_35-39
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_40_44:
     title: Diseases of the blood and blood-forming organs_male_age_40-44
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_45_49:
     title: Diseases of the blood and blood-forming organs_male_age_45-49
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_50_54:
     title: Diseases of the blood and blood-forming organs_male_age_50-54
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_55_59:
     title: Diseases of the blood and blood-forming organs_male_age_55-59
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_60_64:
     title: Diseases of the blood and blood-forming organs_male_age_60-64
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_65_69:
     title: Diseases of the blood and blood-forming organs_male_age_65-69
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_70_74:
     title: Diseases of the blood and blood-forming organs_male_age_70-74
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_75_79:
     title: Diseases of the blood and blood-forming organs_male_age_75-79
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_80_84:
     title: Diseases of the blood and blood-forming organs_male_age_80-84
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_85_89:
     title: Diseases of the blood and blood-forming organs_male_age_85-89
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_90_94:
     title: Diseases of the blood and blood-forming organs_male_age_90-94
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_95_99:
     title: Diseases of the blood and blood-forming organs_male_age_95-99
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   diseases_of_the_blood_and_blood_forming_organs_male_age_100_and_over:
     title: Diseases of the blood and blood-forming organs_male_age_100_and_over
-    description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-      Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
   mental_disorders_male_age_all_ages:
     title: Mental disorders_male_age_all ages
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_under_1:
     title: Mental disorders_male_age_under_1
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_1_4:
     title: Mental disorders_male_age_1-4
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_5_9:
     title: Mental disorders_male_age_5-9
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_10_14:
     title: Mental disorders_male_age_10-14
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_15_19:
     title: Mental disorders_male_age_15-19
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_20_24:
     title: Mental disorders_male_age_20-24
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_25_29:
     title: Mental disorders_male_age_25-29
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_30_34:
     title: Mental disorders_male_age_30-34
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_35_39:
     title: Mental disorders_male_age_35-39
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_40_44:
     title: Mental disorders_male_age_40-44
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_45_49:
     title: Mental disorders_male_age_45-49
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_50_54:
     title: Mental disorders_male_age_50-54
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_55_59:
     title: Mental disorders_male_age_55-59
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_60_64:
     title: Mental disorders_male_age_60-64
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_65_69:
     title: Mental disorders_male_age_65-69
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_70_74:
     title: Mental disorders_male_age_70-74
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_75_79:
     title: Mental disorders_male_age_75-79
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_80_84:
     title: Mental disorders_male_age_80-84
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_85_89:
     title: Mental disorders_male_age_85-89
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_90_94:
     title: Mental disorders_male_age_90-94
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_95_99:
     title: Mental disorders_male_age_95-99
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   mental_disorders_male_age_100_and_over:
     title: Mental disorders_male_age_100_and_over
-    description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-      the ICD death codes 290*-319*.
     unit: per 100 000
+    description: |-
+      The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
   diseases_of_the_nervous_system_male_age_all_ages:
     title: Diseases of the nervous system_male_age_all ages
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_male_age_under_1:
     title: Diseases of the nervous system_male_age_under_1
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_male_age_1_4:
     title: Diseases of the nervous system_male_age_1-4
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_male_age_5_9:
     title: Diseases of the nervous system_male_age_5-9
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_male_age_10_14:
     title: Diseases of the nervous system_male_age_10-14
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-      of the nervous system have the ICD death code 320*-359*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
   diseases_of_the_nervous_system_male_age_15_19:
     title: Diseases of the nervous system_male_age_15-19
-    description: The mortality rate from diseases of the nervous system per 100 000 people in the population.
Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_nervous_system_male_age_20_24: title: Diseases of the nervous system_male_age_20-24 - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_nervous_system_male_age_25_29: title: Diseases of the nervous system_male_age_25-29 - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_nervous_system_male_age_30_34: title: Diseases of the nervous system_male_age_30-34 - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_nervous_system_male_age_35_39: title: Diseases of the nervous system_male_age_35-39 - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_nervous_system_male_age_40_44: title: Diseases of the nervous system_male_age_40-44 - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_nervous_system_male_age_45_49: title: Diseases of the nervous system_male_age_45-49 - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_nervous_system_male_age_50_54: title: Diseases of the nervous system_male_age_50-54 - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. 
Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_nervous_system_male_age_55_59: title: Diseases of the nervous system_male_age_55-59 - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_nervous_system_male_age_60_64: title: Diseases of the nervous system_male_age_60-64 - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_nervous_system_male_age_65_69: title: Diseases of the nervous system_male_age_65-69 - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_nervous_system_male_age_70_74: title: Diseases of the nervous system_male_age_70-74 - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_nervous_system_male_age_75_79: title: Diseases of the nervous system_male_age_75-79 - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_nervous_system_male_age_80_84: title: Diseases of the nervous system_male_age_80-84 - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_nervous_system_male_age_85_89: title: Diseases of the nervous system_male_age_85-89 - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. 
diseases_of_the_nervous_system_male_age_90_94: title: Diseases of the nervous system_male_age_90-94 - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_nervous_system_male_age_95_99: title: Diseases of the nervous system_male_age_95-99 - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_nervous_system_male_age_100_and_over: title: Diseases of the nervous system_male_age_100_and_over - description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases - of the nervous system have the ICD death code 320*-359*. unit: per 100 000 + description: |- + The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*. diseases_of_the_sense_organs_male_age_all_ages: title: Diseases of the sense organs_male_age_all ages - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_under_1: title: Diseases of the sense organs_male_age_under_1 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_1_4: title: Diseases of the sense organs_male_age_1-4 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_5_9: title: Diseases of the sense organs_male_age_5-9 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_10_14: title: Diseases of the sense organs_male_age_10-14 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. 
Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_15_19: title: Diseases of the sense organs_male_age_15-19 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_20_24: title: Diseases of the sense organs_male_age_20-24 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_25_29: title: Diseases of the sense organs_male_age_25-29 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_30_34: title: Diseases of the sense organs_male_age_30-34 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_35_39: title: Diseases of the sense organs_male_age_35-39 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_40_44: title: Diseases of the sense organs_male_age_40-44 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_45_49: title: Diseases of the sense organs_male_age_45-49 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. 
diseases_of_the_sense_organs_male_age_50_54: title: Diseases of the sense organs_male_age_50-54 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_55_59: title: Diseases of the sense organs_male_age_55-59 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_60_64: title: Diseases of the sense organs_male_age_60-64 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_65_69: title: Diseases of the sense organs_male_age_65-69 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_70_74: title: Diseases of the sense organs_male_age_70-74 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_75_79: title: Diseases of the sense organs_male_age_75-79 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_80_84: title: Diseases of the sense organs_male_age_80-84 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_85_89: title: Diseases of the sense organs_male_age_85-89 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. 
unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_90_94: title: Diseases of the sense organs_male_age_90-94 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_95_99: title: Diseases of the sense organs_male_age_95-99 - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_sense_organs_male_age_100_and_over: title: Diseases of the sense organs_male_age_100_and_over - description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of - the nervous system have the ICD death code 360*-389*. unit: per 100 000 + description: |- + The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*. diseases_of_the_circulatory_system_male_age_all_ages: title: Diseases of the circulatory system_male_age_all ages - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_under_1: title: Diseases of the circulatory system_male_age_under_1 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_1_4: title: Diseases of the circulatory system_male_age_1-4 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_5_9: title: Diseases of the circulatory system_male_age_5-9 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. 
Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_10_14: title: Diseases of the circulatory system_male_age_10-14 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_15_19: title: Diseases of the circulatory system_male_age_15-19 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_20_24: title: Diseases of the circulatory system_male_age_20-24 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_25_29: title: Diseases of the circulatory system_male_age_25-29 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_30_34: title: Diseases of the circulatory system_male_age_30-34 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_35_39: title: Diseases of the circulatory system_male_age_35-39 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_40_44: title: Diseases of the circulatory system_male_age_40-44 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. 
Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_45_49: title: Diseases of the circulatory system_male_age_45-49 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_50_54: title: Diseases of the circulatory system_male_age_50-54 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_55_59: title: Diseases of the circulatory system_male_age_55-59 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_60_64: title: Diseases of the circulatory system_male_age_60-64 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_65_69: title: Diseases of the circulatory system_male_age_65-69 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_70_74: title: Diseases of the circulatory system_male_age_70-74 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_75_79: title: Diseases of the circulatory system_male_age_75-79 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. 
Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_80_84: title: Diseases of the circulatory system_male_age_80-84 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_85_89: title: Diseases of the circulatory system_male_age_85-89 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_90_94: title: Diseases of the circulatory system_male_age_90-94 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_95_99: title: Diseases of the circulatory system_male_age_95-99 - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_circulatory_system_male_age_100_and_over: title: Diseases of the circulatory system_male_age_100_and_over - description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases - of the circulatory system have the ICD death codes 390*-459* unit: per 100 000 + description: |- + The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459* diseases_of_the_respiratory_system_male_age_all_ages: title: Diseases of the respiratory system_male_age_all ages - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_under_1: title: Diseases of the respiratory system_male_age_under_1 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. 
Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_1_4: title: Diseases of the respiratory system_male_age_1-4 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_5_9: title: Diseases of the respiratory system_male_age_5-9 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_10_14: title: Diseases of the respiratory system_male_age_10-14 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_15_19: title: Diseases of the respiratory system_male_age_15-19 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_20_24: title: Diseases of the respiratory system_male_age_20-24 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_25_29: title: Diseases of the respiratory system_male_age_25-29 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_30_34: title: Diseases of the respiratory system_male_age_30-34 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. 
Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_35_39: title: Diseases of the respiratory system_male_age_35-39 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_40_44: title: Diseases of the respiratory system_male_age_40-44 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_45_49: title: Diseases of the respiratory system_male_age_45-49 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_50_54: title: Diseases of the respiratory system_male_age_50-54 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_55_59: title: Diseases of the respiratory system_male_age_55-59 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_60_64: title: Diseases of the respiratory system_male_age_60-64 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_65_69: title: Diseases of the respiratory system_male_age_65-69 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. 
Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_70_74: title: Diseases of the respiratory system_male_age_70-74 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_75_79: title: Diseases of the respiratory system_male_age_75-79 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_80_84: title: Diseases of the respiratory system_male_age_80-84 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_85_89: title: Diseases of the respiratory system_male_age_85-89 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_90_94: title: Diseases of the respiratory system_male_age_90-94 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_95_99: title: Diseases of the respiratory system_male_age_95-99 - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_respiratory_system_male_age_100_and_over: title: Diseases of the respiratory system_male_age_100_and_over - description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases - of the respiratory system have the ICD death codes 460*-519*. unit: per 100 000 + description: |- + The mortality rate from diseases of the respiratory system per 100 000 people in the population. 
Diseases of the respiratory system have the ICD death codes 460*-519*. diseases_of_the_digestive_system_male_age_all_ages: title: Diseases of the digestive system_male_age_all ages - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_under_1: title: Diseases of the digestive system_male_age_under_1 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_1_4: title: Diseases of the digestive system_male_age_1-4 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_5_9: title: Diseases of the digestive system_male_age_5-9 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_10_14: title: Diseases of the digestive system_male_age_10-14 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_15_19: title: Diseases of the digestive system_male_age_15-19 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_20_24: title: Diseases of the digestive system_male_age_20-24 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. 
diseases_of_the_digestive_system_male_age_25_29: title: Diseases of the digestive system_male_age_25-29 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_30_34: title: Diseases of the digestive system_male_age_30-34 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_35_39: title: Diseases of the digestive system_male_age_35-39 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_40_44: title: Diseases of the digestive system_male_age_40-44 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_45_49: title: Diseases of the digestive system_male_age_45-49 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_50_54: title: Diseases of the digestive system_male_age_50-54 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_55_59: title: Diseases of the digestive system_male_age_55-59 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. 
diseases_of_the_digestive_system_male_age_60_64: title: Diseases of the digestive system_male_age_60-64 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_65_69: title: Diseases of the digestive system_male_age_65-69 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_70_74: title: Diseases of the digestive system_male_age_70-74 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_75_79: title: Diseases of the digestive system_male_age_75-79 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_80_84: title: Diseases of the digestive system_male_age_80-84 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_85_89: title: Diseases of the digestive system_male_age_85-89 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. diseases_of_the_digestive_system_male_age_90_94: title: Diseases of the digestive system_male_age_90-94 - description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases - of the digestive system have the ICD death codes 520*-579*. unit: per 100 000 + description: |- + The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*. 
     diseases_of_the_digestive_system_male_age_95_99:
       title: Diseases of the digestive system_male_age_95-99
-      description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-        of the digestive system have the ICD death codes 520*-579*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
     diseases_of_the_digestive_system_male_age_100_and_over:
       title: Diseases of the digestive system_male_age_100_and_over
-      description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-        of the digestive system have the ICD death codes 520*-579*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
     diseases_of_the_genitourinary_system_male_age_all_ages:
       title: Diseases of the genitourinary system_male_age_all ages
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_under_1:
       title: Diseases of the genitourinary system_male_age_under_1
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_1_4:
       title: Diseases of the genitourinary system_male_age_1-4
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_5_9:
       title: Diseases of the genitourinary system_male_age_5-9
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_10_14:
       title: Diseases of the genitourinary system_male_age_10-14
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_15_19:
       title: Diseases of the genitourinary system_male_age_15-19
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_20_24:
       title: Diseases of the genitourinary system_male_age_20-24
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_25_29:
       title: Diseases of the genitourinary system_male_age_25-29
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_30_34:
       title: Diseases of the genitourinary system_male_age_30-34
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_35_39:
       title: Diseases of the genitourinary system_male_age_35-39
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_40_44:
       title: Diseases of the genitourinary system_male_age_40-44
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_45_49:
       title: Diseases of the genitourinary system_male_age_45-49
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_50_54:
       title: Diseases of the genitourinary system_male_age_50-54
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_55_59:
       title: Diseases of the genitourinary system_male_age_55-59
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_60_64:
       title: Diseases of the genitourinary system_male_age_60-64
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_65_69:
       title: Diseases of the genitourinary system_male_age_65-69
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_70_74:
       title: Diseases of the genitourinary system_male_age_70-74
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_75_79:
       title: Diseases of the genitourinary system_male_age_75-79
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_80_84:
       title: Diseases of the genitourinary system_male_age_80-84
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_85_89:
       title: Diseases of the genitourinary system_male_age_85-89
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_90_94:
       title: Diseases of the genitourinary system_male_age_90-94
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_95_99:
       title: Diseases of the genitourinary system_male_age_95-99
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     diseases_of_the_genitourinary_system_male_age_100_and_over:
       title: Diseases of the genitourinary system_male_age_100_and_over
-      description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-        of the genitourinary system have the ICD death codes 580*-629*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_all_ages:
       title: Complications of pregnancy childbirth and the puerperium_male_age_all ages
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_under_1:
       title: Complications of pregnancy childbirth and the puerperium_male_age_under_1
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_1_4:
       title: Complications of pregnancy childbirth and the puerperium_male_age_1-4
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_5_9:
       title: Complications of pregnancy childbirth and the puerperium_male_age_5-9
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_10_14:
       title: Complications of pregnancy childbirth and the puerperium_male_age_10-14
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_15_19:
       title: Complications of pregnancy childbirth and the puerperium_male_age_15-19
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_20_24:
       title: Complications of pregnancy childbirth and the puerperium_male_age_20-24
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_25_29:
       title: Complications of pregnancy childbirth and the puerperium_male_age_25-29
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_30_34:
       title: Complications of pregnancy childbirth and the puerperium_male_age_30-34
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_35_39:
       title: Complications of pregnancy childbirth and the puerperium_male_age_35-39
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_40_44:
       title: Complications of pregnancy childbirth and the puerperium_male_age_40-44
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_45_49:
       title: Complications of pregnancy childbirth and the puerperium_male_age_45-49
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_50_54:
       title: Complications of pregnancy childbirth and the puerperium_male_age_50-54
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_55_59:
       title: Complications of pregnancy childbirth and the puerperium_male_age_55-59
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_60_64:
       title: Complications of pregnancy childbirth and the puerperium_male_age_60-64
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_65_69:
       title: Complications of pregnancy childbirth and the puerperium_male_age_65-69
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_70_74:
       title: Complications of pregnancy childbirth and the puerperium_male_age_70-74
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_75_79:
       title: Complications of pregnancy childbirth and the puerperium_male_age_75-79
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_80_84:
       title: Complications of pregnancy childbirth and the puerperium_male_age_80-84
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_85_89:
       title: Complications of pregnancy childbirth and the puerperium_male_age_85-89
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_90_94:
       title: Complications of pregnancy childbirth and the puerperium_male_age_90-94
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_95_99:
       title: Complications of pregnancy childbirth and the puerperium_male_age_95-99
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     complications_of_pregnancy_childbirth_and_the_puerperium_male_age_100_and_over:
       title: Complications of pregnancy childbirth and the puerperium_male_age_100_and_over
-      description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-        in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
       unit: per 100 000
+      description: |-
+        The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_all_ages:
       title: Diseases of the skin and subcutaneous tissue_male_age_all ages
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_under_1:
       title: Diseases of the skin and subcutaneous tissue_male_age_under_1
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_1_4:
       title: Diseases of the skin and subcutaneous tissue_male_age_1-4
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_5_9:
       title: Diseases of the skin and subcutaneous tissue_male_age_5-9
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_10_14:
       title: Diseases of the skin and subcutaneous tissue_male_age_10-14
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_15_19:
       title: Diseases of the skin and subcutaneous tissue_male_age_15-19
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_20_24:
       title: Diseases of the skin and subcutaneous tissue_male_age_20-24
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_25_29:
       title: Diseases of the skin and subcutaneous tissue_male_age_25-29
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_30_34:
       title: Diseases of the skin and subcutaneous tissue_male_age_30-34
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_35_39:
       title: Diseases of the skin and subcutaneous tissue_male_age_35-39
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_40_44:
       title: Diseases of the skin and subcutaneous tissue_male_age_40-44
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_45_49:
       title: Diseases of the skin and subcutaneous tissue_male_age_45-49
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_50_54:
       title: Diseases of the skin and subcutaneous tissue_male_age_50-54
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_55_59:
       title: Diseases of the skin and subcutaneous tissue_male_age_55-59
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_60_64:
       title: Diseases of the skin and subcutaneous tissue_male_age_60-64
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_65_69:
       title: Diseases of the skin and subcutaneous tissue_male_age_65-69
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_70_74:
       title: Diseases of the skin and subcutaneous tissue_male_age_70-74
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_75_79:
       title: Diseases of the skin and subcutaneous tissue_male_age_75-79
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_80_84:
       title: Diseases of the skin and subcutaneous tissue_male_age_80-84
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_85_89:
       title: Diseases of the skin and subcutaneous tissue_male_age_85-89
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_90_94:
       title: Diseases of the skin and subcutaneous tissue_male_age_90-94
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_95_99:
       title: Diseases of the skin and subcutaneous tissue_male_age_95-99
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_skin_and_subcutaneous_tissue_male_age_100_and_over:
       title: Diseases of the skin and subcutaneous tissue_male_age_100_and_over
-      description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population.
-        Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_all_ages:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_all ages
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_under_1:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_under_1
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_1_4:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_1-4
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_5_9:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_5-9
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_10_14:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_10-14
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_15_19:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_15-19
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_20_24:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_20-24
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_25_29:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_25-29
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_30_34:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_30-34
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_35_39:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_35-39
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_40_44:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_40-44
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_45_49:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_45-49
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_50_54:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_50-54
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_55_59:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_55-59
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_60_64:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_60-64
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_65_69:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_65-69
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_70_74:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_70-74
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_75_79:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_75-79
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_80_84:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_80-84
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_85_89:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_85-89
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_90_94:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_90-94
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_95_99:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_95-99
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     diseases_of_the_musculoskeletal_system_and_connective_tissue_male_age_100_and_over:
       title: Diseases of the musculoskeletal system and connective tissue_male_age_100_and_over
-      description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people
-        in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
       unit: per 100 000
+      description: |-
+        The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*.
     congenital_anomalies_male_age_all_ages:
       title: Congenital anomalies_male_age_all ages
-      description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-        have the ICD death codes 740*-759*.
       unit: per 100 000
+      description: |-
+        The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
     congenital_anomalies_male_age_under_1:
       title: Congenital anomalies_male_age_under_1
-      description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-        have the ICD death codes 740*-759*.
       unit: per 100 000
+      description: |-
+        The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
     congenital_anomalies_male_age_1_4:
       title: Congenital anomalies_male_age_1-4
-      description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-        have the ICD death codes 740*-759*.
       unit: per 100 000
+      description: |-
+        The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
     congenital_anomalies_male_age_5_9:
       title: Congenital anomalies_male_age_5-9
-      description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-        have the ICD death codes 740*-759*.
       unit: per 100 000
+      description: |-
+        The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
     congenital_anomalies_male_age_10_14:
       title: Congenital anomalies_male_age_10-14
-      description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-        have the ICD death codes 740*-759*.
       unit: per 100 000
+      description: |-
+        The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
     congenital_anomalies_male_age_15_19:
       title: Congenital anomalies_male_age_15-19
-      description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-        have the ICD death codes 740*-759*.
       unit: per 100 000
+      description: |-
+        The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
     congenital_anomalies_male_age_20_24:
       title: Congenital anomalies_male_age_20-24
-      description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-        have the ICD death codes 740*-759*.
       unit: per 100 000
+      description: |-
+        The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
     congenital_anomalies_male_age_25_29:
       title: Congenital anomalies_male_age_25-29
-      description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-        have the ICD death codes 740*-759*.
       unit: per 100 000
+      description: |-
+        The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
     congenital_anomalies_male_age_30_34:
       title: Congenital anomalies_male_age_30-34
-      description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-        have the ICD death codes 740*-759*.
       unit: per 100 000
+      description: |-
+        The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
     congenital_anomalies_male_age_35_39:
       title: Congenital anomalies_male_age_35-39
-      description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-        have the ICD death codes 740*-759*.
       unit: per 100 000
+      description: |-
+        The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
     congenital_anomalies_male_age_40_44:
       title: Congenital anomalies_male_age_40-44
-      description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-        have the ICD death codes 740*-759*.
       unit: per 100 000
+      description: |-
+        The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
     congenital_anomalies_male_age_45_49:
       title: Congenital anomalies_male_age_45-49
-      description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-        have the ICD death codes 740*-759*.
       unit: per 100 000
+      description: |-
+        The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
     congenital_anomalies_male_age_50_54:
       title: Congenital anomalies_male_age_50-54
-      description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-        have the ICD death codes 740*-759*.
       unit: per 100 000
+      description: |-
+        The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
     congenital_anomalies_male_age_55_59:
       title: Congenital anomalies_male_age_55-59
-      description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-        have the ICD death codes 740*-759*.
       unit: per 100 000
+      description: |-
+        The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
     congenital_anomalies_male_age_60_64:
       title: Congenital anomalies_male_age_60-64
-      description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies
-        have the ICD death codes 740*-759*.
       unit: per 100 000
+      description: |-
+        The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*.
     congenital_anomalies_male_age_65_69:
       title: Congenital anomalies_male_age_65-69
-      description: The mortality rate from congenital anomalies per 100 000 people in the population.
Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_male_age_70_74: title: Congenital anomalies_male_age_70-74 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_male_age_75_79: title: Congenital anomalies_male_age_75-79 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_male_age_80_84: title: Congenital anomalies_male_age_80-84 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_male_age_85_89: title: Congenital anomalies_male_age_85-89 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_male_age_90_94: title: Congenital anomalies_male_age_90-94 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_male_age_95_99: title: Congenital anomalies_male_age_95-99 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_male_age_100_and_over: title: Congenital anomalies_male_age_100_and_over - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. certain_conditions_originating_in_the_perinatal_period_male_age_all_ages: title: Certain conditions originating in the perinatal period_male_age_all ages - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. 
unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_under_1: title: Certain conditions originating in the perinatal period_male_age_under_1 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_1_4: title: Certain conditions originating in the perinatal period_male_age_1-4 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_5_9: title: Certain conditions originating in the perinatal period_male_age_5-9 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_10_14: title: Certain conditions originating in the perinatal period_male_age_10-14 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_15_19: title: Certain conditions originating in the perinatal period_male_age_15-19 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. 
certain_conditions_originating_in_the_perinatal_period_male_age_20_24: title: Certain conditions originating in the perinatal period_male_age_20-24 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_25_29: title: Certain conditions originating in the perinatal period_male_age_25-29 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_30_34: title: Certain conditions originating in the perinatal period_male_age_30-34 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_35_39: title: Certain conditions originating in the perinatal period_male_age_35-39 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_40_44: title: Certain conditions originating in the perinatal period_male_age_40-44 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_45_49: title: Certain conditions originating in the perinatal period_male_age_45-49 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. 
unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_50_54: title: Certain conditions originating in the perinatal period_male_age_50-54 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_55_59: title: Certain conditions originating in the perinatal period_male_age_55-59 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_60_64: title: Certain conditions originating in the perinatal period_male_age_60-64 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_65_69: title: Certain conditions originating in the perinatal period_male_age_65-69 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_70_74: title: Certain conditions originating in the perinatal period_male_age_70-74 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. 
certain_conditions_originating_in_the_perinatal_period_male_age_75_79: title: Certain conditions originating in the perinatal period_male_age_75-79 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_80_84: title: Certain conditions originating in the perinatal period_male_age_80-84 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_85_89: title: Certain conditions originating in the perinatal period_male_age_85-89 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_90_94: title: Certain conditions originating in the perinatal period_male_age_90-94 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_95_99: title: Certain conditions originating in the perinatal period_male_age_95-99 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_male_age_100_and_over: title: Certain conditions originating in the perinatal period_male_age_100_and_over - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. 
unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. symptoms_signs_and_ill_defined_conditions_male_age_all_ages: title: Symptoms signs and ill-defined conditions_male_age_all ages - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_under_1: title: Symptoms signs and ill-defined conditions_male_age_under_1 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_1_4: title: Symptoms signs and ill-defined conditions_male_age_1-4 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_5_9: title: Symptoms signs and ill-defined conditions_male_age_5-9 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_10_14: title: Symptoms signs and ill-defined conditions_male_age_10-14 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_15_19: title: Symptoms signs and ill-defined conditions_male_age_15-19 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. 
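Although these hunks touch every description, the change is cosmetic: YAML folds the old wrapped plain scalar's line break into a single space, and the `|-` chomping indicator strips the trailing newline, so the old and new values parse to identical strings. A minimal check of that claim, assuming PyYAML is available:

```python
import yaml  # PyYAML

# The old wrapped plain scalar, exactly as removed by this patch...
old = """\
description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population.
  Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
"""

# ...and the new `|-` literal block that replaces it.
new = """\
description: |-
  The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*.
"""

# Folding joins the wrapped line with one space; `|-` drops the final
# newline, so both spellings load as exactly the same string.
assert yaml.safe_load(old)["description"] == yaml.safe_load(new)["description"]
```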
symptoms_signs_and_ill_defined_conditions_male_age_20_24: title: Symptoms signs and ill-defined conditions_male_age_20-24 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_25_29: title: Symptoms signs and ill-defined conditions_male_age_25-29 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_30_34: title: Symptoms signs and ill-defined conditions_male_age_30-34 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_35_39: title: Symptoms signs and ill-defined conditions_male_age_35-39 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_40_44: title: Symptoms signs and ill-defined conditions_male_age_40-44 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_45_49: title: Symptoms signs and ill-defined conditions_male_age_45-49 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_50_54: title: Symptoms signs and ill-defined conditions_male_age_50-54 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. 
- Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_55_59: title: Symptoms signs and ill-defined conditions_male_age_55-59 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_60_64: title: Symptoms signs and ill-defined conditions_male_age_60-64 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_65_69: title: Symptoms signs and ill-defined conditions_male_age_65-69 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_70_74: title: Symptoms signs and ill-defined conditions_male_age_70-74 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_75_79: title: Symptoms signs and ill-defined conditions_male_age_75-79 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_80_84: title: Symptoms signs and ill-defined conditions_male_age_80-84 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. 
Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_85_89: title: Symptoms signs and ill-defined conditions_male_age_85-89 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_90_94: title: Symptoms signs and ill-defined conditions_male_age_90-94 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_95_99: title: Symptoms signs and ill-defined conditions_male_age_95-99 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_male_age_100_and_over: title: Symptoms signs and ill-defined conditions_male_age_100_and_over - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. injury_and_poisoning_male_age_all_ages: title: Injury and poisoning_male_age_all ages - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_under_1: title: Injury and poisoning_male_age_under_1 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_1_4: title: Injury and poisoning_male_age_1-4 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. 
injury_and_poisoning_male_age_5_9: title: Injury and poisoning_male_age_5-9 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_10_14: title: Injury and poisoning_male_age_10-14 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_15_19: title: Injury and poisoning_male_age_15-19 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_20_24: title: Injury and poisoning_male_age_20-24 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_25_29: title: Injury and poisoning_male_age_25-29 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_30_34: title: Injury and poisoning_male_age_30-34 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_35_39: title: Injury and poisoning_male_age_35-39 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_40_44: title: Injury and poisoning_male_age_40-44 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_45_49: title: Injury and poisoning_male_age_45-49 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. 
unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_50_54: title: Injury and poisoning_male_age_50-54 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_55_59: title: Injury and poisoning_male_age_55-59 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_60_64: title: Injury and poisoning_male_age_60-64 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_65_69: title: Injury and poisoning_male_age_65-69 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_70_74: title: Injury and poisoning_male_age_70-74 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_75_79: title: Injury and poisoning_male_age_75-79 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_80_84: title: Injury and poisoning_male_age_80-84 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_85_89: title: Injury and poisoning_male_age_85-89 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. 
injury_and_poisoning_male_age_90_94: title: Injury and poisoning_male_age_90-94 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_95_99: title: Injury and poisoning_male_age_95-99 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_male_age_100_and_over: title: Injury and poisoning_male_age_100_and_over - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. all_causes_both_sexes_age_all_ages: title: All causes_both sexes_age_all ages - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_under_1: title: All causes_both sexes_age_under_1 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_1_4: title: All causes_both sexes_age_1-4 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_5_9: title: All causes_both sexes_age_5-9 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_10_14: title: All causes_both sexes_age_10-14 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_15_19: title: All causes_both sexes_age_15-19 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_20_24: title: All causes_both sexes_age_20-24 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_25_29: title: All causes_both sexes_age_25-29 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. 
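In the all-causes family just above, the pattern shifts slightly: those descriptions cite no ICD code range, so they stay single-line plain scalars rather than `|-` blocks. Across every family, though, the fields end up ordered title, unit, description (the later neoplasms hunks move `unit` up rather than the description down). A hypothetical post-edit sanity check could assert both invariants; it assumes the usual `tables:` → `variables:` layout of a garden `.meta.yml`, which is not shown in this hunk.

```python
# Hypothetical sanity check (not part of this patch). Assumes the standard
# garden layout: tables -> <table> -> variables -> <variable>.
import re
import sys

from ruamel.yaml import YAML

ICD_RANGE = re.compile(r"ICD death codes \d{3}\*-\d{3}\*")

with open(sys.argv[1]) as f:
    meta = YAML().load(f)

for table in meta["tables"].values():
    for name, var in table["variables"].items():
        # Every mortality-rate variable keeps its unit after the edit.
        assert var["unit"] == "per 100 000", f"missing unit: {name}"
        # Cause-specific descriptions must still cite their ICD code range;
        # the all-causes family legitimately has none.
        if not name.startswith("all_causes"):
            assert ICD_RANGE.search(var["description"]), f"no ICD range: {name}"
```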
all_causes_both_sexes_age_30_34: title: All causes_both sexes_age_30-34 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_35_39: title: All causes_both sexes_age_35-39 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_40_44: title: All causes_both sexes_age_40-44 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_45_49: title: All causes_both sexes_age_45-49 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_50_54: title: All causes_both sexes_age_50-54 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_55_59: title: All causes_both sexes_age_55-59 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_60_64: title: All causes_both sexes_age_60-64 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_65_69: title: All causes_both sexes_age_65-69 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_70_74: title: All causes_both sexes_age_70-74 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_75_79: title: All causes_both sexes_age_75-79 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_80_84: title: All causes_both sexes_age_80-84 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_85_89: title: All causes_both sexes_age_85-89 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_90_94: title: All causes_both sexes_age_90-94 - description: The mortality rate from all causes per 100 000 people in the whole population. 
unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_95_99: title: All causes_both sexes_age_95-99 - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. all_causes_both_sexes_age_100_and_over: title: All causes_both sexes_age_100_and_over - description: The mortality rate from all causes per 100 000 people in the whole population. unit: per 100 000 + description: The mortality rate from all causes per 100 000 people in the whole population. infectious_and_parasitic_diseases_both_sexes_age_all_ages: title: Infectious and parasitic diseases_both sexes_age_all ages - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_under_1: title: Infectious and parasitic diseases_both sexes_age_under_1 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_1_4: title: Infectious and parasitic diseases_both sexes_age_1-4 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_5_9: title: Infectious and parasitic diseases_both sexes_age_5-9 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_10_14: title: Infectious and parasitic diseases_both sexes_age_10-14 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_15_19: title: Infectious and parasitic diseases_both sexes_age_15-19 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. 
- Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_20_24: title: Infectious and parasitic diseases_both sexes_age_20-24 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_25_29: title: Infectious and parasitic diseases_both sexes_age_25-29 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_30_34: title: Infectious and parasitic diseases_both sexes_age_30-34 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_35_39: title: Infectious and parasitic diseases_both sexes_age_35-39 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_40_44: title: Infectious and parasitic diseases_both sexes_age_40-44 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_45_49: title: Infectious and parasitic diseases_both sexes_age_45-49 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. 
infectious_and_parasitic_diseases_both_sexes_age_50_54: title: Infectious and parasitic diseases_both sexes_age_50-54 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_55_59: title: Infectious and parasitic diseases_both sexes_age_55-59 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_60_64: title: Infectious and parasitic diseases_both sexes_age_60-64 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_65_69: title: Infectious and parasitic diseases_both sexes_age_65-69 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_70_74: title: Infectious and parasitic diseases_both sexes_age_70-74 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_75_79: title: Infectious and parasitic diseases_both sexes_age_75-79 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*. infectious_and_parasitic_diseases_both_sexes_age_80_84: title: Infectious and parasitic diseases_both sexes_age_80-84 - description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. - Infectious and parasitic diseases have the ICD death codes 001*-139*. unit: per 100 000 + description: |- + The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. 
  infectious_and_parasitic_diseases_both_sexes_age_85_89:
    title: Infectious and parasitic diseases_both sexes_age_85-89
-   description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-     Infectious and parasitic diseases have the ICD death codes 001*-139*.
    unit: per 100 000
+   description: |-
+     The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
  infectious_and_parasitic_diseases_both_sexes_age_90_94:
    title: Infectious and parasitic diseases_both sexes_age_90-94
-   description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-     Infectious and parasitic diseases have the ICD death codes 001*-139*.
    unit: per 100 000
+   description: |-
+     The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
  infectious_and_parasitic_diseases_both_sexes_age_95_99:
    title: Infectious and parasitic diseases_both sexes_age_95-99
-   description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-     Infectious and parasitic diseases have the ICD death codes 001*-139*.
    unit: per 100 000
+   description: |-
+     The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
  infectious_and_parasitic_diseases_both_sexes_age_100_and_over:
    title: Infectious and parasitic diseases_both sexes_age_100_and_over
-   description: The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population.
-     Infectious and parasitic diseases have the ICD death codes 001*-139*.
    unit: per 100 000
+   description: |-
+     The mortality rate from infectious and parasitic diseases per 100 000 people in the whole population. Infectious and parasitic diseases have the ICD death codes 001*-139*.
  neoplasms_both_sexes_age_all_ages:
    title: Neoplasms_both sexes_age_all ages
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_under_1:
    title: Neoplasms_both sexes_age_under_1
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_1_4:
    title: Neoplasms_both sexes_age_1-4
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_5_9:
    title: Neoplasms_both sexes_age_5-9
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_10_14:
    title: Neoplasms_both sexes_age_10-14
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_15_19:
    title: Neoplasms_both sexes_age_15-19
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_20_24:
    title: Neoplasms_both sexes_age_20-24
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_25_29:
    title: Neoplasms_both sexes_age_25-29
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_30_34:
    title: Neoplasms_both sexes_age_30-34
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_35_39:
    title: Neoplasms_both sexes_age_35-39
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_40_44:
    title: Neoplasms_both sexes_age_40-44
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_45_49:
    title: Neoplasms_both sexes_age_45-49
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_50_54:
    title: Neoplasms_both sexes_age_50-54
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_55_59:
    title: Neoplasms_both sexes_age_55-59
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_60_64:
    title: Neoplasms_both sexes_age_60-64
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_65_69:
    title: Neoplasms_both sexes_age_65-69
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_70_74:
    title: Neoplasms_both sexes_age_70-74
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_75_79:
    title: Neoplasms_both sexes_age_75-79
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_80_84:
    title: Neoplasms_both sexes_age_80-84
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_85_89:
    title: Neoplasms_both sexes_age_85-89
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_90_94:
    title: Neoplasms_both sexes_age_90-94
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_95_99:
    title: Neoplasms_both sexes_age_95-99
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  neoplasms_both_sexes_age_100_and_over:
    title: Neoplasms_both sexes_age_100_and_over
+   unit: per 100 000
    description: The mortality rate from cancers per 100 000 people in the population.
      Cancers have the ICD death codes 140*-239*.
-   unit: per 100 000
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_all_ages:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_all ages
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_under_1:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_under_1
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_1_4:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_1-4
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_5_9:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_5-9
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_10_14:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_10-14
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_15_19:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_15-19
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_20_24:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_20-24
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_25_29:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_25-29
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_30_34:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_30-34
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_35_39:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_35-39
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_40_44:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_40-44
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_45_49:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_45-49
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_50_54:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_50-54
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_55_59:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_55-59
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_60_64:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_60-64
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_65_69:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_65-69
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_70_74:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_70-74
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_75_79:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_75-79
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_80_84:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_80-84
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_85_89:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_85-89
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_90_94:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_90-94
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_95_99:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_95-99
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders_both_sexes_age_100_and_over:
    title: Endocrine nutritional and metabolic diseases and immunity disorders_both sexes_age_100_and_over
-   description: The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100
-     000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD
-     death codes 240*-279*.
    unit: per 100 000
+   description: |-
+     The mortality rate from endocrine, nutritional and metabolic diseases, and immunity disorders per 100 000 people in the population. Endocrine, nutritional and metabolic diseases, and immunity disorders have the ICD death codes 240*-279*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_all_ages:
    title: Diseases of the blood and blood-forming organs_both sexes_age_all ages
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_under_1:
    title: Diseases of the blood and blood-forming organs_both sexes_age_under_1
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_1_4:
    title: Diseases of the blood and blood-forming organs_both sexes_age_1-4
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_5_9:
    title: Diseases of the blood and blood-forming organs_both sexes_age_5-9
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_10_14:
    title: Diseases of the blood and blood-forming organs_both sexes_age_10-14
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_15_19:
    title: Diseases of the blood and blood-forming organs_both sexes_age_15-19
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_20_24:
    title: Diseases of the blood and blood-forming organs_both sexes_age_20-24
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_25_29:
    title: Diseases of the blood and blood-forming organs_both sexes_age_25-29
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_30_34:
    title: Diseases of the blood and blood-forming organs_both sexes_age_30-34
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_35_39:
    title: Diseases of the blood and blood-forming organs_both sexes_age_35-39
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_40_44:
    title: Diseases of the blood and blood-forming organs_both sexes_age_40-44
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_45_49:
    title: Diseases of the blood and blood-forming organs_both sexes_age_45-49
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_50_54:
    title: Diseases of the blood and blood-forming organs_both sexes_age_50-54
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_55_59:
    title: Diseases of the blood and blood-forming organs_both sexes_age_55-59
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_60_64:
    title: Diseases of the blood and blood-forming organs_both sexes_age_60-64
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_65_69:
    title: Diseases of the blood and blood-forming organs_both sexes_age_65-69
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_70_74:
    title: Diseases of the blood and blood-forming organs_both sexes_age_70-74
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_75_79:
    title: Diseases of the blood and blood-forming organs_both sexes_age_75-79
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_80_84:
    title: Diseases of the blood and blood-forming organs_both sexes_age_80-84
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_85_89:
    title: Diseases of the blood and blood-forming organs_both sexes_age_85-89
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_90_94:
    title: Diseases of the blood and blood-forming organs_both sexes_age_90-94
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_95_99:
    title: Diseases of the blood and blood-forming organs_both sexes_age_95-99
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  diseases_of_the_blood_and_blood_forming_organs_both_sexes_age_100_and_over:
    title: Diseases of the blood and blood-forming organs_both sexes_age_100_and_over
-   description: The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population.
-     Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the blood and blood-forming organs per 100 000 people in the population. Diseases of the blood and blood-forming organs have the ICD death codes 280*-289*.
  mental_disorders_both_sexes_age_all_ages:
    title: Mental disorders_both sexes_age_all ages
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_under_1:
    title: Mental disorders_both sexes_age_under_1
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_1_4:
    title: Mental disorders_both sexes_age_1-4
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_5_9:
    title: Mental disorders_both sexes_age_5-9
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_10_14:
    title: Mental disorders_both sexes_age_10-14
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_15_19:
    title: Mental disorders_both sexes_age_15-19
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_20_24:
    title: Mental disorders_both sexes_age_20-24
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_25_29:
    title: Mental disorders_both sexes_age_25-29
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_30_34:
    title: Mental disorders_both sexes_age_30-34
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_35_39:
    title: Mental disorders_both sexes_age_35-39
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_40_44:
    title: Mental disorders_both sexes_age_40-44
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_45_49:
    title: Mental disorders_both sexes_age_45-49
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_50_54:
    title: Mental disorders_both sexes_age_50-54
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_55_59:
    title: Mental disorders_both sexes_age_55-59
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_60_64:
    title: Mental disorders_both sexes_age_60-64
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_65_69:
    title: Mental disorders_both sexes_age_65-69
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_70_74:
    title: Mental disorders_both sexes_age_70-74
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_75_79:
    title: Mental disorders_both sexes_age_75-79
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_80_84:
    title: Mental disorders_both sexes_age_80-84
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_85_89:
    title: Mental disorders_both sexes_age_85-89
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_90_94:
    title: Mental disorders_both sexes_age_90-94
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_95_99:
    title: Mental disorders_both sexes_age_95-99
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  mental_disorders_both_sexes_age_100_and_over:
    title: Mental disorders_both sexes_age_100_and_over
-   description: The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have
-     the ICD death codes 290*-319*.
    unit: per 100 000
+   description: |-
+     The mortality rate from mental illnesses per 100 000 people in the population. Mental illnesses have the ICD death codes 290*-319*.
  diseases_of_the_nervous_system_both_sexes_age_all_ages:
    title: Diseases of the nervous system_both sexes_age_all ages
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_under_1:
    title: Diseases of the nervous system_both sexes_age_under_1
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_1_4:
    title: Diseases of the nervous system_both sexes_age_1-4
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_5_9:
    title: Diseases of the nervous system_both sexes_age_5-9
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_10_14:
    title: Diseases of the nervous system_both sexes_age_10-14
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_15_19:
    title: Diseases of the nervous system_both sexes_age_15-19
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_20_24:
    title: Diseases of the nervous system_both sexes_age_20-24
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_25_29:
    title: Diseases of the nervous system_both sexes_age_25-29
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_30_34:
    title: Diseases of the nervous system_both sexes_age_30-34
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_35_39:
    title: Diseases of the nervous system_both sexes_age_35-39
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_40_44:
    title: Diseases of the nervous system_both sexes_age_40-44
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_45_49:
    title: Diseases of the nervous system_both sexes_age_45-49
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_50_54:
    title: Diseases of the nervous system_both sexes_age_50-54
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_55_59:
    title: Diseases of the nervous system_both sexes_age_55-59
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_60_64:
    title: Diseases of the nervous system_both sexes_age_60-64
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_65_69:
    title: Diseases of the nervous system_both sexes_age_65-69
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_70_74:
    title: Diseases of the nervous system_both sexes_age_70-74
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_75_79:
    title: Diseases of the nervous system_both sexes_age_75-79
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_80_84:
    title: Diseases of the nervous system_both sexes_age_80-84
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_85_89:
    title: Diseases of the nervous system_both sexes_age_85-89
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_90_94:
    title: Diseases of the nervous system_both sexes_age_90-94
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_95_99:
    title: Diseases of the nervous system_both sexes_age_95-99
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_nervous_system_both_sexes_age_100_and_over:
    title: Diseases of the nervous system_both sexes_age_100_and_over
-   description: The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases
-     of the nervous system have the ICD death code 320*-359*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the nervous system per 100 000 people in the population. Diseases of the nervous system have the ICD death code 320*-359*.
  diseases_of_the_sense_organs_both_sexes_age_all_ages:
    title: Diseases of the sense organs_both sexes_age_all ages
-   description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-     the nervous system have the ICD death code 360*-389*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the sense organs have the ICD death codes 360*-389*.
  diseases_of_the_sense_organs_both_sexes_age_under_1:
    title: Diseases of the sense organs_both sexes_age_under_1
-   description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-     the nervous system have the ICD death code 360*-389*.
    unit: per 100 000
+   description: |-
+     The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the sense organs have the ICD death codes 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_1_4:
     title: Diseases of the sense organs_both sexes_age_1-4
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_5_9:
     title: Diseases of the sense organs_both sexes_age_5-9
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_10_14:
     title: Diseases of the sense organs_both sexes_age_10-14
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_15_19:
     title: Diseases of the sense organs_both sexes_age_15-19
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_20_24:
     title: Diseases of the sense organs_both sexes_age_20-24
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_25_29:
     title: Diseases of the sense organs_both sexes_age_25-29
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_30_34:
     title: Diseases of the sense organs_both sexes_age_30-34
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_35_39:
     title: Diseases of the sense organs_both sexes_age_35-39
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_40_44:
     title: Diseases of the sense organs_both sexes_age_40-44
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_45_49:
     title: Diseases of the sense organs_both sexes_age_45-49
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_50_54:
     title: Diseases of the sense organs_both sexes_age_50-54
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_55_59:
     title: Diseases of the sense organs_both sexes_age_55-59
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_60_64:
     title: Diseases of the sense organs_both sexes_age_60-64
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_65_69:
     title: Diseases of the sense organs_both sexes_age_65-69
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_70_74:
     title: Diseases of the sense organs_both sexes_age_70-74
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_75_79:
     title: Diseases of the sense organs_both sexes_age_75-79
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_80_84:
     title: Diseases of the sense organs_both sexes_age_80-84
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_85_89:
     title: Diseases of the sense organs_both sexes_age_85-89
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_90_94:
     title: Diseases of the sense organs_both sexes_age_90-94
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_95_99:
     title: Diseases of the sense organs_both sexes_age_95-99
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_sense_organs_both_sexes_age_100_and_over:
     title: Diseases of the sense organs_both sexes_age_100_and_over
-    description: The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of
-      the nervous system have the ICD death code 360*-389*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the sense organs per 100 000 people in the population. Diseases of the nervous system have the ICD death code 360*-389*.
   diseases_of_the_circulatory_system_both_sexes_age_all_ages:
     title: Diseases of the circulatory system_both sexes_age_all ages
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_under_1:
     title: Diseases of the circulatory system_both sexes_age_under_1
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_1_4:
     title: Diseases of the circulatory system_both sexes_age_1-4
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_5_9:
     title: Diseases of the circulatory system_both sexes_age_5-9
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_10_14:
     title: Diseases of the circulatory system_both sexes_age_10-14
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_15_19:
     title: Diseases of the circulatory system_both sexes_age_15-19
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_20_24:
     title: Diseases of the circulatory system_both sexes_age_20-24
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_25_29:
     title: Diseases of the circulatory system_both sexes_age_25-29
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_30_34:
     title: Diseases of the circulatory system_both sexes_age_30-34
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_35_39:
     title: Diseases of the circulatory system_both sexes_age_35-39
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_40_44:
     title: Diseases of the circulatory system_both sexes_age_40-44
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_45_49:
     title: Diseases of the circulatory system_both sexes_age_45-49
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_50_54:
     title: Diseases of the circulatory system_both sexes_age_50-54
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_55_59:
     title: Diseases of the circulatory system_both sexes_age_55-59
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_60_64:
     title: Diseases of the circulatory system_both sexes_age_60-64
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_65_69:
     title: Diseases of the circulatory system_both sexes_age_65-69
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_70_74:
     title: Diseases of the circulatory system_both sexes_age_70-74
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_75_79:
     title: Diseases of the circulatory system_both sexes_age_75-79
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_80_84:
     title: Diseases of the circulatory system_both sexes_age_80-84
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_85_89:
     title: Diseases of the circulatory system_both sexes_age_85-89
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_90_94:
     title: Diseases of the circulatory system_both sexes_age_90-94
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_95_99:
     title: Diseases of the circulatory system_both sexes_age_95-99
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_circulatory_system_both_sexes_age_100_and_over:
     title: Diseases of the circulatory system_both sexes_age_100_and_over
-    description: The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases
-      of the circulatory system have the ICD death codes 390*-459*
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the circulatory system per 100 000 people in the population. Diseases of the circulatory system have the ICD death codes 390*-459*
   diseases_of_the_respiratory_system_both_sexes_age_all_ages:
     title: Diseases of the respiratory system_both sexes_age_all ages
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_under_1:
     title: Diseases of the respiratory system_both sexes_age_under_1
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_1_4:
     title: Diseases of the respiratory system_both sexes_age_1-4
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_5_9:
     title: Diseases of the respiratory system_both sexes_age_5-9
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_10_14:
     title: Diseases of the respiratory system_both sexes_age_10-14
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_15_19:
     title: Diseases of the respiratory system_both sexes_age_15-19
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_20_24:
     title: Diseases of the respiratory system_both sexes_age_20-24
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_25_29:
     title: Diseases of the respiratory system_both sexes_age_25-29
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_30_34:
     title: Diseases of the respiratory system_both sexes_age_30-34
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_35_39:
     title: Diseases of the respiratory system_both sexes_age_35-39
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_40_44:
     title: Diseases of the respiratory system_both sexes_age_40-44
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_45_49:
     title: Diseases of the respiratory system_both sexes_age_45-49
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_50_54:
     title: Diseases of the respiratory system_both sexes_age_50-54
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_55_59:
     title: Diseases of the respiratory system_both sexes_age_55-59
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_60_64:
     title: Diseases of the respiratory system_both sexes_age_60-64
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_65_69:
     title: Diseases of the respiratory system_both sexes_age_65-69
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_70_74:
     title: Diseases of the respiratory system_both sexes_age_70-74
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_75_79:
     title: Diseases of the respiratory system_both sexes_age_75-79
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_80_84:
     title: Diseases of the respiratory system_both sexes_age_80-84
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_85_89:
     title: Diseases of the respiratory system_both sexes_age_85-89
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_90_94:
     title: Diseases of the respiratory system_both sexes_age_90-94
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_95_99:
     title: Diseases of the respiratory system_both sexes_age_95-99
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_respiratory_system_both_sexes_age_100_and_over:
     title: Diseases of the respiratory system_both sexes_age_100_and_over
-    description: The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases
-      of the respiratory system have the ICD death codes 460*-519*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the respiratory system per 100 000 people in the population. Diseases of the respiratory system have the ICD death codes 460*-519*.
   diseases_of_the_digestive_system_both_sexes_age_all_ages:
     title: Diseases of the digestive system_both sexes_age_all ages
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_under_1:
     title: Diseases of the digestive system_both sexes_age_under_1
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_1_4:
     title: Diseases of the digestive system_both sexes_age_1-4
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_5_9:
     title: Diseases of the digestive system_both sexes_age_5-9
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_10_14:
     title: Diseases of the digestive system_both sexes_age_10-14
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_15_19:
     title: Diseases of the digestive system_both sexes_age_15-19
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_20_24:
     title: Diseases of the digestive system_both sexes_age_20-24
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_25_29:
     title: Diseases of the digestive system_both sexes_age_25-29
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_30_34:
     title: Diseases of the digestive system_both sexes_age_30-34
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_35_39:
     title: Diseases of the digestive system_both sexes_age_35-39
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_40_44:
     title: Diseases of the digestive system_both sexes_age_40-44
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_45_49:
     title: Diseases of the digestive system_both sexes_age_45-49
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_50_54:
     title: Diseases of the digestive system_both sexes_age_50-54
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_55_59:
     title: Diseases of the digestive system_both sexes_age_55-59
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_60_64:
     title: Diseases of the digestive system_both sexes_age_60-64
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_65_69:
     title: Diseases of the digestive system_both sexes_age_65-69
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_70_74:
     title: Diseases of the digestive system_both sexes_age_70-74
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_75_79:
     title: Diseases of the digestive system_both sexes_age_75-79
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_80_84:
     title: Diseases of the digestive system_both sexes_age_80-84
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_85_89:
     title: Diseases of the digestive system_both sexes_age_85-89
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_90_94:
     title: Diseases of the digestive system_both sexes_age_90-94
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_95_99:
     title: Diseases of the digestive system_both sexes_age_95-99
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_digestive_system_both_sexes_age_100_and_over:
     title: Diseases of the digestive system_both sexes_age_100_and_over
-    description: The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases
-      of the digestive system have the ICD death codes 520*-579*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the digestive system per 100 000 people in the population. Diseases of the digestive system have the ICD death codes 520*-579*.
   diseases_of_the_genitourinary_system_both_sexes_age_all_ages:
     title: Diseases of the genitourinary system_both sexes_age_all ages
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_under_1:
     title: Diseases of the genitourinary system_both sexes_age_under_1
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_1_4:
     title: Diseases of the genitourinary system_both sexes_age_1-4
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_5_9:
     title: Diseases of the genitourinary system_both sexes_age_5-9
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_10_14:
     title: Diseases of the genitourinary system_both sexes_age_10-14
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_15_19:
     title: Diseases of the genitourinary system_both sexes_age_15-19
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_20_24:
     title: Diseases of the genitourinary system_both sexes_age_20-24
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_25_29:
     title: Diseases of the genitourinary system_both sexes_age_25-29
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_30_34:
     title: Diseases of the genitourinary system_both sexes_age_30-34
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_35_39:
     title: Diseases of the genitourinary system_both sexes_age_35-39
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_40_44:
     title: Diseases of the genitourinary system_both sexes_age_40-44
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_45_49:
     title: Diseases of the genitourinary system_both sexes_age_45-49
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_50_54:
     title: Diseases of the genitourinary system_both sexes_age_50-54
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_55_59:
     title: Diseases of the genitourinary system_both sexes_age_55-59
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_60_64:
     title: Diseases of the genitourinary system_both sexes_age_60-64
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_65_69:
     title: Diseases of the genitourinary system_both sexes_age_65-69
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_70_74:
     title: Diseases of the genitourinary system_both sexes_age_70-74
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_75_79:
     title: Diseases of the genitourinary system_both sexes_age_75-79
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_80_84:
     title: Diseases of the genitourinary system_both sexes_age_80-84
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_85_89:
     title: Diseases of the genitourinary system_both sexes_age_85-89
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_90_94:
     title: Diseases of the genitourinary system_both sexes_age_90-94
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_95_99:
     title: Diseases of the genitourinary system_both sexes_age_95-99
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   diseases_of_the_genitourinary_system_both_sexes_age_100_and_over:
     title: Diseases of the genitourinary system_both sexes_age_100_and_over
-    description: The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases
-      of the genitourinary system have the ICD death codes 580*-629*.
     unit: per 100 000
+    description: |-
+      The mortality rate from diseases of the genitourinary system per 100 000 people in the population. Diseases of the genitourinary system have the ICD death codes 580*-629*.
   complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_all_ages:
     title: Complications of pregnancy childbirth and the puerperium_both sexes_age_all ages
-    description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-      in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     unit: per 100 000
+    description: |-
+      The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
   complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_under_1:
     title: Complications of pregnancy childbirth and the puerperium_both sexes_age_under_1
-    description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people
-      in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
     unit: per 100 000
+    description: |-
+      The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*.
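Editor's note: every hunk in this metadata file makes the same mechanical change, rewrapping each variable's wrapped `description:` scalar into a single-line `|-` block scalar placed after `unit:`. A rewrite of this shape is normally generated by a small script rather than by hand. The sketch below is illustrative only (the function name and file layout are assumptions, not the tool actually used in this PR); it assumes ruamel.yaml and the `tables:` / `variables:` structure these files follow.

# Hypothetical helper, not part of this diff: collapse wrapped descriptions
# into single-line `|-` block scalars while preserving the rest of the file.
from ruamel.yaml import YAML
from ruamel.yaml.scalarstring import LiteralScalarString

yaml = YAML()
yaml.width = 999  # keep each description on one line when dumping

def unwrap_descriptions(path: str) -> None:
    with open(path) as f:
        meta = yaml.load(f)
    for table in meta.get("tables", {}).values():
        for variable in table.get("variables", {}).values():
            desc = variable.get("description")
            if desc is not None:
                # Join the folded line breaks, then force a literal block scalar,
                # which ruamel.yaml emits as `description: |-`.
                variable["description"] = LiteralScalarString(" ".join(desc.split()))
    with open(path, "w") as f:
        yaml.dump(meta, f)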
complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_1_4: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_1-4 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_5_9: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_5-9 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_10_14: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_10-14 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_15_19: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_15-19 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_20_24: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_20-24 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_25_29: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_25-29 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. 
Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_30_34: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_30-34 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_35_39: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_35-39 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_40_44: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_40-44 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_45_49: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_45-49 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_50_54: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_50-54 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. 
Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_55_59: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_55-59 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_60_64: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_60-64 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_65_69: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_65-69 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_70_74: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_70-74 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_75_79: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_75-79 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. 
complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_80_84: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_80-84 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_85_89: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_85-89 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_90_94: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_90-94 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_95_99: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_95-99 - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. complications_of_pregnancy_childbirth_and_the_puerperium_both_sexes_age_100_and_over: title: Complications of pregnancy childbirth and the puerperium_both sexes_age_100_and_over - description: The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people - in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. unit: per 100 000 + description: |- + The mortality rate from complications of pregnancy, childbirth and the puerperium per 100 000 people in the population. Complications of pregnancy, childbirth and the puerperium have the ICD death codes 630*-679*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_all_ages: title: Diseases of the skin and subcutaneous tissue_both sexes_age_all ages - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. 
- Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_under_1: title: Diseases of the skin and subcutaneous tissue_both sexes_age_under_1 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_1_4: title: Diseases of the skin and subcutaneous tissue_both sexes_age_1-4 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_5_9: title: Diseases of the skin and subcutaneous tissue_both sexes_age_5-9 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_10_14: title: Diseases of the skin and subcutaneous tissue_both sexes_age_10-14 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_15_19: title: Diseases of the skin and subcutaneous tissue_both sexes_age_15-19 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_20_24: title: Diseases of the skin and subcutaneous tissue_both sexes_age_20-24 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. 
unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_25_29: title: Diseases of the skin and subcutaneous tissue_both sexes_age_25-29 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_30_34: title: Diseases of the skin and subcutaneous tissue_both sexes_age_30-34 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_35_39: title: Diseases of the skin and subcutaneous tissue_both sexes_age_35-39 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_40_44: title: Diseases of the skin and subcutaneous tissue_both sexes_age_40-44 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_45_49: title: Diseases of the skin and subcutaneous tissue_both sexes_age_45-49 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_50_54: title: Diseases of the skin and subcutaneous tissue_both sexes_age_50-54 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. 
unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_55_59: title: Diseases of the skin and subcutaneous tissue_both sexes_age_55-59 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_60_64: title: Diseases of the skin and subcutaneous tissue_both sexes_age_60-64 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_65_69: title: Diseases of the skin and subcutaneous tissue_both sexes_age_65-69 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_70_74: title: Diseases of the skin and subcutaneous tissue_both sexes_age_70-74 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_75_79: title: Diseases of the skin and subcutaneous tissue_both sexes_age_75-79 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_80_84: title: Diseases of the skin and subcutaneous tissue_both sexes_age_80-84 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. 
unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_85_89: title: Diseases of the skin and subcutaneous tissue_both sexes_age_85-89 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_90_94: title: Diseases of the skin and subcutaneous tissue_both sexes_age_90-94 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_95_99: title: Diseases of the skin and subcutaneous tissue_both sexes_age_95-99 - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_skin_and_subcutaneous_tissue_both_sexes_age_100_and_over: title: Diseases of the skin and subcutaneous tissue_both sexes_age_100_and_over - description: The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. - Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. unit: per 100 000 + description: |- + The mortality rate from diseases of the skin and subcutaneous tissue per 100 000 people in the population. Diseases of the skin and subcutaneous tissue have the ICD death codes 680*-709*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_all_ages: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_all ages - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_under_1: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_under_1 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. 
Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_1_4: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_1-4 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_5_9: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_5-9 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_10_14: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_10-14 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_15_19: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_15-19 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_20_24: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_20-24 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. 
unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_25_29: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_25-29 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_30_34: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_30-34 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_35_39: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_35-39 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_40_44: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_40-44 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_45_49: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_45-49 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. 
Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_50_54: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_50-54 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_55_59: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_55-59 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_60_64: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_60-64 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_65_69: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_65-69 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_70_74: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_70-74 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. 
diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_75_79: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_75-79 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_80_84: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_80-84 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_85_89: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_85-89 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_90_94: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_90-94 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_95_99: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_95-99 - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. 
diseases_of_the_musculoskeletal_system_and_connective_tissue_both_sexes_age_100_and_over: title: Diseases of the musculoskeletal system and connective tissue_both sexes_age_100_and_over - description: The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people - in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. unit: per 100 000 + description: |- + The mortality rate from diseases of the musculoskeletal system and connective tissue per 100 000 people in the population. Diseases of the musculoskeletal system and connective tissue have the ICD death codes 710*-739*. congenital_anomalies_both_sexes_age_all_ages: title: Congenital anomalies_both sexes_age_all ages - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_under_1: title: Congenital anomalies_both sexes_age_under_1 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_1_4: title: Congenital anomalies_both sexes_age_1-4 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_5_9: title: Congenital anomalies_both sexes_age_5-9 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_10_14: title: Congenital anomalies_both sexes_age_10-14 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_15_19: title: Congenital anomalies_both sexes_age_15-19 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_20_24: title: Congenital anomalies_both sexes_age_20-24 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. 
unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_25_29: title: Congenital anomalies_both sexes_age_25-29 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_30_34: title: Congenital anomalies_both sexes_age_30-34 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_35_39: title: Congenital anomalies_both sexes_age_35-39 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_40_44: title: Congenital anomalies_both sexes_age_40-44 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_45_49: title: Congenital anomalies_both sexes_age_45-49 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_50_54: title: Congenital anomalies_both sexes_age_50-54 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_55_59: title: Congenital anomalies_both sexes_age_55-59 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_60_64: title: Congenital anomalies_both sexes_age_60-64 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. 
Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_65_69: title: Congenital anomalies_both sexes_age_65-69 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_70_74: title: Congenital anomalies_both sexes_age_70-74 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_75_79: title: Congenital anomalies_both sexes_age_75-79 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_80_84: title: Congenital anomalies_both sexes_age_80-84 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_85_89: title: Congenital anomalies_both sexes_age_85-89 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_90_94: title: Congenital anomalies_both sexes_age_90-94 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_95_99: title: Congenital anomalies_both sexes_age_95-99 - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. congenital_anomalies_both_sexes_age_100_and_over: title: Congenital anomalies_both sexes_age_100_and_over - description: The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies - have the ICD death codes 740*-759*. unit: per 100 000 + description: |- + The mortality rate from congenital anomalies per 100 000 people in the population. Congenital anomalies have the ICD death codes 740*-759*. 
certain_conditions_originating_in_the_perinatal_period_both_sexes_age_all_ages: title: Certain conditions originating in the perinatal period_both sexes_age_all ages - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_under_1: title: Certain conditions originating in the perinatal period_both sexes_age_under_1 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_1_4: title: Certain conditions originating in the perinatal period_both sexes_age_1-4 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_5_9: title: Certain conditions originating in the perinatal period_both sexes_age_5-9 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_10_14: title: Certain conditions originating in the perinatal period_both sexes_age_10-14 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_15_19: title: Certain conditions originating in the perinatal period_both sexes_age_15-19 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. 
unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_20_24: title: Certain conditions originating in the perinatal period_both sexes_age_20-24 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_25_29: title: Certain conditions originating in the perinatal period_both sexes_age_25-29 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_30_34: title: Certain conditions originating in the perinatal period_both sexes_age_30-34 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_35_39: title: Certain conditions originating in the perinatal period_both sexes_age_35-39 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_40_44: title: Certain conditions originating in the perinatal period_both sexes_age_40-44 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. 
certain_conditions_originating_in_the_perinatal_period_both_sexes_age_45_49: title: Certain conditions originating in the perinatal period_both sexes_age_45-49 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_50_54: title: Certain conditions originating in the perinatal period_both sexes_age_50-54 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_55_59: title: Certain conditions originating in the perinatal period_both sexes_age_55-59 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_60_64: title: Certain conditions originating in the perinatal period_both sexes_age_60-64 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_65_69: title: Certain conditions originating in the perinatal period_both sexes_age_65-69 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_70_74: title: Certain conditions originating in the perinatal period_both sexes_age_70-74 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. 
unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_75_79: title: Certain conditions originating in the perinatal period_both sexes_age_75-79 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_80_84: title: Certain conditions originating in the perinatal period_both sexes_age_80-84 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_85_89: title: Certain conditions originating in the perinatal period_both sexes_age_85-89 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_90_94: title: Certain conditions originating in the perinatal period_both sexes_age_90-94 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. certain_conditions_originating_in_the_perinatal_period_both_sexes_age_95_99: title: Certain conditions originating in the perinatal period_both sexes_age_95-99 - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. 
certain_conditions_originating_in_the_perinatal_period_both_sexes_age_100_and_over: title: Certain conditions originating in the perinatal period_both sexes_age_100_and_over - description: The mortality rate from certain conditions originating in the perinatal period per 100 000 people in - the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. unit: per 100 000 + description: |- + The mortality rate from certain conditions originating in the perinatal period per 100 000 people in the population. Certain conditions originating in the perinatal period have the ICD death codes 760*-779*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_all_ages: title: Symptoms signs and ill-defined conditions_both sexes_age_all ages - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_under_1: title: Symptoms signs and ill-defined conditions_both sexes_age_under_1 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_1_4: title: Symptoms signs and ill-defined conditions_both sexes_age_1-4 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_5_9: title: Symptoms signs and ill-defined conditions_both sexes_age_5-9 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_10_14: title: Symptoms signs and ill-defined conditions_both sexes_age_10-14 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. 
symptoms_signs_and_ill_defined_conditions_both_sexes_age_15_19: title: Symptoms signs and ill-defined conditions_both sexes_age_15-19 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_20_24: title: Symptoms signs and ill-defined conditions_both sexes_age_20-24 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_25_29: title: Symptoms signs and ill-defined conditions_both sexes_age_25-29 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_30_34: title: Symptoms signs and ill-defined conditions_both sexes_age_30-34 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_35_39: title: Symptoms signs and ill-defined conditions_both sexes_age_35-39 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_40_44: title: Symptoms signs and ill-defined conditions_both sexes_age_40-44 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. 
symptoms_signs_and_ill_defined_conditions_both_sexes_age_45_49: title: Symptoms signs and ill-defined conditions_both sexes_age_45-49 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_50_54: title: Symptoms signs and ill-defined conditions_both sexes_age_50-54 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_55_59: title: Symptoms signs and ill-defined conditions_both sexes_age_55-59 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_60_64: title: Symptoms signs and ill-defined conditions_both sexes_age_60-64 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_65_69: title: Symptoms signs and ill-defined conditions_both sexes_age_65-69 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_70_74: title: Symptoms signs and ill-defined conditions_both sexes_age_70-74 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. 
symptoms_signs_and_ill_defined_conditions_both_sexes_age_75_79: title: Symptoms signs and ill-defined conditions_both sexes_age_75-79 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_80_84: title: Symptoms signs and ill-defined conditions_both sexes_age_80-84 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_85_89: title: Symptoms signs and ill-defined conditions_both sexes_age_85-89 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_90_94: title: Symptoms signs and ill-defined conditions_both sexes_age_90-94 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_95_99: title: Symptoms signs and ill-defined conditions_both sexes_age_95-99 - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. symptoms_signs_and_ill_defined_conditions_both_sexes_age_100_and_over: title: Symptoms signs and ill-defined conditions_both sexes_age_100_and_over - description: The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. - Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. unit: per 100 000 + description: |- + The mortality rate from symptoms, signs and ill-defined conditions per 100 000 people in the population. Symptoms, signs and ill-defined conditions have the ICD death codes 780*-799*. injury_and_poisoning_both_sexes_age_all_ages: title: Injury and poisoning_both sexes_age_all ages - description: The mortality rate from injury and poisoning per 100 000 people in the population. 
Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_under_1: title: Injury and poisoning_both sexes_age_under_1 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_1_4: title: Injury and poisoning_both sexes_age_1-4 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_5_9: title: Injury and poisoning_both sexes_age_5-9 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_10_14: title: Injury and poisoning_both sexes_age_10-14 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_15_19: title: Injury and poisoning_both sexes_age_15-19 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_20_24: title: Injury and poisoning_both sexes_age_20-24 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_25_29: title: Injury and poisoning_both sexes_age_25-29 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_30_34: title: Injury and poisoning_both sexes_age_30-34 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. 
unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_35_39: title: Injury and poisoning_both sexes_age_35-39 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_40_44: title: Injury and poisoning_both sexes_age_40-44 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_45_49: title: Injury and poisoning_both sexes_age_45-49 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_50_54: title: Injury and poisoning_both sexes_age_50-54 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_55_59: title: Injury and poisoning_both sexes_age_55-59 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_60_64: title: Injury and poisoning_both sexes_age_60-64 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_65_69: title: Injury and poisoning_both sexes_age_65-69 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_70_74: title: Injury and poisoning_both sexes_age_70-74 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. 
Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_75_79: title: Injury and poisoning_both sexes_age_75-79 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_80_84: title: Injury and poisoning_both sexes_age_80-84 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_85_89: title: Injury and poisoning_both sexes_age_85-89 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_90_94: title: Injury and poisoning_both sexes_age_90-94 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_95_99: title: Injury and poisoning_both sexes_age_95-99 - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. injury_and_poisoning_both_sexes_age_100_and_over: title: Injury and poisoning_both sexes_age_100_and_over - description: The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning - have the ICD death codes 800*-999*. unit: per 100 000 + description: |- + The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*. 
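Note: the metadata rewrite above converts every wrapped plain-scalar description into a YAML `|-` literal block scalar. Both forms load to the same string, because YAML folds the line break inside a plain scalar into a single space; the block form simply makes the single-line layout explicit. A minimal check with PyYAML (illustrative only, not part of the patch; assumes PyYAML is installed):

import yaml

plain = """\
description: The mortality rate from injury and poisoning per 100 000 people in
  the population. Injury and poisoning have the ICD death codes 800*-999*.
"""

block = """\
description: |-
  The mortality rate from injury and poisoning per 100 000 people in the population. Injury and poisoning have the ICD death codes 800*-999*.
"""

# Plain scalars fold the wrapped line break into one space, so both forms
# parse to the identical string; only the on-disk layout differs.
assert yaml.safe_load(plain) == yaml.safe_load(block)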
diff --git a/etl/steps/data/grapher/fasttrack/latest/historical_france_mortality_cause.py b/etl/steps/data/grapher/fasttrack/latest/historical_france_mortality_cause.py index b54066c948a..6f8672c6cb1 100644 --- a/etl/steps/data/grapher/fasttrack/latest/historical_france_mortality_cause.py +++ b/etl/steps/data/grapher/fasttrack/latest/historical_france_mortality_cause.py @@ -1,10 +1,9 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot -P = PathFinder(__file__) +paths = PathFinder(__file__) def run(dest_dir: str) -> None: @@ -12,11 +11,30 @@ def run(dest_dir: str) -> None: snap = Snapshot("fasttrack/latest/historical_france_mortality_cause.csv") # load data - data = pd.read_csv(snap.path) + tb = snap.read_csv() - # create empty dataframe and table - tb = catalog.Table(data, short_name=P.short_name) + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/latest/infant_mortality_vaccination_shattock.py b/etl/steps/data/grapher/fasttrack/latest/infant_mortality_vaccination_shattock.py index 839a3b24e99..ca5755007a5 100644 --- a/etl/steps/data/grapher/fasttrack/latest/infant_mortality_vaccination_shattock.py +++ b/etl/steps/data/grapher/fasttrack/latest/infant_mortality_vaccination_shattock.py @@ -1,3 +1,5 @@ +import pandas as pd + from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot @@ -11,8 +13,20 @@ def run(dest_dir: str) -> None: # load data tb = snap.read_csv() + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) + # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) # override metadata if necessary meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") @@ -20,3 +34,7 @@ def run(dest_dir: str) -> None: ds.update_metadata(meta_path) ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git 
a/etl/steps/data/grapher/fasttrack/latest/lead_paint_regulation_who.meta.yml b/etl/steps/data/grapher/fasttrack/latest/lead_paint_regulation_who.meta.yml index b84e3a73d99..9467e2b5fae 100644 --- a/etl/steps/data/grapher/fasttrack/latest/lead_paint_regulation_who.meta.yml +++ b/etl/steps/data/grapher/fasttrack/latest/lead_paint_regulation_who.meta.yml @@ -1,28 +1,16 @@ dataset: - namespace: fasttrack - version: latest - short_name: lead_paint_regulation_who title: Lead paint regulations (WHO, 2023) description: |- The WHO collects data on which countries have legally-binding controls on lead paint. It sources this data from surveys conducted by WHO and UNEP of national authorities. The World Health Organization (WHO) tracks the introduction of legally-binding controls on lead concentrations in paint. Paint is a main contributor to harmful lead exposure. The stringency of controls on lead paint can vary by country. Maximum concentrations of lead can differ, and may only apply to particular types of paint (for example, products used in households). - sources: - - name: Lead paint regulations (WHO, 2023) - published_by: World Health Organization (WHO) - publication_year: 2023 - url: https://www.who.int/data/gho/data/themes/topics/indicator-groups/legally-binding-controls-on-lead-paint + licenses: + - {} tables: lead_paint_regulation_who: variables: lead_paint_regulation: title: lead_paint_regulation unit: '' - description: Notes whether each country has adopted legally-binding laws, - regulations, standards and/or procedures to control the production, import, - export, sale and use of lead paints. - sources: - - name: Lead paint regulations (WHO, 2023) - published_by: World Health Organization (WHO) - publication_year: 2023 - url: https://www.who.int/data/gho/data/themes/topics/indicator-groups/legally-binding-controls-on-lead-paint + description: |- + Notes whether each country has adopted legally-binding laws, regulations, standards and/or procedures to control the production, import, export, sale and use of lead paints. 
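Note: each migrated fasttrack step above (and the ones that follow) gains the same `uses_dates` helper to decide whether the snapshot's `year` column actually holds ISO dates, in which case the table is indexed by ("country", "date") instead of ("country", "year"). A quick demonstration of the heuristic, with the function copied verbatim from the patch:

import pandas as pd


def uses_dates(s: pd.Series) -> bool:
    return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all()


# ISO-formatted strings all parse, so the step switches to a date index...
print(uses_dates(pd.Series(["2021-01-01", "2021-06-15"])))  # True
# ...while integer years fail the %Y-%m-%d format, coerce to NaT, and the
# usual ("country", "year") index is kept.
print(uses_dates(pd.Series([1990, 2000])))  # False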
diff --git a/etl/steps/data/grapher/fasttrack/latest/lead_paint_regulation_who.py b/etl/steps/data/grapher/fasttrack/latest/lead_paint_regulation_who.py index 1f0c2a1a5e8..c44f9d8ac86 100644 --- a/etl/steps/data/grapher/fasttrack/latest/lead_paint_regulation_who.py +++ b/etl/steps/data/grapher/fasttrack/latest/lead_paint_regulation_who.py @@ -1,19 +1,40 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot -P = PathFinder(__file__) +paths = PathFinder(__file__) def run(dest_dir: str) -> None: # load snapshot - data = pd.read_csv(Snapshot("fasttrack/latest/lead_paint_regulation_who.csv").path) + snap = Snapshot("fasttrack/latest/lead_paint_regulation_who.csv") - # create empty dataframe and table - tb = catalog.Table(data, short_name=P.short_name) + # load data + tb = snap.read_csv() + + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb]) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/latest/lives_saved_vaccination_who.py b/etl/steps/data/grapher/fasttrack/latest/lives_saved_vaccination_who.py index 17ec65f21ba..3fcf6026ee9 100644 --- a/etl/steps/data/grapher/fasttrack/latest/lives_saved_vaccination_who.py +++ b/etl/steps/data/grapher/fasttrack/latest/lives_saved_vaccination_who.py @@ -1,3 +1,5 @@ +import pandas as pd + from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot @@ -11,8 +13,20 @@ def run(dest_dir: str) -> None: # load data tb = snap.read_csv() + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) + # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) # override metadata if necessary meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") @@ -20,3 +34,7 @@ def run(dest_dir: str) -> None: ds.update_metadata(meta_path) ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/latest/mineral_prices_usgs.meta.yml b/etl/steps/data/grapher/fasttrack/latest/mineral_prices_usgs.meta.yml new 
file mode 100644 index 00000000000..b09781d919b --- /dev/null +++ b/etl/steps/data/grapher/fasttrack/latest/mineral_prices_usgs.meta.yml @@ -0,0 +1,68 @@ +dataset: + title: Mineral prices (USGS, 2024) + description: '' + licenses: + - {} +tables: + mineral_prices_usgs: + variables: + copper_prices: + title: Copper + unit: $ per tonne + short_unit: $ + description: Material prices are measured in 1998 dollars per tonne, which adjusts for inflation. + chromium_prices: + title: Chromium + unit: $ per tonne + short_unit: $ + description: Material prices are measured in 1998 dollars per tonne, which adjusts for inflation. + tin_prices: + title: Tin + unit: $ per tonne + short_unit: $ + description: Material prices are measured in 1998 dollars per tonne, which adjusts for inflation. + nickel_prices: + title: Nickel + unit: $ per tonne + short_unit: $ + description: Material prices are measured in 1998 dollars per tonne, which adjusts for inflation. + tungsten_prices: + title: Tungsten + unit: $ per tonne + short_unit: $ + description: Material prices are measured in 1998 dollars per tonne, which adjusts for inflation. + lithium_prices: + title: Lithium + unit: $ per tonne + short_unit: $ + description: Material prices are measured in 1998 dollars per tonne, which adjusts for inflation. + cobalt_prices: + title: Cobalt + unit: $ per tonne + short_unit: $ + description: Material prices are measured in 1998 dollars per tonne, which adjusts for inflation. + rare_earth_prices: + title: Rare earths + unit: $ per tonne + short_unit: $ + description: Material prices are measured in 1998 dollars per tonne, which adjusts for inflation. + silver_prices: + title: Silver + unit: $ per tonne + short_unit: $ + description: Material prices are measured in 1998 dollars per tonne, which adjusts for inflation. + aluminum_prices: + title: Aluminum + unit: $ per tonne + short_unit: $ + description: Material prices are measured in 1998 dollars per tonne, which adjusts for inflation. + silicon_prices: + title: Silicon + unit: $ per tonne + short_unit: $ + description: Material prices are measured in 1998 dollars per tonne, which adjusts for inflation. + graphite_prices: + title: Graphite + unit: $ per tonne + short_unit: $ + description: Material prices are measured in 1998 dollars per tonne, which adjusts for inflation. 
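Note: the new mineral_prices_usgs.meta.yml repeats an identical unit/short_unit/description block for twelve commodities. Fasttrack generates the file, so the duplication is harmless, but the same structure could be produced programmatically. A hypothetical sketch (the MINERALS list and the title handling are illustrative assumptions, not part of the PR):

import yaml  # assumes PyYAML is installed

MINERALS = ["copper", "chromium", "tin", "nickel", "tungsten", "lithium",
            "cobalt", "rare_earth", "silver", "aluminum", "silicon", "graphite"]
DESCRIPTION = "Material prices are measured in 1998 dollars per tonne, which adjusts for inflation."

# The real file hand-tunes a couple of titles ("Rare earths"); a plain
# capitalize() is close enough for this sketch.
variables = {
    f"{m}_prices": {
        "title": m.replace("_", " ").capitalize(),
        "unit": "$ per tonne",
        "short_unit": "$",
        "description": DESCRIPTION,
    }
    for m in MINERALS
}
meta = {
    "dataset": {"title": "Mineral prices (USGS, 2024)", "description": "", "licenses": [{}]},
    "tables": {"mineral_prices_usgs": {"variables": variables}},
}
print(yaml.safe_dump(meta, sort_keys=False))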
diff --git a/etl/steps/data/grapher/fasttrack/latest/mineral_prices_usgs.py b/etl/steps/data/grapher/fasttrack/latest/mineral_prices_usgs.py new file mode 100644 index 00000000000..c5eeef66fb9 --- /dev/null +++ b/etl/steps/data/grapher/fasttrack/latest/mineral_prices_usgs.py @@ -0,0 +1,40 @@ +import pandas as pd + +from etl.helpers import PathFinder, create_dataset, get_metadata_path +from etl.snapshot import Snapshot + +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # load snapshot + snap = Snapshot("fasttrack/latest/mineral_prices_usgs.csv") + + # load data + tb = snap.read_csv() + + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) + + # add table, update metadata from *.meta.yml and save + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/latest/nuclear_warhead_inventories.meta.yml b/etl/steps/data/grapher/fasttrack/latest/nuclear_warhead_inventories.meta.yml new file mode 100644 index 00000000000..182af2197ab --- /dev/null +++ b/etl/steps/data/grapher/fasttrack/latest/nuclear_warhead_inventories.meta.yml @@ -0,0 +1,46 @@ +dataset: + title: Nuclear warhead inventories – Federation of American Scientists + description: |- + This dataset provides information on the nuclear warhead inventories by the nuclear powers, using data from the Federation of American Scientists, prepared by Hans M. Kristensen, Matt Korda, and Robert Norris. + + You can download the code and complete dataset, including supplementary variables, from GitHub: https://github.com/owid/notebooks/tree/main/BastianHerre/nuclear_weapons + Publisher source: None + licenses: + - {} +tables: + nuclear_warhead_inventories: + variables: + nuclear_weapons_depl_nonstrat: + title: nuclear_weapons_depl_nonstrat + unit: '' + description: |- + The variable denotes the estimated number of deployed nonstrategic nuclear warheads. + + Deployed warheads are those on ballistic missiles or bomber bases. + + Nonstrategic or tactical warheads are those for use on the battlefield. + nuclear_weapons_depl_strat: + title: nuclear_weapons_depl_strat + unit: '' + description: |- + The variable denotes the estimated number of deployed strategic nuclear warheads. + + Deployed warheads are those on ballistic missiles or bomber bases. + + Strategic warheads are those for use away from the battlefield, such as against military bases, arms industries, or infrastructure. + nuclear_weapons_inventory: + title: nuclear_weapons_inventory + unit: '' + description: |- + The variable denotes the estimated number of all nuclear warheads, be they deployed strategic, deployed nonstrategic, nondeployed, or retired. + nuclear_weapons_reserve_nondepl: + title: nuclear_weapons_reserve_nondepl + unit: '' + description: |- + The variable denotes the estimated number of nondeployed nuclear warheads. 
+ + Nondeployed or reserve warheads are those not on ballistic missiles or bomber bases. + nuclear_weapons_retired: + title: nuclear_weapons_retired + unit: '' + description: The variable denotes the estimated number of retired nuclear warheads queued for dismantlement. diff --git a/etl/steps/data/grapher/fasttrack/latest/nuclear_warhead_inventories.py b/etl/steps/data/grapher/fasttrack/latest/nuclear_warhead_inventories.py new file mode 100644 index 00000000000..54f9902ad56 --- /dev/null +++ b/etl/steps/data/grapher/fasttrack/latest/nuclear_warhead_inventories.py @@ -0,0 +1,40 @@ +import pandas as pd + +from etl.helpers import PathFinder, create_dataset, get_metadata_path +from etl.snapshot import Snapshot + +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # load snapshot + snap = Snapshot("fasttrack/latest/nuclear_warhead_inventories.csv") + + # load data + tb = snap.read_csv() + + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) + + # add table, update metadata from *.meta.yml and save + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/latest/nuclear_warhead_stockpiles.meta.yml b/etl/steps/data/grapher/fasttrack/latest/nuclear_warhead_stockpiles.meta.yml new file mode 100644 index 00000000000..671f7b2dd74 --- /dev/null +++ b/etl/steps/data/grapher/fasttrack/latest/nuclear_warhead_stockpiles.meta.yml @@ -0,0 +1,21 @@ +dataset: + title: Nuclear warhead stockpiles – Federation of American Scientists + description: |- + This dataset provides information on the number of stockpiled nuclear warheads by the nuclear powers, using data from the Federation of American Scientists, prepared by Hans M. Kristensen, Matt Korda, Eliana Reynolds, and Robert Norris. + + You can download the code and complete dataset, including supplementary variables, from GitHub: https://github.com/owid/notebooks/tree/main/BastianHerre/nuclear_weapons + Publisher source: None + licenses: + - {} +tables: + nuclear_warhead_stockpiles: + variables: + nuclear_weapons_stockpile: + title: nuclear_weapons_stockpile + unit: '' + description: |- + The variable denotes the estimated number of nuclear warheads in the stockpiles of the nuclear powers. + + Stockpiles include warheads assigned to military forces, but exclude retired warheads queued for dismantlement. + + Retired warheads are only included in the global total. 
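Note: every generated step in this patch inlines the same block of logic: rename dim_-prefixed columns, pick a date or year index via `uses_dates`, call `create_dataset`, and apply an optional *.override.yml. The steps are code-generated, so the repetition is expected; for reference, the shared logic amounts to one helper. A sketch, assuming (as the patch does) that `snap.read_csv()` returns a table supporting `.rename(columns=...)` and `.format(index_columns)`:

import pandas as pd

from etl.helpers import create_dataset, get_metadata_path


def uses_dates(s: pd.Series) -> bool:
    return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all()


def save_fasttrack_dataset(dest_dir: str, snap) -> None:
    tb = snap.read_csv()

    # Strip the dim_ prefix from dimension columns so they can join the index.
    dims = [c for c in tb.columns if c.startswith("dim_")]
    tb = tb.rename(columns={d: d[len("dim_"):] for d in dims})
    dims = [d[len("dim_"):] for d in dims]

    # Index by date when `year` holds ISO dates, otherwise by year.
    if uses_dates(tb["year"]):
        tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims)
    else:
        tb = tb.format(["country", "year"] + dims)

    ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata)

    # Apply manual overrides if a *.override.yml sits next to the metadata file.
    meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml")
    if meta_path.exists():
        ds.update_metadata(meta_path)

    ds.save()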
diff --git a/etl/steps/data/grapher/fasttrack/latest/nuclear_warhead_stockpiles.py b/etl/steps/data/grapher/fasttrack/latest/nuclear_warhead_stockpiles.py new file mode 100644 index 00000000000..d27ca9006ec --- /dev/null +++ b/etl/steps/data/grapher/fasttrack/latest/nuclear_warhead_stockpiles.py @@ -0,0 +1,40 @@ +import pandas as pd + +from etl.helpers import PathFinder, create_dataset, get_metadata_path +from etl.snapshot import Snapshot + +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # load snapshot + snap = Snapshot("fasttrack/latest/nuclear_warhead_stockpiles.csv") + + # load data + tb = snap.read_csv() + + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) + + # add table, update metadata from *.meta.yml and save + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/latest/pain_hours_hen_systems.meta.yml b/etl/steps/data/grapher/fasttrack/latest/pain_hours_hen_systems.meta.yml index f5b32980f62..83049793925 100644 --- a/etl/steps/data/grapher/fasttrack/latest/pain_hours_hen_systems.meta.yml +++ b/etl/steps/data/grapher/fasttrack/latest/pain_hours_hen_systems.meta.yml @@ -1,69 +1,68 @@ dataset: - sources: - - name: Welfare Footprint based on Schuck-Paim and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ + title: Pain hours of hen systems (Welfare Footprint) + description: '' + licenses: + - {} tables: pain_hours_hen_systems: variables: total_hours_of_pain: title: Total hours of pain - short_unit: hours unit: hours - description: The total number of hours an average hen will spend in pain, regardless of the intensity. - display: - numDecimalPlaces: 0 - sources: - - name: Welfare Footprint based on Schuck-Paim and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ - excrutiating_pain: - title: Excrutiating pain short_unit: hours - unit: hours - description: The number of hours an average hen will spend in excrutiating pain. display: - numDecimalPlaces: 2 - sources: - - name: Welfare Footprint based on Schuck-Paim and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ - disabling_pain: - title: Disabling pain - short_unit: hours + numDecimalPlaces: 0.0 + description: The total number of hours an average hen will spend in pain, regardless of the intensity. + excrutiating_pain_in_hours: + title: Excruciating pain in hours unit: hours - description: The number of hours an average hen will spend in disabling pain.
- display: - numDecimalPlaces: 0 - sources: - - name: Welfare Footprint based on Schuck-Paim and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ - hurtful_pain: - title: Hurtful pain short_unit: hours + display: + numDecimalPlaces: 2.0 + description: The number of hours an average hen will spend in excruciating pain. + disabling_pain_in_hours: + title: Disabling pain in hours unit: hours - description: The number of hours an average hen will spend in hurtful pain. + short_unit: hours display: - numDecimalPlaces: 0 - sources: - - name: Welfare Footprint based on Schuck-Paim and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ - annoying_pain: - title: Annoying pain + numDecimalPlaces: 0.0 + description: The number of hours an average hen will spend in disabling pain. + hurtful_pain_in_hours: + title: Hurtful pain in hours + unit: hours short_unit: hours + display: + numDecimalPlaces: 0.0 + description: The number of hours an average hen will spend in hurtful pain. + annoying_pain_in_hours: + title: Annoying pain in hours unit: hours - description: The number of hours an average hen will spend in annoying pain. + short_unit: hours display: - numDecimalPlaces: 0 - sources: - - name: Welfare Footprint based on Schuck-Paim and Alonso (2021) - published_by: Welfare Footprint - publication_year: 2021 - url: https://welfarefootprint.org/research-projects/laying-hens/ + numDecimalPlaces: 0.0 + description: The number of hours an average hen will spend in annoying pain. + total_days_of_pain: + title: Total days of pain + unit: days + short_unit: days + description: The total number of days an average hen will spend in pain, regardless of the intensity. + excrutiating_pain_in_days: + title: Excruciating pain in days + unit: days + short_unit: days + description: The number of days an average hen will spend in excruciating pain. + disabling_pain_in_days: + title: Disabling pain in days + unit: days + short_unit: days + description: The number of days an average hen will spend in disabling pain. + hurtful_pain_in_days: + title: Hurtful pain in days + unit: days + short_unit: days + description: The number of days an average hen will spend in hurtful pain. + annoying_pain_in_days: + title: Annoying pain in days + unit: days + short_unit: days + description: The number of days an average hen will spend in annoying pain.
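Note on the override hook repeated in these steps: assuming `get_metadata_path(dest_dir)` returns the step's *.meta.yml path (as its name suggests), `Path.with_suffix` replaces only the final ".yml", so overrides are looked up at *.meta.override.yml next to the generated metadata:

from pathlib import Path

meta = Path("etl/steps/data/grapher/fasttrack/latest/pain_hours_hen_systems.meta.yml")
# with_suffix swaps only the last extension, keeping the ".meta" part.
print(meta.with_suffix(".override.yml"))
# -> etl/steps/data/grapher/fasttrack/latest/pain_hours_hen_systems.meta.override.yml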
diff --git a/etl/steps/data/grapher/fasttrack/latest/pain_hours_hen_systems.py b/etl/steps/data/grapher/fasttrack/latest/pain_hours_hen_systems.py index 2a12110de76..b7bce222145 100644 --- a/etl/steps/data/grapher/fasttrack/latest/pain_hours_hen_systems.py +++ b/etl/steps/data/grapher/fasttrack/latest/pain_hours_hen_systems.py @@ -1,10 +1,9 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot -P = PathFinder(__file__) +paths = PathFinder(__file__) def run(dest_dir: str) -> None: @@ -12,11 +11,30 @@ def run(dest_dir: str) -> None: snap = Snapshot("fasttrack/latest/pain_hours_hen_systems.csv") # load data - data = pd.read_csv(snap.path) + tb = snap.read_csv() - # create empty dataframe and table - tb = catalog.Table(data, short_name=P.short_name) + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/latest/plastic_waste_meijer_2021.meta.yml b/etl/steps/data/grapher/fasttrack/latest/plastic_waste_meijer_2021.meta.yml index d7c1587f391..a3e10290266 100644 --- a/etl/steps/data/grapher/fasttrack/latest/plastic_waste_meijer_2021.meta.yml +++ b/etl/steps/data/grapher/fasttrack/latest/plastic_waste_meijer_2021.meta.yml @@ -3,12 +3,6 @@ dataset: description: '' licenses: - {} - sources: - - name: Plastic ocean waste and pollution (Meijer et al. 2021) - url: https://www.science.org/doi/10.1126/sciadv.aaz5803 - publication_year: '2021' - published_by: Meijer, L. J., Van Emmerik, T., Van Der Ent, R., Schmidt, C., & Lebreton, L. (2021). More than 1000 rivers - account for 80% of global riverine plastic emissions into the ocean. Science Advances, 7(18), eaaz5803. 
tables: plastic_waste_meijer_2021: variables: diff --git a/etl/steps/data/grapher/fasttrack/latest/plastic_waste_meijer_2021.py b/etl/steps/data/grapher/fasttrack/latest/plastic_waste_meijer_2021.py index 0c199a44836..af0c97aade1 100644 --- a/etl/steps/data/grapher/fasttrack/latest/plastic_waste_meijer_2021.py +++ b/etl/steps/data/grapher/fasttrack/latest/plastic_waste_meijer_2021.py @@ -1,7 +1,6 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot paths = PathFinder(__file__) @@ -12,11 +11,30 @@ def run(dest_dir: str) -> None: snap = Snapshot("fasttrack/latest/plastic_waste_meijer_2021.csv") # load data - data = pd.read_csv(snap.path) + tb = snap.read_csv() - # create empty dataframe and table - tb = catalog.Table(data, short_name=paths.short_name) + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/latest/sentience_institute.meta.yml b/etl/steps/data/grapher/fasttrack/latest/sentience_institute.meta.yml index 737a84df55d..269bf760b42 100644 --- a/etl/steps/data/grapher/fasttrack/latest/sentience_institute.meta.yml +++ b/etl/steps/data/grapher/fasttrack/latest/sentience_institute.meta.yml @@ -3,11 +3,6 @@ dataset: description: '' licenses: - {} - sources: - - name: 'Sentience Institute. Animals, Food, and Technology (AFT) Survey: 2021 Update.' 
- url: https://www.sentienceinstitute.org/aft-survey-2021 - publication_year: '2021' - published_by: Sentience Institute tables: sentience_institute: variables: diff --git a/etl/steps/data/grapher/fasttrack/latest/sentience_institute.py b/etl/steps/data/grapher/fasttrack/latest/sentience_institute.py index 687cd4deb83..8a413131d69 100644 --- a/etl/steps/data/grapher/fasttrack/latest/sentience_institute.py +++ b/etl/steps/data/grapher/fasttrack/latest/sentience_institute.py @@ -1,7 +1,6 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot paths = PathFinder(__file__) @@ -12,11 +11,30 @@ def run(dest_dir: str) -> None: snap = Snapshot("fasttrack/latest/sentience_institute.csv") # load data - data = pd.read_csv(snap.path) + tb = snap.read_csv() - # create empty dataframe and table - tb = catalog.Table(data, short_name=paths.short_name) + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/latest/treatment_gap_anxiety_disorders_world_mental_health_surveys.meta.yml b/etl/steps/data/grapher/fasttrack/latest/treatment_gap_anxiety_disorders_world_mental_health_surveys.meta.yml index a85f3078d93..ac639754121 100644 --- a/etl/steps/data/grapher/fasttrack/latest/treatment_gap_anxiety_disorders_world_mental_health_surveys.meta.yml +++ b/etl/steps/data/grapher/fasttrack/latest/treatment_gap_anxiety_disorders_world_mental_health_surveys.meta.yml @@ -1,78 +1,40 @@ dataset: - namespace: fasttrack - version: latest - short_name: treatment_gap_anxiety_disorders_world_mental_health_surveys - title: Treatment gap for anxiety disorders - World Mental Health Surveys - Alonso - et al. 2017 - description: 'This dataset comes from the World Mental Health surveys, which conducted - national studies in 21 countries, using validated structured interviews to survey - members of the general population about symptoms of mental illnesses they had - in the past 12 months and their lifetime so far. The source describes the dataset: - "Data came from 24 community epidemiological surveys administered in 21 countries - as part of the WMH surveys (Kessler & Ustun, 2004). These included 12 surveys - carried out in high-income countries, 6 surveys in upper-middle-income countries - and 6 in low or lower-middle income countries (see table 1). The majority of surveys - were based on nationally representative household samples. Three were representative - of urban areas in their countries (Colombia, Mexico, and Peru). 
Three were representative - of selected regions in their countries (Japan, Nigeria, and Murcia, Spain). Four - were representative of selected Metropolitan Areas (Sao Paulo, Brazil; Medellin, - Colombia; and Beijing-Shanghai and Shenzhen in the People’s Republic of China - (PRC)). Trained lay interviewers conducted face-to-face interviews with respondents, - aged 18 years and over. The interviews took place within the households of the - respondents. To reduce respondent burden, the interview was divided into two parts. - Part I assessed core mental disorders and was administered to all respondents. - Part II, which assessed additional disorders and correlates, was administered - to all Part I respondents who met lifetime criteria for any disorder plus a probability - subsample of other Part I respondents. Part II data, the focus of this report, - were weighted by the inverse of their probabilities of selection into Part II - and additionally weighted to adjust samples to match population distributions - on the cross-classification of key socio-demographic and geographic variables. - Further details about WMH sampling and weighting are available elsewhere(Heeringa - et al., 2008). Response rates ranged between 45.9% and 97.2% and had a weighted - average of 70.1% across all surveys."' - sources: - - name: Treatment gap for anxiety disorders - World Mental Health Surveys - Alonso - et al. 2017 - published_by: Alonso et al. (2017) - description: Data comes from Community surveys of the general population - publication_year: 2017 - date_accessed: 2023-05-11 - url: https://pubmed.ncbi.nlm.nih.gov/29356216/ + title: Treatment gap for anxiety disorders - World Mental Health Surveys - Alonso et al. 2017 + description: |- + This dataset comes from the World Mental Health surveys, which conducted national studies in 21 countries, using validated structured interviews to survey members of the general population about symptoms of mental illnesses they had in the past 12 months and their lifetime so far. The source describes the dataset: "Data came from 24 community epidemiological surveys administered in 21 countries as part of the WMH surveys (Kessler & Ustun, 2004). These included 12 surveys carried out in high-income countries, 6 surveys in upper-middle-income countries and 6 in low or lower-middle income countries (see table 1). The majority of surveys were based on nationally representative household samples. Three were representative of urban areas in their countries (Colombia, Mexico, and Peru). Three were representative of selected regions in their countries (Japan, Nigeria, and Murcia, Spain). Four were representative of selected Metropolitan Areas (Sao Paulo, Brazil; Medellin, Colombia; and Beijing-Shanghai and Shenzhen in the People’s Republic of China (PRC)). Trained lay interviewers conducted face-to-face interviews with respondents, aged 18 years and over. The interviews took place within the households of the respondents. To reduce respondent burden, the interview was divided into two parts. Part I assessed core mental disorders and was administered to all respondents. Part II, which assessed additional disorders and correlates, was administered to all Part I respondents who met lifetime criteria for any disorder plus a probability subsample of other Part I respondents. 
Part II data, the focus of this report, were weighted by the inverse of their probabilities of selection into Part II and additionally weighted to adjust samples to match population distributions on the cross-classification of key socio-demographic and geographic variables. Further details about WMH sampling and weighting are available elsewhere(Heeringa et al., 2008). Response rates ranged between 45.9% and 97.2% and had a weighted average of 70.1% across all surveys." + licenses: + - name: Made freely available by authors + url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6008788/ tables: treatment_gap_anxiety_disorders_world_mental_health_surveys: variables: _12_month_anxiety_disorder: title: 12 month anxiety disorder - short_unit: '%' unit: '%' - description: Share who met the criteria for having an anxiety disorder in - the past 12 months. The Composite International Diagnostic Interview (CIDI) - was used to interview participants about symptoms of anxiety. + short_unit: '%' + description: |- + Share who met the criteria for having an anxiety disorder in the past 12 months. The Composite International Diagnostic Interview (CIDI) was used to interview participants about symptoms of anxiety. any_treatment__conditional: title: Any treatment, conditional - short_unit: '%' unit: '%' - description: Share who received any treatment, among those who met the criteria - for an anxiety disorder in the past 12 months. + short_unit: '%' + description: Share who received any treatment, among those who met the criteria for an anxiety disorder in the past + 12 months. possibly_adequate_treatment__conditional: title: Possibly adequate treatment, conditional - short_unit: '%' unit: '%' - description: Share who received potentially adequate treatment, among those - who met the criteria for an anxiety disorder in the past 12 months. "Potentially - adequate treatment" was defined as pharmacological medication, psychotherapy, - or complementary alternative medicine. + short_unit: '%' + description: |- + Share who received potentially adequate treatment, among those who met the criteria for an anxiety disorder in the past 12 months. "Potentially adequate treatment" was defined as pharmacological medication, psychotherapy, or complementary alternative medicine. any_treatment: title: Any treatment - short_unit: '%' unit: '%' - description: Share who met the criteria for having an anxiety disorder in - the past 12 months and received any treatment for it. + short_unit: '%' + description: Share who met the criteria for having an anxiety disorder in the past 12 months and received any treatment + for it. possibly_adequate_treatment: title: Possibly adequate treatment - short_unit: '%' unit: '%' - description: Share who met the criteria for having an anxiety disorder in - the past 12 months and received potentially adequate treatment for it. "Potentially - adequate treatment" was defined as pharmacological medication, psychotherapy, - or complementary alternative medicine. + short_unit: '%' + description: |- + Share who met the criteria for having an anxiety disorder in the past 12 months and received potentially adequate treatment for it. "Potentially adequate treatment" was defined as pharmacological medication, psychotherapy, or complementary alternative medicine. 
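Note on the fasttrack template repeated in the steps above: every column prefixed `dim_` is treated as an extra dimension, renamed without the prefix, and folded into the table index alongside `country` and `year`/`date`. A minimal pandas-only sketch of that logic, using a hypothetical `dim_sex` column (the real steps call `Table.format`, which also verifies index uniqueness; plain `set_index` stands in for it here):

import pandas as pd

# Hypothetical fasttrack-style table with one extra dimension column.
tb = pd.DataFrame(
    {
        "country": ["France", "France"],
        "year": [2020, 2020],
        "dim_sex": ["female", "male"],
        "value": [1.2, 3.4],
    }
)

# Strip the "dim_" prefix and use the dimensions as extra index levels.
dims = [c for c in tb.columns if c.startswith("dim_")]
dims_without_prefix = [c[4:] for c in dims]  # "dim_sex" -> "sex"
tb = tb.rename(columns=dict(zip(dims, dims_without_prefix)))
tb = tb.set_index(["country", "year"] + dims_without_prefix).sort_index()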
diff --git a/etl/steps/data/grapher/fasttrack/latest/treatment_gap_anxiety_disorders_world_mental_health_surveys.py b/etl/steps/data/grapher/fasttrack/latest/treatment_gap_anxiety_disorders_world_mental_health_surveys.py index 667ddeb4ecd..e6fe95e7448 100644 --- a/etl/steps/data/grapher/fasttrack/latest/treatment_gap_anxiety_disorders_world_mental_health_surveys.py +++ b/etl/steps/data/grapher/fasttrack/latest/treatment_gap_anxiety_disorders_world_mental_health_surveys.py @@ -1,21 +1,40 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot -P = PathFinder(__file__) +paths = PathFinder(__file__) def run(dest_dir: str) -> None: # load snapshot - data = pd.read_csv( - Snapshot("fasttrack/latest/treatment_gap_anxiety_disorders_world_mental_health_surveys.csv").path - ) + snap = Snapshot("fasttrack/latest/treatment_gap_anxiety_disorders_world_mental_health_surveys.csv") - # create empty dataframe and table - tb = catalog.Table(data, short_name=P.short_name) + # load data + tb = snap.read_csv() + + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb]) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/latest/useful_energy_cost_way.meta.yml b/etl/steps/data/grapher/fasttrack/latest/useful_energy_cost_way.meta.yml new file mode 100644 index 00000000000..bbafa2e6f16 --- /dev/null +++ b/etl/steps/data/grapher/fasttrack/latest/useful_energy_cost_way.meta.yml @@ -0,0 +1,38 @@ +dataset: + title: Useful energy costs (Way et al. 2022) + description: '' + licenses: + - {} +tables: + useful_energy_cost_way: + variables: + oil_useful_cost: + title: Oil + unit: $ per MWh + short_unit: $ + description: Useful cost of energy, per megawatt-hour. + coal_useful_cost: + title: Coal + unit: $ per MWh + short_unit: $ + description: Useful cost of energy, per megawatt-hour. + gas_useful_cost: + title: Gas + unit: $ per MWh + short_unit: $ + description: Useful cost of energy, per megawatt-hour. + wind_useful_cost: + title: Wind + unit: $ per MWh + short_unit: $ + description: Useful cost of energy, per megawatt-hour. + solar_useful_cost: + title: Solar + unit: $ per MWh + short_unit: $ + description: Useful cost of energy, per megawatt-hour. + battery_useful_cost: + title: Lithium-ion batteries + unit: $ per MWh + short_unit: $ + description: Useful cost of energy, per megawatt-hour. 
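Note on the metadata override block these migrated steps now end with: `Path.with_suffix` replaces only the final suffix, so the override file is expected to sit next to the step's generated metadata with a `.meta.override.yml` extension. A small sketch of the path arithmetic (the concrete path is illustrative; `get_metadata_path(dest_dir)` is assumed to return the step's `.meta.yml` path):

from pathlib import Path

# Assumed return value of get_metadata_path(dest_dir) for this step.
meta_path = Path("data/grapher/fasttrack/latest/useful_energy_cost_way.meta.yml")

# with_suffix swaps only the last suffix (".yml"), keeping the ".meta" part.
override_path = meta_path.with_suffix(".override.yml")
print(override_path)  # data/grapher/fasttrack/latest/useful_energy_cost_way.meta.override.yml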
diff --git a/etl/steps/data/grapher/fasttrack/latest/useful_energy_cost_way.py b/etl/steps/data/grapher/fasttrack/latest/useful_energy_cost_way.py new file mode 100644 index 00000000000..e8e86d6884e --- /dev/null +++ b/etl/steps/data/grapher/fasttrack/latest/useful_energy_cost_way.py @@ -0,0 +1,40 @@ +import pandas as pd + +from etl.helpers import PathFinder, create_dataset, get_metadata_path +from etl.snapshot import Snapshot + +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # load snapshot + snap = Snapshot("fasttrack/latest/useful_energy_cost_way.csv") + + # load data + tb = snap.read_csv() + + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) + + # add table, update metadata from *.meta.yml and save + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/data/grapher/fasttrack/latest/welfare_broiler_chickens.py b/etl/steps/data/grapher/fasttrack/latest/welfare_broiler_chickens.py index b092ac3436f..1df43154fd3 100644 --- a/etl/steps/data/grapher/fasttrack/latest/welfare_broiler_chickens.py +++ b/etl/steps/data/grapher/fasttrack/latest/welfare_broiler_chickens.py @@ -9,7 +9,7 @@ def run(dest_dir: str) -> None: snap = Snapshot("fasttrack/latest/welfare_broiler_chickens.csv") # load data - tb = snap.read() + tb = snap.read(safe_types=False) # add table, update metadata from *.meta.yml and save ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) diff --git a/etl/steps/data/grapher/fasttrack/latest/whm_treatment_gap_anxiety_disorders.meta.yml b/etl/steps/data/grapher/fasttrack/latest/whm_treatment_gap_anxiety_disorders.meta.yml index a5e2e7d9488..8b91dd1f1ae 100644 --- a/etl/steps/data/grapher/fasttrack/latest/whm_treatment_gap_anxiety_disorders.meta.yml +++ b/etl/steps/data/grapher/fasttrack/latest/whm_treatment_gap_anxiety_disorders.meta.yml @@ -1,107 +1,64 @@ dataset: - namespace: fasttrack - version: latest - short_name: whm_treatment_gap_anxiety_disorders title: Treatment gap for anxiety disorders (WMH, 2017) - description: 'This dataset comes from the World Mental Health surveys, which conducted - national studies in 21 countries, using validated structured interviews to survey - members of the general population about symptoms of mental illnesses they had - in the past 12 months and their lifetime so far. The source describes the dataset: - "Data came from 24 community epidemiological surveys administered in 21 countries - as part of the WMH surveys (Kessler & Ustun, 2004). These included 12 surveys - carried out in high-income countries, 6 surveys in upper-middle-income countries - and 6 in low or lower-middle income countries (see table 1). The majority of surveys - were based on nationally representative household samples. 
Three were representative - of urban areas in their countries (Colombia, Mexico, and Peru). Three were representative - of selected regions in their countries (Japan, Nigeria, and Murcia, Spain). Four - were representative of selected Metropolitan Areas (Sao Paulo, Brazil; Medellin, - Colombia; and Beijing-Shanghai and Shenzhen in the People’s Republic of China - (PRC)). Trained lay interviewers conducted face-to-face interviews with respondents, - aged 18 years and over. The interviews took place within the households of the - respondents. To reduce respondent burden, the interview was divided into two parts. - Part I assessed core mental disorders and was administered to all respondents. - Part II, which assessed additional disorders and correlates, was administered - to all Part I respondents who met lifetime criteria for any disorder plus a probability - subsample of other Part I respondents. Part II data, the focus of this report, - were weighted by the inverse of their probabilities of selection into Part II - and additionally weighted to adjust samples to match population distributions - on the cross-classification of key socio-demographic and geographic variables. - Further details about WMH sampling and weighting are available elsewhere(Heeringa - et al., 2008). Response rates ranged between 45.9% and 97.2% and had a weighted - average of 70.1% across all surveys."' - sources: - - name: Treatment gap for anxiety disorders - World Mental Health Surveys - Alonso - et al. 2017 - published_by: Alonso et al. (2017) - description: Data comes from Community surveys of the general population - publication_year: 2017 - date_accessed: 2023-05-11 - url: https://pubmed.ncbi.nlm.nih.gov/29356216/ + description: |- + This dataset comes from the World Mental Health surveys, which conducted national studies in 21 countries, using validated structured interviews to survey members of the general population about symptoms of mental illnesses they had in the past 12 months and their lifetime so far. The source describes the dataset: "Data came from 24 community epidemiological surveys administered in 21 countries as part of the WMH surveys (Kessler & Ustun, 2004). These included 12 surveys carried out in high-income countries, 6 surveys in upper-middle-income countries and 6 in low or lower-middle income countries (see table 1). The majority of surveys were based on nationally representative household samples. Three were representative of urban areas in their countries (Colombia, Mexico, and Peru). Three were representative of selected regions in their countries (Japan, Nigeria, and Murcia, Spain). Four were representative of selected Metropolitan Areas (Sao Paulo, Brazil; Medellin, Colombia; and Beijing-Shanghai and Shenzhen in the People’s Republic of China (PRC)). Trained lay interviewers conducted face-to-face interviews with respondents, aged 18 years and over. The interviews took place within the households of the respondents. To reduce respondent burden, the interview was divided into two parts. Part I assessed core mental disorders and was administered to all respondents. Part II, which assessed additional disorders and correlates, was administered to all Part I respondents who met lifetime criteria for any disorder plus a probability subsample of other Part I respondents. 
Part II data, the focus of this report, were weighted by the inverse of their probabilities of selection into Part II and additionally weighted to adjust samples to match population distributions on the cross-classification of key socio-demographic and geographic variables. Further details about WMH sampling and weighting are available elsewhere(Heeringa et al., 2008). Response rates ranged between 45.9% and 97.2% and had a weighted average of 70.1% across all surveys." + licenses: + - name: Made freely available by authors + url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6008788/ tables: whm_treatment_gap_anxiety_disorders: variables: _12_month_anxiety_disorder: title: 12 month anxiety disorder - short_unit: '%' unit: '%' - description: Share who met the criteria for having an anxiety disorder in - the past 12 months. The Composite International Diagnostic Interview (CIDI) - was used to interview participants about symptoms of anxiety. + short_unit: '%' + description: |- + Share who met the criteria for having an anxiety disorder in the past 12 months. The Composite International Diagnostic Interview (CIDI) was used to interview participants about symptoms of anxiety. any_treatment: title: Any treatment - short_unit: '%' unit: '%' - description: Share who met the criteria for having an anxiety disorder in - the past 12 months and received any treatment for it. + short_unit: '%' + description: Share who met the criteria for having an anxiety disorder in the past 12 months and received any treatment + for it. any_treatment__conditional: title: Any treatment, conditional - short_unit: '%' unit: '%' - description: Share who received any treatment, among those who met the criteria - for an anxiety disorder in the past 12 months. + short_unit: '%' + description: Share who received any treatment, among those who met the criteria for an anxiety disorder in the past + 12 months. possibly_adequate_treatment: title: Potentially adequate treatment - short_unit: '%' unit: '%' - description: Share who met the criteria for having an anxiety disorder in - the past 12 months and received potentially adequate treatment for it. "Potentially - adequate treatment" was defined as pharmacological medication, psychotherapy, - or complementary alternative medicine. + short_unit: '%' + description: |- + Share who met the criteria for having an anxiety disorder in the past 12 months and received potentially adequate treatment for it. "Potentially adequate treatment" was defined as pharmacological medication, psychotherapy, or complementary alternative medicine. possibly_adequate_treatment__conditional: title: Potentially adequate treatment, conditional - short_unit: '%' unit: '%' - description: Share who received potentially adequate treatment, among those - who met the criteria for an anxiety disorder in the past 12 months. "Potentially - adequate treatment" was defined as pharmacological medication, psychotherapy, - or complementary alternative medicine. + short_unit: '%' + description: |- + Share who received potentially adequate treatment, among those who met the criteria for an anxiety disorder in the past 12 months. "Potentially adequate treatment" was defined as pharmacological medication, psychotherapy, or complementary alternative medicine. other_treatments: title: Other treatments - short_unit: '%' unit: '%' - description: Share who met the criteria for having an anxiety disorder in - the past 12 months and received a treatment that potentially is not adequate. 
- "Other treatment" was defined as seeing a religious or spiritual advisor, - social worker, or counsellor in any setting other than specialist mental - health. + short_unit: '%' + description: |- + Share who met the criteria for having an anxiety disorder in the past 12 months and received a treatment that potentially is not adequate. "Other treatment" was defined as seeing a religious or spiritual advisor, social worker, or counsellor in any setting other than specialist mental health. other_treatments__conditional: title: Other treatments, conditional - short_unit: '%' unit: '%' - description: Share who received a treatment that potentially is not adequate, - among those who met the criteria for an anxiety disorder in the past 12 - months. "Other treatment" was defined as seeing a religious or spiritual - advisor, social worker, or counsellor in any setting other than specialist - mental health. + short_unit: '%' + description: |- + Share who received a treatment that potentially is not adequate, among those who met the criteria for an anxiety disorder in the past 12 months. "Other treatment" was defined as seeing a religious or spiritual advisor, social worker, or counsellor in any setting other than specialist mental health. untreated: title: Untreated - short_unit: '%' unit: '%' - description: Share who met the criteria for having an anxiety disorder in - the past 12 months and did not receive any treatment for it. + short_unit: '%' + description: |- + Share who met the criteria for having an anxiety disorder in the past 12 months and did not receive any treatment for it. untreated__conditional: title: Untreated, conditional - short_unit: '%' unit: '%' - description: Share who did not receive any treatment, among those who met - the criteria for an anxiety disorder in the past 12 months. + short_unit: '%' + description: Share who did not receive any treatment, among those who met the criteria for an anxiety disorder in + the past 12 months. 
diff --git a/etl/steps/data/grapher/fasttrack/latest/whm_treatment_gap_anxiety_disorders.py b/etl/steps/data/grapher/fasttrack/latest/whm_treatment_gap_anxiety_disorders.py index b845b23302a..b05e7621518 100644 --- a/etl/steps/data/grapher/fasttrack/latest/whm_treatment_gap_anxiety_disorders.py +++ b/etl/steps/data/grapher/fasttrack/latest/whm_treatment_gap_anxiety_disorders.py @@ -1,19 +1,40 @@ import pandas as pd -from owid import catalog -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder, create_dataset, get_metadata_path from etl.snapshot import Snapshot -P = PathFinder(__file__) +paths = PathFinder(__file__) def run(dest_dir: str) -> None: # load snapshot - data = pd.read_csv(Snapshot("fasttrack/latest/whm_treatment_gap_anxiety_disorders.csv").path) + snap = Snapshot("fasttrack/latest/whm_treatment_gap_anxiety_disorders.csv") - # create empty dataframe and table - tb = catalog.Table(data, short_name=P.short_name) + # load data + tb = snap.read_csv() + + # add dimensions with dim_ prefix + dims = [c for c in tb.columns if c.startswith("dim_")] + dims_without_prefix = [c[4:] for c in dims] + + if dims: + tb = tb.rename(columns={d: dw for d, dw in zip(dims, dims_without_prefix)}) + + if uses_dates(tb["year"]): + tb = tb.rename(columns={"year": "date"}).format(["country", "date"] + dims_without_prefix) + else: + tb = tb.format(["country", "year"] + dims_without_prefix) # add table, update metadata from *.meta.yml and save - ds = create_dataset(dest_dir, tables=[tb]) + ds = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + ds.save() + + +def uses_dates(s: pd.Series) -> bool: + return pd.to_datetime(s, errors="coerce", format="%Y-%m-%d").notnull().all() diff --git a/etl/steps/archive/grapher/gcp/2023-09-28/global_carbon_budget.py b/etl/steps/data/grapher/gcp/2024-11-13/global_carbon_budget.py similarity index 100% rename from etl/steps/archive/grapher/gcp/2023-09-28/global_carbon_budget.py rename to etl/steps/data/grapher/gcp/2024-11-13/global_carbon_budget.py diff --git a/etl/steps/archive/grapher/gcp/2023-12-05/global_carbon_budget.py b/etl/steps/data/grapher/gcp/2024-11-21/global_carbon_budget.py similarity index 100% rename from etl/steps/archive/grapher/gcp/2023-12-05/global_carbon_budget.py rename to etl/steps/data/grapher/gcp/2024-11-21/global_carbon_budget.py diff --git a/etl/steps/data/grapher/hmd/2024-11-19/hfd.py b/etl/steps/data/grapher/hmd/2024-11-19/hfd.py new file mode 100644 index 00000000000..553a3a49bbc --- /dev/null +++ b/etl/steps/data/grapher/hmd/2024-11-19/hfd.py @@ -0,0 +1,44 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("hfd") + + # Read table from garden dataset. + tables = [ + ds_garden.read("period", reset_index=False), + ds_garden.read("cohort", reset_index=False).rename_index_names( + { + "cohort": "year", + } + ), + ds_garden.read("period_ages", reset_index=False), + ds_garden.read("cohort_ages", reset_index=False).rename_index_names( + { + "cohort": "year", + } + ), + ] + # + # Process data. + # + + # + # Save outputs. 
+ # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/hmd/2024-12-01/hmd.py b/etl/steps/data/grapher/hmd/2024-12-01/hmd.py new file mode 100644 index 00000000000..028b623f60c --- /dev/null +++ b/etl/steps/data/grapher/hmd/2024-12-01/hmd.py @@ -0,0 +1,83 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +INDICATORS_RELEVANT_LT = [ + "central_death_rate", + "life_expectancy", + "probability_of_death", +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("hmd") + + # Read table from garden dataset. + tb_lt = ds_garden.read("life_tables") + tb_exposure = ds_garden.read("exposures") + tb_deaths = ds_garden.read("deaths") + tb_pop = ds_garden.read("population") + tb_births = ds_garden.read("births") + tb_ratios = ds_garden.read("diff_ratios") + + # Filter relevant dimensions + tb_lt = keep_only_relevant_dimensions(tb_lt) + tb_exposure = keep_only_relevant_dimensions(tb_exposure) + tb_deaths = keep_only_relevant_dimensions(tb_deaths) + tb_pop = keep_only_relevant_dimensions(tb_pop) + tb_ratios = keep_only_relevant_dimensions(tb_ratios) + + # + # Save outputs. + # + cols_index = ["country", "year", "sex", "age", "type"] + tables = [ + tb_lt.format(cols_index), + tb_exposure.format(cols_index), + tb_deaths.format(["country", "year", "sex", "age"]), + tb_pop.format(["country", "year", "sex", "age"]), + tb_births.format(["country", "year", "sex"]), + tb_ratios.format(["country", "year", "age", "type"]), + ] + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() + + +def keep_only_relevant_dimensions(tb): + """Keep only relevant dimensions. + + - We only preserve 5-year age groups, and specific 1-year age groups. + - We only preserve 1-year observation periods. + + """ + AGES_SINGLE = [ + 0, + 10, + 15, + 25, + 45, + 65, + 80, + "total", + ] + AGES_SINGLE = list(map(str, AGES_SINGLE)) + ["110+"] + flag_1 = tb["age"].isin(AGES_SINGLE) + flag_2 = tb["age"].str.contains( + "-", + ) + + tb = tb.loc[flag_1 | flag_2] + + return tb diff --git a/etl/steps/data/grapher/hmd/2024-12-03/hmd_country.py b/etl/steps/data/grapher/hmd/2024-12-03/hmd_country.py new file mode 100644 index 00000000000..8b3a73fafe6 --- /dev/null +++ b/etl/steps/data/grapher/hmd/2024-12-03/hmd_country.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("hmd_country") + + # Read table from garden dataset. + tables = list(ds_garden) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. 
+    ds_grapher = create_dataset(
+        dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata
+    )
+
+    # Save changes in the new grapher dataset.
+    ds_grapher.save()
diff --git a/etl/steps/data/grapher/homicide/2024-10-30/unodc.py b/etl/steps/data/grapher/homicide/2024-10-30/unodc.py
new file mode 100644
index 00000000000..08652d4cd7e
--- /dev/null
+++ b/etl/steps/data/grapher/homicide/2024-10-30/unodc.py
@@ -0,0 +1,24 @@
+"""Load a garden dataset and create a grapher dataset."""
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load garden dataset.
+    ds_garden = paths.load_dataset("unodc")
+
+    table_names = ds_garden.table_names
+    # Copy each table from the already-loaded garden dataset unchanged
+    # (the data is already in the wide format that grapher expects).
+    tables = []
+    for table_name in table_names:
+        table = ds_garden[table_name]
+        tables.append(table)
+
+    ds_grapher = create_dataset(dest_dir, tables=tables, default_metadata=ds_garden.metadata)
+
+    # Save changes in the new grapher dataset.
+    ds_grapher.save()
diff --git a/etl/steps/data/grapher/iea/2024-07-04/critical_minerals_demand_by_scenario.py b/etl/steps/data/grapher/iea/2024-07-04/critical_minerals_demand_by_scenario.py
index 8be52ef67e4..c2fb2b50686 100644
--- a/etl/steps/data/grapher/iea/2024-07-04/critical_minerals_demand_by_scenario.py
+++ b/etl/steps/data/grapher/iea/2024-07-04/critical_minerals_demand_by_scenario.py
@@ -13,7 +13,7 @@ def run(dest_dir: str) -> None:
     #
     # Load garden dataset and read its main table.
     ds_garden = paths.load_dataset("critical_minerals")
-    tb_demand_by_scenario_flat = ds_garden.read_table("demand_by_scenario")
+    tb_demand_by_scenario_flat = ds_garden.read("demand_by_scenario")
 
     #
     # Process data.
diff --git a/etl/steps/data/grapher/iea/2024-07-04/critical_minerals_demand_by_technology.py b/etl/steps/data/grapher/iea/2024-07-04/critical_minerals_demand_by_technology.py
index 8cb23470ef6..81700c19541 100644
--- a/etl/steps/data/grapher/iea/2024-07-04/critical_minerals_demand_by_technology.py
+++ b/etl/steps/data/grapher/iea/2024-07-04/critical_minerals_demand_by_technology.py
@@ -13,7 +13,7 @@ def run(dest_dir: str) -> None:
     #
     # Load garden dataset and read its main table.
     ds_garden = paths.load_dataset("critical_minerals")
-    tb_demand_by_technology_flat = ds_garden.read_table("demand_by_technology")
+    tb_demand_by_technology_flat = ds_garden.read("demand_by_technology")
 
     #
     # Process data.
diff --git a/etl/steps/data/grapher/iea/2024-07-04/critical_minerals_supply_by_country.py b/etl/steps/data/grapher/iea/2024-07-04/critical_minerals_supply_by_country.py
index 761454a3cbc..3d86611e80b 100644
--- a/etl/steps/data/grapher/iea/2024-07-04/critical_minerals_supply_by_country.py
+++ b/etl/steps/data/grapher/iea/2024-07-04/critical_minerals_supply_by_country.py
@@ -13,7 +13,7 @@ def run(dest_dir: str) -> None:
     #
     # Load garden dataset and read its main table.
     ds_garden = paths.load_dataset("critical_minerals")
-    tb_supply_by_country_flat = ds_garden.read_table("supply_by_country")
+    tb_supply_by_country_flat = ds_garden.read("supply_by_country")
 
     #
     # Process data.
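Note on the `read_table` → `read` renames above: `read` is the newer accessor used throughout this diff, and it also accepts `reset_index`, as the hmd, iea, and imf steps show. A sketch of the two call shapes (it assumes the ETL environment, so it is not runnable standalone):

from etl.helpers import PathFinder

paths = PathFinder(__file__)
ds_garden = paths.load_dataset("critical_minerals")

# Default: the table comes back with its index reset to plain columns.
tb = ds_garden.read("demand_by_scenario")

# Keep the existing (country, year, ...) index instead.
tb_indexed = ds_garden.read("demand_by_scenario", reset_index=False)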
diff --git a/etl/steps/data/grapher/iea/2024-11-20/fossil_fuel_subsidies.py b/etl/steps/data/grapher/iea/2024-11-20/fossil_fuel_subsidies.py new file mode 100644 index 00000000000..ea4798a2dbd --- /dev/null +++ b/etl/steps/data/grapher/iea/2024-11-20/fossil_fuel_subsidies.py @@ -0,0 +1,26 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("fossil_fuel_subsidies") + + # Read table from garden dataset. + tb = ds_garden.read("fossil_fuel_subsidies", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/imf/2024-11-25/world_economic_outlook.py b/etl/steps/data/grapher/imf/2024-11-25/world_economic_outlook.py new file mode 100644 index 00000000000..766fc129725 --- /dev/null +++ b/etl/steps/data/grapher/imf/2024-11-25/world_economic_outlook.py @@ -0,0 +1,50 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("world_economic_outlook") + + # Read table from garden dataset. + tb = ds_garden.read("world_economic_outlook", reset_index=False) + + # For Grapher charts, we want the dashed projection line to start at the last observation so + # that the line looks continuous. For this, we take each variable's last observation per country + # and make it its first forecast as well. + indicators = tb.columns.str.replace("_observation|_forecast", "", regex=True).unique().tolist() + tb = tb.reset_index() + + for ind in indicators: + # Find the last observation year by country + last_obs = tb.loc[tb[f"{ind}_observation"].notnull()].groupby("country")["year"].max() + + # Assign that to last_obs column + tb["last_obs"] = tb["country"].map(last_obs) + + # Where the year is the last_obs year, assign the value of the last observation + tb.loc[tb["year"] == tb["last_obs"], f"{ind}_forecast"] = tb[f"{ind}_observation"] + + # Drop last_obs + tb = tb.drop(columns="last_obs") + + # Reinstate the index + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/irena/2024-11-01/renewable_capacity_statistics.py b/etl/steps/data/grapher/irena/2024-11-01/renewable_capacity_statistics.py new file mode 100644 index 00000000000..105063e9915 --- /dev/null +++ b/etl/steps/data/grapher/irena/2024-11-01/renewable_capacity_statistics.py @@ -0,0 +1,35 @@ +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Convert megawatts to gigawatts. +MW_TO_GW = 1e-3 + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load dataset from Garden and read its main table. 
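+    # (Each capacity column is then duplicated below as "<column>_gw", with its
+    # metadata copied and the unit rescaled from megawatts to gigawatts.)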
+ ds_garden = paths.load_dataset("renewable_capacity_statistics") + tb = ds_garden["renewable_capacity_statistics"] + + # Add all indicators also in gigawatts. + for column in tb.columns: + new_column = column + "_gw" + tb[new_column] = tb[column] * MW_TO_GW + # Update metadata fields. + tb[new_column].metadata.title = tb[column].metadata.title + " (GW)" + tb[new_column].metadata.unit = "gigawatts" + tb[new_column].metadata.short_unit = "GW" + tb[new_column].metadata.description_short = ( + tb[column].metadata.description_short.replace("mega", "giga").replace("MW", "GW") + ) + + # + # Save outputs. + # + # Create new dataset. + ds_grapher = create_dataset(dest_dir=dest_dir, tables=[tb], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/grapher/irena/2024-11-01/renewable_capacity_statistics_by_technology.py b/etl/steps/data/grapher/irena/2024-11-01/renewable_capacity_statistics_by_technology.py new file mode 100644 index 00000000000..5d86ea2fcf8 --- /dev/null +++ b/etl/steps/data/grapher/irena/2024-11-01/renewable_capacity_statistics_by_technology.py @@ -0,0 +1,97 @@ +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Convert megawatts to gigawatts. +MW_TO_GW = 1e-3 + +# List of technologies to include, and how to rename them. +TECHNOLOGIES = { + "Bioenergy": "Bioenergy (total)", + "Biogas": "Biogas", + # "Biogas (off-grid)": "Biogas (off-grid)", + "Concentrated solar power": "Concentrated solar power", + # "Geothermal": "Geothermal", + # "Geothermal (off-grid)": "Geothermal (off-grid)", + "Geothermal (total)": "Geothermal", + "Hydropower": "Hydropower", + # "Hydropower (off-grid)": "Hydropower (off-grid)", + "Hydropower (incl. pumped storage)": "Hydropower (total)", + # "Hydropower (excl. pumped storage)": "Hydropower (excl. pumped storage)", + "Liquid biofuels": "Liquid biofuels", + # "Liquid biofuels (off-grid)": "Liquid biofuels (off-grid)", + "Marine": "Marine", + "Mixed hydro plants": "Mixed hydro plants", + "Offshore wind": "Offshore wind", + "Onshore wind": "Onshore wind", + # "Onshore wind (off-grid)": "Onshore wind (off-grid)", + "Pumped storage": "Pumped storage", + "Renewable municipal waste": "Renewable municipal waste", + "Renewable electricity": "All renewables (total)", + "Solar": "Solar (total)", + "Solar photovoltaic": "Solar photovoltaic", + "Solar photovoltaic (off-grid)": "Solar photovoltaic (off-grid)", + "Solid biofuels": "Solid biofuels", + # "Solid biofuels (off-grid)": "Solid biofuels (off-grid)", + "Wind": "Wind (total)", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load dataset from Garden and read its main table. + ds_garden = paths.load_dataset("renewable_capacity_statistics") + tb = ds_garden["renewable_capacity_statistics"] + + # + # Process data. + # + # Get the human-readable names of the technologies from the variable metadata. + tb = tb.rename(columns={variable: tb[variable].metadata.display["name"] for variable in tb.columns}, errors="raise") + + # Select and rename columns. + tb = tb[TECHNOLOGIES.keys()].rename(columns=TECHNOLOGIES, errors="raise") + + # The original table has a column per technology, each with it's own short description. + # I will gather all descriptions and add them later as a key description of the resulting (melted) capacity indicator. + short_descriptions = {column: tb[column].metadata.description_short for column in tb.columns} + + # Simplify table to consider only the World. 
+ # Here we use "country" to refer to a technology. + # This is a workaround, so that grapher will let us select technologies as it does with countries. + tb = tb.loc["World"].reset_index().melt(id_vars="year", var_name="country", value_name="capacity") + + # Convert units from megawatts to gigawatts. + tb["capacity"] *= MW_TO_GW + + # Set appropriate metadata. + tb["capacity"].metadata.title = "Capacity" + from owid.catalog import VariablePresentationMeta + + tb["capacity"].metadata.presentation = VariablePresentationMeta( + title_public="Installed capacity for different renewable technologies" + ) + # IRENA's data is rounded to 1 MW, with anything below 0.5 MW shown as 0. + # tb["capacity"].metadata.display = {"numDecimalPlaces": 0} + tb["capacity"].metadata.unit = "gigawatts" + tb["capacity"].metadata.short_unit = "GW" + tb["capacity"].metadata.description_short = "Measured in gigawatts." + tb["capacity"].metadata.description_key = [ + f"{technology}: {description}" for technology, description in short_descriptions.items() + ] + + # Improve table format. + tb = tb.format(short_name="renewable_capacity_statistics_by_technology") + + # Update table's metadata. + tb.metadata.title = "Renewable electricity capacity by technology" + + # + # Save outputs. + # + # Create new dataset. + ds_grapher = create_dataset(dest_dir=dest_dir, tables=[tb], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/grapher/irena/2024-11-15/renewable_power_generation_costs.py b/etl/steps/data/grapher/irena/2024-11-15/renewable_power_generation_costs.py new file mode 100644 index 00000000000..a49d3f742a2 --- /dev/null +++ b/etl/steps/data/grapher/irena/2024-11-15/renewable_power_generation_costs.py @@ -0,0 +1,22 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its main table. + ds_garden = paths.load_dataset("renewable_power_generation_costs") + tb = ds_garden["renewable_power_generation_costs"] + + # + # Save outputs. + # + # Create a new grapher dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/grapher/met_office_hadley_centre/2024-11-18/near_surface_temperature.py b/etl/steps/data/grapher/met_office_hadley_centre/2024-11-18/near_surface_temperature.py new file mode 100644 index 00000000000..e2cb875b1f0 --- /dev/null +++ b/etl/steps/data/grapher/met_office_hadley_centre/2024-11-18/near_surface_temperature.py @@ -0,0 +1,31 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its main table. + ds_garden = paths.load_dataset("near_surface_temperature") + tb_garden = ds_garden["near_surface_temperature"].reset_index() + + # + # Process data. + # + # For compatibility with grapher, change the name of "region" column to "country". + tb_garden = tb_garden.rename(columns={"region": "country"}) + + # Set an appropriate index and sort conveniently. + tb_garden = tb_garden.set_index(["country", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. 
+ # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir=dest_dir, tables=[tb_garden], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/grapher/migration/2024-11-18/migration_between_regions.py b/etl/steps/data/grapher/migration/2024-11-18/migration_between_regions.py new file mode 100644 index 00000000000..b8f1dfe6832 --- /dev/null +++ b/etl/steps/data/grapher/migration/2024-11-18/migration_between_regions.py @@ -0,0 +1,40 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("migration_between_regions") + + # Read table from garden dataset + tb = ds_garden["migration_between_regions"].reset_index() + + tb = tb.pivot(columns="country_destination", index=["country_origin", "year"], values="migrants_all_sexes") + + # Add metadata. + for col in tb.columns: + tb[col].m.unit = "people" + tb[col].m.short_unit = "" + tb[col].m.title = f"Migration to {col}" + tb[col].m.description_short = f"Number of migrants to {col}" + + tb = tb.reset_index() + + tb["country"] = tb["country_origin"] + + tb = tb.drop(columns=["country_origin"]).format(["country", "year"]) + + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/migration/2024-11-19/migration_distance.py b/etl/steps/data/grapher/migration/2024-11-19/migration_distance.py new file mode 100644 index 00000000000..1529dcdeaef --- /dev/null +++ b/etl/steps/data/grapher/migration/2024-11-19/migration_distance.py @@ -0,0 +1,31 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("migration_distance") + + # Read table from garden dataset. + tb = ds_garden["migration_distance"].reset_index() + + tb = tb.rename(columns={"country_origin": "country"}) + + tb = tb.format(["country", "year"]) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/migration/2024-11-20/migrant_stock_flows.meta.yml b/etl/steps/data/grapher/migration/2024-11-20/migrant_stock_flows.meta.yml new file mode 100644 index 00000000000..3e7fc3f6f09 --- /dev/null +++ b/etl/steps/data/grapher/migration/2024-11-20/migrant_stock_flows.meta.yml @@ -0,0 +1,16 @@ +# NOTE: To learn more about the fields, hover over their names. 
+definitions:
+  common:
+    presentation:
+      topic_tags:
+        - Migration
+
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+dataset:
+  update_period_days: 365
+  title: International Migrant Stock (Origin and Destination)
+
+
+
diff --git a/etl/steps/data/grapher/migration/2024-11-20/migrant_stock_flows.py b/etl/steps/data/grapher/migration/2024-11-20/migrant_stock_flows.py
new file mode 100644
index 00000000000..71ad1d1821e
--- /dev/null
+++ b/etl/steps/data/grapher/migration/2024-11-20/migrant_stock_flows.py
@@ -0,0 +1,82 @@
+"""Load a garden dataset and create a grapher dataset.
+This grapher step has two purposes:
+1. Format the data in a way that is compatible with the grapher database (split into two tables and index on country and year).
+2. Add metadata programmatically to the data."""
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load garden dataset.
+    ds_garden = paths.load_dataset("migrant_stock")
+
+    # Read table from garden dataset.
+    tb = ds_garden.read("migrant_stock_dest_origin")
+
+    tb = tb.drop(columns=["migrants_female", "migrants_male"])
+
+    tb_dest_cols = tb.pivot(
+        index=["country_origin", "year"],
+        columns="country_destination",
+        values=["migrants_all_sexes"],
+    )
+
+    tb_dest_cols.columns = [col[0] + "_to_" + col[1] for col in tb_dest_cols.columns]
+
+    tb_origin_cols = tb.pivot(
+        index=["country_destination", "year"],
+        columns="country_origin",
+        values=["migrants_all_sexes"],
+    )
+
+    tb_origin_cols.columns = [col[0] + "_from_" + col[1] for col in tb_origin_cols.columns]
+
+    # Add metadata:
+
+    for col in tb_dest_cols.columns:
+        dest = col.split("migrants_all_sexes_to_")[1]
+        tb_dest_cols[col].metadata.unit = "people"
+        tb_dest_cols[col].metadata.short_unit = ""
+        tb_dest_cols[col].metadata.title = f"Number of immigrants who moved to {dest}"
+        tb_dest_cols[
+            col
+        ].metadata.description_short = f"Number of migrants who have moved to {dest}. The numbers describe cumulative migrant stock, not migrants who moved in this year."
+
+    for col in tb_origin_cols.columns:
+        origin = col.split("migrants_all_sexes_from_")[1]
+
+        tb_origin_cols[col].metadata.unit = "people"
+        tb_origin_cols[col].metadata.short_unit = ""
+        tb_origin_cols[col].metadata.title = f"Number of emigrants who moved from {origin}"
+        tb_origin_cols[
+            col
+        ].metadata.description_short = f"Number of migrants who have moved away from {origin}. The numbers describe cumulative migrant stock, not migrants who moved in this year."
+
+    tb_dest_cols = tb_dest_cols.reset_index()
+    tb_dest_cols = tb_dest_cols.rename(columns={"country_origin": "country"})
+    tb_dest_cols.metadata.short_name = "migrant_stock_origin"
+    tb_dest_cols = tb_dest_cols.format(["country", "year"])
+
+    tb_origin_cols = tb_origin_cols.reset_index()
+    tb_origin_cols = tb_origin_cols.rename(columns={"country_destination": "country"})
+    tb_origin_cols.metadata.short_name = "migrant_stock_destination"
+    tb_origin_cols = tb_origin_cols.format(["country", "year"])
+
+    # Save outputs
+    #
+    # Create a new grapher dataset with the same metadata as the garden dataset
+    ds_grapher = create_dataset(
+        dest_dir,
+        tables=[tb_origin_cols, tb_dest_cols],
+        check_variables_metadata=True,
+        default_metadata=ds_garden.metadata,
+    )
+
+    # Save changes in the new grapher dataset.
+ ds_grapher.save() diff --git a/etl/steps/data/grapher/minerals/2024-07-15/global_mine_production_by_mineral.py b/etl/steps/data/grapher/minerals/2024-07-15/global_mine_production_by_mineral.py new file mode 100644 index 00000000000..c048d005bd4 --- /dev/null +++ b/etl/steps/data/grapher/minerals/2024-07-15/global_mine_production_by_mineral.py @@ -0,0 +1,153 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Columns to select and how to rename them. +COLUMNS = { + "country": "country", + "year": "year", + "production_antimony_mine_tonnes": "Antimony", + "production_asbestos_mine_tonnes": "Asbestos", + "production_barite_mine_tonnes": "Barite", + "production_bauxite_mine_tonnes": "Bauxite", + "production_beryllium_mine_tonnes": "Beryllium", + "production_bismuth_mine_tonnes": "Bismuth", + "production_boron_mine_tonnes": "Boron", + "production_chromium_mine_tonnes": "Chromium", + "production_coal_mine_tonnes": "Coal", + "production_cobalt_mine_tonnes": "Cobalt", + "production_coltan_mine__columbite_tantalite_tonnes": "Coltan (columbite-tantalite)", + "production_coltan_mine__columbite_tonnes": "Coltan (columbite)", + "production_coltan_mine__tantalite_tonnes": "Coltan (tantalite)", + "production_copper_mine_tonnes": "Copper", + "production_diamond_mine__industrial_tonnes": "Diamond (industrial)", + "production_diamond_mine_and_synthetic__industrial_tonnes": "Diamond and synthetic (industrial)", + "production_feldspar_mine_tonnes": "Feldspar", + "production_fluorspar_mine_tonnes": "Fluorspar", + "production_garnet_mine_tonnes": "Garnet", + "production_gemstones_mine_tonnes": "Gemstones", + "production_gold_mine_tonnes": "Gold", + "production_graphite_mine_tonnes": "Graphite", + "production_gypsum_mine_tonnes": "Gypsum", + "production_helium_mine_tonnes": "Helium", + "production_iodine_mine_tonnes": "Iodine", + "production_iron_ore_mine__crude_ore_tonnes": "Iron ore (crude ore)", + "production_iron_ore_mine__iron_content_tonnes": "Iron ore (iron content)", + "production_lead_mine_tonnes": "Lead", + "production_lithium_mine_tonnes": "Lithium", + "production_magnesium_compounds_mine_tonnes": "Magnesium compounds", + "production_manganese_mine_tonnes": "Manganese", + "production_mercury_mine_tonnes": "Mercury", + "production_mica_mine__scrap_and_flake_tonnes": "Mica (scrap and flake)", + "production_mica_mine__sheet_tonnes": "Mica (sheet)", + "production_mica_mine_tonnes": "Mica", + "production_molybdenum_mine_tonnes": "Molybdenum", + "production_nickel_mine_tonnes": "Nickel", + "production_niobium_mine__pyrochlore_tonnes": "Niobium (pyrochlore)", + "production_niobium_mine_tonnes": "Niobium", + "production_phosphate_rock_mine__aluminum_phosphate_tonnes": "Phosphate rock (aluminum phosphate)", + "production_phosphate_rock_mine_tonnes": "Phosphate rock", + "production_platinum_group_metals_mine__iridium_tonnes": "Platinum group metals (iridium)", + "production_platinum_group_metals_mine__other_tonnes": "Platinum group metals (other)", + "production_platinum_group_metals_mine__palladium_tonnes": "Platinum group metals (palladium)", + "production_platinum_group_metals_mine__platinum_tonnes": "Platinum group metals (platinum)", + "production_platinum_group_metals_mine__rhodium_tonnes": "Platinum group metals (rhodium)", + "production_platinum_group_metals_mine__ruthenium_tonnes": "Platinum group metals (ruthenium)", + 
"production_potash_mine__chloride_tonnes": "Potash (chloride)", + "production_potash_mine__polyhalite_tonnes": "Potash (polyhalite)", + "production_potash_mine__potassic_salts_tonnes": "Potash (potassic salts)", + "production_potash_mine_tonnes": "Potash", + "production_rare_earths_mine_tonnes": "Rare earths", + "production_salt_mine_tonnes": "Salt", + "production_sand_and_gravel_mine__construction_tonnes": "Sand and gravel (construction)", + "production_sand_and_gravel_mine__industrial_tonnes": "Sand and gravel (industrial)", + "production_silver_mine_tonnes": "Silver", + "production_strontium_mine_tonnes": "Strontium", + "production_talc_and_pyrophyllite_mine__pyrophyllite_tonnes": "Talc and pyrophyllite (pyrophyllite)", + "production_talc_and_pyrophyllite_mine_tonnes": "Talc and pyrophyllite", + "production_tantalum_mine_tonnes": "Tantalum", + "production_tin_mine_tonnes": "Tin", + "production_titanium_mine__ilmenite_tonnes": "Titanium (ilmenite)", + "production_titanium_mine__rutile_tonnes": "Titanium (rutile)", + "production_tungsten_mine_tonnes": "Tungsten", + "production_uranium_mine_tonnes": "Uranium", + "production_vanadium_mine_tonnes": "Vanadium", + "production_zinc_mine_tonnes": "Zinc", + "production_zirconium_and_hafnium_mine_tonnes": "Zirconium and hafnium", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its flat table. + ds_garden = paths.load_dataset("minerals") + tb = ds_garden.read("minerals") + + # + # Process data. + # + # Select and rename columns. + tb = tb[COLUMNS.keys()].rename(columns=COLUMNS, errors="raise") + + # Select global data. + tb = tb[tb["country"] == "World"].drop(columns=["country"], errors="raise").reset_index(drop=True) + + # Gather all descriptions. + # NOTE: Footnotes will be gathered and used as description key. + # description_key = [] + _description_processing = [] + # _description_from_producer = [] + footnotes = [] + for column in tb.drop(columns=["year"]).columns: + grapher_config = tb[column].m.presentation.grapher_config + if grapher_config: + # NOTE: Ignore the frequent footnote saying that "The sum of all countries may exceed World data on certain years (by up to 10%), due to discrepancies between data sources." + if "The sum of all countries" not in grapher_config["note"]: + footnotes.append(f"{column} - {grapher_config['note']}") + # description_key.append(tb[column].metadata.description_short) + _description_processing.append(tb[column].metadata.description_processing) + # _description_from_producer.append(f"- {column}\n{tb[column].metadata.description_from_producer}") + _description_processing = sorted( + set([tb[column].metadata.description_processing for column in tb.drop(columns=["year"]).columns]) + ) + # By construction, processing description should be the same for all indicators. + assert len(_description_processing) == 1, "All columns were expected to have the same processing description." + description_processing = _description_processing[0] + # Gather all descriptions from producer. + # description_from_producer = "\n".join(["\n ".join(description.split("\n")) for description in _description_from_producer if description]) + + # Melt table to create a long table with mineral as "country" column. + tb_long = tb.melt(id_vars=["year"], var_name="country", value_name="mine_production") + + # Drop empty rows. + tb_long = tb_long.dropna(subset=["mine_production"]) + + # Improve metadata. 
+ tb_long["mine_production"].metadata.title = "Global mine production of different minerals" + tb_long["mine_production"].metadata.unit = "tonnes" + tb_long["mine_production"].metadata.short_unit = "t" + tb_long[ + "mine_production" + ].metadata.description_short = ( + "Measured in tonnes of mined, rather than [refined](#dod:refined-production) production." + ) + tb_long["mine_production"].metadata.description_key = footnotes + tb_long["mine_production"].metadata.description_processing = description_processing + # NOTE: The following metadata is too long and cannot be inserted in DB. + # tb_long["mine_production"].metadata.description_from_producer = description_from_producer + tb_long.metadata.title = "Global mine production by mineral" + + # Improve table format. + tb_long = tb_long.format(short_name=paths.short_name) + + # + # Save outputs. + # + # Create a new grapher dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb_long], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/grapher/ophi/2024-10-28/multidimensional_poverty_index.py b/etl/steps/data/grapher/ophi/2024-10-28/multidimensional_poverty_index.py new file mode 100644 index 00000000000..cdcd55d33ba --- /dev/null +++ b/etl/steps/data/grapher/ophi/2024-10-28/multidimensional_poverty_index.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("multidimensional_poverty_index") + + # Read table from garden dataset. + tb = ds_garden["multidimensional_poverty_index"] + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/owid/latest/ig_countries.py b/etl/steps/data/grapher/owid/latest/ig_countries.py new file mode 100644 index 00000000000..ff6b54b8d02 --- /dev/null +++ b/etl/steps/data/grapher/owid/latest/ig_countries.py @@ -0,0 +1,29 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("ig_countries") + + # + # Process data. + # + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=list(ds_garden), check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. 
+ ds_grapher.save() diff --git a/etl/steps/data/grapher/owid/latest/key_indicators.py b/etl/steps/data/grapher/owid/latest/key_indicators.py index 4971e99b265..17311c164b9 100644 --- a/etl/steps/data/grapher/owid/latest/key_indicators.py +++ b/etl/steps/data/grapher/owid/latest/key_indicators.py @@ -1,7 +1,6 @@ from copy import deepcopy from typing import Any, List -import numpy as np from owid import catalog from etl.paths import DATA_DIR @@ -91,10 +90,6 @@ def _add_metric_new( display_name_suffix: str, description: str = "", ) -> catalog.Table: - # Get dtype - dtype = table[metric].dtype - if np.issubdtype(table[metric].dtype, np.integer): - dtype = "Int64" metric_new = f"{metric}_{metric_suffix}" table.loc[mask, metric_new] = deepcopy(table.loc[mask, metric]) table[metric_new].metadata = deepcopy(table[metric].metadata) @@ -105,4 +100,10 @@ def _add_metric_new( "name" ] = f"{table[metric_new].metadata.display['name']} {display_name_suffix}" table[metric_new].metadata.description = description + + # Get dtype + dtype = table[metric].dtype + if "int" in str(dtype).lower(): + dtype = "Int64" + return table.astype({metric_new: dtype}) diff --git a/etl/steps/data/grapher/un/2024-01-17/urban_agglomerations_largest_cities.py b/etl/steps/data/grapher/un/2024-01-17/urban_agglomerations_largest_cities.py index e922b1b7ebe..10e396c4de9 100644 --- a/etl/steps/data/grapher/un/2024-01-17/urban_agglomerations_largest_cities.py +++ b/etl/steps/data/grapher/un/2024-01-17/urban_agglomerations_largest_cities.py @@ -31,15 +31,15 @@ def run(dest_dir: str) -> None: ) tb = tb.drop(columns=["rank_order", "population_capital", "country_code"]) - # Create two new dataframes to separate data into estimates and projections (pre-2019 and post-2019) - past_estimates = tb[tb["year"] < 2019].copy() - future_projections = tb[tb["year"] >= 2019].copy() + # Create two new dataframes to separate data into estimates and projections + past_estimates = tb[tb["year"] <= 2015].copy() + future_projections = tb[tb["year"] >= 2015].copy() # Now, for each column in the original dataframe, split it into two for col in tb.columns: if col not in ["country", "year"]: - past_estimates[f"{col}_estimates"] = tb.loc[tb["year"] < 2019, col] - future_projections[f"{col}_projections"] = tb.loc[tb["year"] >= 2019, col] + past_estimates[f"{col}_estimates"] = tb.loc[tb["year"] <= 2015, col] + future_projections[f"{col}_projections"] = tb.loc[tb["year"] >= 2015, col] past_estimates = past_estimates.drop(columns=[col]) future_projections = future_projections.drop(columns=[col]) diff --git a/etl/steps/data/grapher/un/2024-03-14/un_wpp_most.py b/etl/steps/data/grapher/un/2024-03-14/un_wpp_most.py index d3919bc2c93..dd1bd352efd 100644 --- a/etl/steps/data/grapher/un/2024-03-14/un_wpp_most.py +++ b/etl/steps/data/grapher/un/2024-03-14/un_wpp_most.py @@ -15,12 +15,12 @@ def run(dest_dir: str) -> None: # Read five-year age-group table from garden dataset. tb_five = ds_garden["population_5_year_age_groups"].reset_index() - tb_five = tb_five.rename(columns={"location": "country"}) + # tb_five = tb_five.rename(columns={"location": "country"}) tb_five = tb_five.set_index(["country", "year"], verify_integrity=True) # Read ten-year age-group table from garden dataset. tb_ten = ds_garden["population_10_year_age_groups"].reset_index() - tb_ten = tb_ten.rename(columns={"location": "country"}) + # tb_ten = tb_ten.rename(columns={"location": "country"}) tb_ten = tb_ten.set_index(["country", "year"], verify_integrity=True) # Save outputs. 
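+ # NOTE: The `location` -> `country` renames above are commented out rather than deleted, + # presumably because the garden tables now already provide a `country` column.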
# diff --git a/etl/steps/data/grapher/un/2024-08-27/un_sdg.py b/etl/steps/data/grapher/un/2024-08-27/un_sdg.py index 2f03b35820d..92aa4d73b94 100644 --- a/etl/steps/data/grapher/un/2024-08-27/un_sdg.py +++ b/etl/steps/data/grapher/un/2024-08-27/un_sdg.py @@ -51,7 +51,7 @@ def run(dest_dir: str) -> None: log.info("un_sdg.process", table_name=var) - tb = ds_garden.read_table(var) + tb = ds_garden.read(var, safe_types=False) tb = create_table(tb) diff --git a/etl/steps/data/grapher/un/2024-09-16/long_run_child_mortality.meta.yml b/etl/steps/data/grapher/un/2024-09-16/long_run_child_mortality.meta.yml deleted file mode 100644 index 7d495c14975..00000000000 --- a/etl/steps/data/grapher/un/2024-09-16/long_run_child_mortality.meta.yml +++ /dev/null @@ -1,70 +0,0 @@ -definitions: - common: - processing_level: major - presentation: - topic_tags: - - Child & Infant Mortality -dataset: - update_period_days: 365 -tables: - long_run_child_mortality_selected: - variables: - under_five_mortality_selected: - title: Under-five mortality rate - unit: deaths per 100 live births - short_unit: "%" - display: - numDecimalPlaces: 1 - description_short: The long-run estimated share of [newborns](#dod:newborn) who die before reaching the age of five. - description_key: - - This long-run indicator is a combination of two data sources, Gapminder and the UN Inter-agency Group for Child Mortality Estimation (UN IGME). - - The historical data is compiled by Gapminder, the full range of sources used can be found in the [Gapminder documentation](https://www.gapminder.org/data/documentation/gd005/). - description_processing: >- - This indicator is a combination of data from two sources: - - The UN Inter-agency Group for Child Mortality Estimation (UN IGME) provides estimates of child mortality rates, which is available for some countries from 1932. - - Gapminder provides estimates of child mortality rates for the years 1800 to 2015. - - We combine the two datasets, for years where both are available, we have a preference for the UN IGME data. - presentation: - title_public: Under-five mortality rate - title_variant: Long-run data - attribution_short: UN IGME; Gapminder - grapher_config: - title: Child mortality rate - subtitle: The estimated share of [newborns](#dod:newborn) who die before reaching the age of five. - variantName: Long-run data; Gapminder & UN IGME - sourceDesc: UN IGME (2023); Gapminder (2015) - originUrl: https://ourworldindata.org/child-mortality - hasMapTab: true - yAxis: - max: 0 - min: 0 - minTime: 1800 - map: - time: latest - colorScale: - baseColorScheme: YlOrRd - binningStrategy: manual - customNumericColors: - - null - - null - - null - customNumericValues: - - 0.3 - - 0.5 - - 1 - - 3 - - 5 - - 10 - - 30 - - 50 - customNumericMinValue: 0 - timeTolerance: 0 - selectedEntityNames: - - United States - - United Kingdom - - Sweden - - France - - Brazil - - India - $schema: https://files.ourworldindata.org/schemas/grapher-schema.003.json diff --git a/etl/steps/data/grapher/un/2024-10-21/census_dates.meta.yml b/etl/steps/data/grapher/un/2024-10-21/census_dates.meta.yml new file mode 100644 index 00000000000..b210efdccc1 --- /dev/null +++ b/etl/steps/data/grapher/un/2024-10-21/census_dates.meta.yml @@ -0,0 +1,34 @@ +# NOTE: To learn more about the fields, hover over their names. 
+definitions: + common: + presentation: + topic_tags: + - State Capacity + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + years_since_last_census: + variables: + years_since_last_census: + title: Years since last census + description_short: The number of years since the last census of this country + unit: years + short_unit: "" + last_census: + title: Year of the last census + description_short: Year of the last census in this country + unit: year + short_unit: "" + recent_census: + title: Population census recently completed + description_short: Population census completed in the last 10 years. Population censuses collect data on the size, distribution and composition of the population. + unit: "" + short_unit: "" + + diff --git a/etl/steps/data/grapher/un/2024-10-21/census_dates.py b/etl/steps/data/grapher/un/2024-10-21/census_dates.py new file mode 100644 index 00000000000..b02693da893 --- /dev/null +++ b/etl/steps/data/grapher/un/2024-10-21/census_dates.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("census_dates") + + # Read table from garden dataset. + tb = ds_garden["years_since_last_census"] + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/unesco/2024-11-21/enrolment_rates.py b/etl/steps/data/grapher/unesco/2024-11-21/enrolment_rates.py new file mode 100644 index 00000000000..7c53b2ca945 --- /dev/null +++ b/etl/steps/data/grapher/unesco/2024-11-21/enrolment_rates.py @@ -0,0 +1,27 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("enrolment_rates") + + # Read table from garden dataset. + tb = ds_garden["enrolment_rates"] + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/urbanization/2024-12-02/ghsl_urban_centers.py b/etl/steps/data/grapher/urbanization/2024-12-02/ghsl_urban_centers.py new file mode 100644 index 00000000000..7ed2180485c --- /dev/null +++ b/etl/steps/data/grapher/urbanization/2024-12-02/ghsl_urban_centers.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("ghsl_urban_centers") + + # Read table from garden dataset. 
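+ # NOTE: `reset_index=False` is assumed to keep the garden table's existing index + # instead of returning the index levels as ordinary columns.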
+ tb = ds_garden.read("ghsl_urban_centers", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/war/2024-11-22/ucdp_preview.py b/etl/steps/data/grapher/war/2024-11-22/ucdp_preview.py new file mode 100644 index 00000000000..7fb6dfc6f26 --- /dev/null +++ b/etl/steps/data/grapher/war/2024-11-22/ucdp_preview.py @@ -0,0 +1,50 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("ucdp_preview") + + # Read table from garden dataset. + tb = ds_garden["ucdp_preview"] + + # Process data. + # + # Rename index column `region` to `country`. + tb = tb.reset_index().rename(columns={"region": "country"}) + # Remove suffixes in region names + tb["country"] = tb["country"].str.replace(r" \(.+\)", "", regex=True) + # Set index + tb = tb.set_index(["year", "country", "conflict_type"]) + + # Get country-level data + tb_participants = ds_garden["ucdp_preview_country"] + tb_locations = ds_garden["ucdp_preview_locations"] + + tables = [ + tb, + tb_participants, + tb_locations, + ] + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Remove source description so that it doesn't get appended to the dataset description. + # ds_grapher.metadata.sources[0].description = "" + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/wb/2022-10-03/extreme_poverty_by_region.meta.yml b/etl/steps/data/grapher/wb/2022-10-03/extreme_poverty_by_region.meta.yml index f5add1f2cf3..b511440225d 100644 --- a/etl/steps/data/grapher/wb/2022-10-03/extreme_poverty_by_region.meta.yml +++ b/etl/steps/data/grapher/wb/2022-10-03/extreme_poverty_by_region.meta.yml @@ -80,7 +80,7 @@ tables: This data is expressed in [international-$](#dod:int_dollar_abbreviation) at 2017 prices. Depending on the country and year, it relates to income measured after taxes and benefits, or to consumption, [per capita](#dod:per-capita). - type: StackedArea + chartTypes: ["StackedArea"] addCountryMode: disabled hideRelativeToggle: false originUrl: https://ourworldindata.org/poverty diff --git a/etl/steps/data/grapher/wb/2024-11-04/edstats.py b/etl/steps/data/grapher/wb/2024-11-04/edstats.py new file mode 100644 index 00000000000..4ff01c79b9a --- /dev/null +++ b/etl/steps/data/grapher/wb/2024-11-04/edstats.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("edstats") + + # Read table from garden dataset. + tb = ds_garden["edstats"] + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. 
+ ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/wb/2024-12-03/poverty_projections.py b/etl/steps/data/grapher/wb/2024-12-03/poverty_projections.py new file mode 100644 index 00000000000..08e7178862b --- /dev/null +++ b/etl/steps/data/grapher/wb/2024-12-03/poverty_projections.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("poverty_projections") + + # Read table from garden dataset. + tb = ds_garden.read("poverty_projections", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml b/etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml index ac77909851f..ebbbfe45d30 100644 --- a/etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml +++ b/etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml @@ -3,6 +3,21 @@ tables: variables: prevalence_of_overweight_among_adults__bmi__gt__25__age_standardized_estimate__pct: title: Share of adults who are overweight (age-standardized) + maternal_mortality_ratio__per_100_000_live_births: + variables: + maternal_mortality_ratio__per_100_000_live_births: + display: + entityAnnotationsMap: 'United States: Values from 2003–2017 affected by measurement change' + maternal_mortality_ratio__per_100_000_live_births__country_reported_estimates: + variables: + maternal_mortality_ratio__per_100_000_live_births__country_reported_estimates: + display: + entityAnnotationsMap: 'United States: Values from 2003–2017 affected by measurement change' + number_of_maternal_deaths: + variables: + number_of_maternal_deaths: + display: + entityAnnotationsMap: 'United States: Values from 2003–2017 affected by measurement change' stunting_prevalence_among_children_under_5_years_of_age__pct_height_for_age__lt__2_sd__model_based_estimates: variables: stunting_prevalence_among_children_under_5_years_of_age__pct_height_for_age__lt__2_sd__model_based_estimates: diff --git a/etl/steps/data/grapher/who/2024-07-30/ghe.py b/etl/steps/data/grapher/who/2024-07-30/ghe.py index 1c5b6eaa58c..a4fba2838fc 100644 --- a/etl/steps/data/grapher/who/2024-07-30/ghe.py +++ b/etl/steps/data/grapher/who/2024-07-30/ghe.py @@ -20,8 +20,8 @@ def run(dest_dir: str) -> None: # Save outputs. # tables = [ - tb_garden, tb_garden_ratio, + tb_garden, ] # Create a new grapher dataset with the same metadata as the garden dataset. diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/fao_1949.py b/etl/steps/data/meadow/agriculture/2024-05-23/fao_1949.py index 0e965f59f58..4ad3adcb1ed 100644 --- a/etl/steps/data/meadow/agriculture/2024-05-23/fao_1949.py +++ b/etl/steps/data/meadow/agriculture/2024-05-23/fao_1949.py @@ -13,7 +13,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and read its data. 
snap = paths.load_snapshot("fao_1949.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/fao_2000.py b/etl/steps/data/meadow/agriculture/2024-05-23/fao_2000.py index 7c2b95c3e46..57380e279b2 100644 --- a/etl/steps/data/meadow/agriculture/2024-05-23/fao_2000.py +++ b/etl/steps/data/meadow/agriculture/2024-05-23/fao_2000.py @@ -13,7 +13,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and read its data. snap = paths.load_snapshot("fao_2000.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/fogel_2004.py b/etl/steps/data/meadow/agriculture/2024-05-23/fogel_2004.py index 45bcd82f1a2..37003fd7d70 100644 --- a/etl/steps/data/meadow/agriculture/2024-05-23/fogel_2004.py +++ b/etl/steps/data/meadow/agriculture/2024-05-23/fogel_2004.py @@ -13,7 +13,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and read its data. snap = paths.load_snapshot("fogel_2004.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/grigg_1995.py b/etl/steps/data/meadow/agriculture/2024-05-23/grigg_1995.py index b455d709750..82caf2002e5 100644 --- a/etl/steps/data/meadow/agriculture/2024-05-23/grigg_1995.py +++ b/etl/steps/data/meadow/agriculture/2024-05-23/grigg_1995.py @@ -13,7 +13,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and read its data. snap = paths.load_snapshot("grigg_1995.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py b/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py index 6fb275bf11a..4a6660f21da 100644 --- a/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py +++ b/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py @@ -12,7 +12,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and read its data. snap = paths.load_snapshot("harris_et_al_2015.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/jonsson_1998.py b/etl/steps/data/meadow/agriculture/2024-05-23/jonsson_1998.py index ae84f20378c..f7b97a260a4 100644 --- a/etl/steps/data/meadow/agriculture/2024-05-23/jonsson_1998.py +++ b/etl/steps/data/meadow/agriculture/2024-05-23/jonsson_1998.py @@ -13,7 +13,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and read its data. snap = paths.load_snapshot("jonsson_1998.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
diff --git a/etl/steps/data/meadow/ahdi/2023-09-08/augmented_hdi.py b/etl/steps/data/meadow/ahdi/2023-09-08/augmented_hdi.py index 0abcdac55a1..580b24811a5 100644 --- a/etl/steps/data/meadow/ahdi/2023-09-08/augmented_hdi.py +++ b/etl/steps/data/meadow/ahdi/2023-09-08/augmented_hdi.py @@ -32,7 +32,7 @@ def run(dest_dir: str) -> None: for sheet, sheet_name in excel_sheets.items(): # Retrieve snapshots snap = paths.load_snapshot("augmented_hdi.xlsx") - tb = snap.read(sheet_name=sheet, skiprows=1) + tb = snap.read(safe_types=False, sheet_name=sheet, skiprows=1) snap_region = paths.load_snapshot("augmented_hdi_region.xlsx") tb_region = snap_region.read(sheet_name=sheet + 1, skiprows=2) diff --git a/etl/steps/data/meadow/animal_welfare/2023-10-24/fur_laws.py b/etl/steps/data/meadow/animal_welfare/2023-10-24/fur_laws.py index ae6806773f1..d6c740e6a3e 100644 --- a/etl/steps/data/meadow/animal_welfare/2023-10-24/fur_laws.py +++ b/etl/steps/data/meadow/animal_welfare/2023-10-24/fur_laws.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and read data. snap = paths.load_snapshot("fur_laws.xlsx") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/antibiotics/2024-10-09/gram.py b/etl/steps/data/meadow/antibiotics/2024-10-09/gram.py index ebd206e7fa1..112da53f0a5 100644 --- a/etl/steps/data/meadow/antibiotics/2024-10-09/gram.py +++ b/etl/steps/data/meadow/antibiotics/2024-10-09/gram.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gram.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/antibiotics/2024-10-09/gram_children.py b/etl/steps/data/meadow/antibiotics/2024-10-09/gram_children.py index 80e1b7be5b4..dc5a63d7d25 100644 --- a/etl/steps/data/meadow/antibiotics/2024-10-09/gram_children.py +++ b/etl/steps/data/meadow/antibiotics/2024-10-09/gram_children.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gram_children.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/antibiotics/2024-10-09/gram_level.py b/etl/steps/data/meadow/antibiotics/2024-10-09/gram_level.py index 7a7f8a92853..507ac2e7650 100644 --- a/etl/steps/data/meadow/antibiotics/2024-10-09/gram_level.py +++ b/etl/steps/data/meadow/antibiotics/2024-10-09/gram_level.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gram_level.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/antibiotics/2024-10-18/who_glass.py b/etl/steps/data/meadow/antibiotics/2024-10-18/who_glass.py new file mode 100644 index 00000000000..53541eb2943 --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-10-18/who_glass.py @@ -0,0 +1,43 @@ +"""Load a snapshot and create a meadow dataset.""" + +from owid.catalog import processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("who_glass.zip") + + tables = [] + # Load data from snapshot. 
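+ # The archive is assumed to hold one CSV per syndrome and year; read each file, + # tag it with its syndrome and year, and concatenate everything below.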
+ for syndrome in ["URINE", "BLOOD", "STOOL", "UROGENITAL"]: + for year in range(2016, 2023): + tb = snap.read_in_archive( + filename=f"{syndrome}_{year}.csv", + skiprows=4, + ) + tb["syndrome"] = syndrome + tb["year"] = year + tables.append(tb) + + tb = pr.concat(tables) + # Process data. + tb = tb.rename(columns={"CountryTerritoryArea": "country"}) + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "syndrome"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/antibiotics/2024-10-18/who_glass_by_antibiotic.py b/etl/steps/data/meadow/antibiotics/2024-10-18/who_glass_by_antibiotic.py new file mode 100644 index 00000000000..44734a61c30 --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-10-18/who_glass_by_antibiotic.py @@ -0,0 +1,64 @@ +"""Load a snapshot and create a meadow dataset.""" + +import zipfile + +from owid.catalog import processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("who_glass_by_antibiotic.zip") + + tables = [] + # Load data from snapshot. + with zipfile.ZipFile(snap.path, "r") as zip_file: + # Get all csv files in the zip file - for some reason there are duplicated with __MACOSX at the start, we'll drop these + csv_files = [ + file_name for file_name in zip_file.namelist() if file_name.endswith(".csv") and "__MACOSX" not in file_name + ] + for file_name in csv_files: + tb = snap.read_in_archive( + filename=file_name, + skiprows=8, + encoding="ISO-8859-1", + ) + # Read in the filters from the csv file which contain important information on the slice of data + filters = snap.read_in_archive(filename=file_name, nrows=6, header=None, usecols=[0], encoding="ISO-8859-1") + tb.columns = [ + "country", + "bcis_per_million", + "total_bcis", + "bcis_with_ast_per_million", + "total_bcis_with_ast", + "share_bcis_with_ast", + ] + # adding additional columns of key information stored in the csv + tb["year"] = filters.iloc[1, 0].split(":")[-1] + tb["syndrome"] = filters.iloc[3, 0].split(":")[-1] + tb["pathogen"] = filters.iloc[4, 0].split(":")[-1] + tb["antibiotic"] = filters.iloc[5, 0].split(":")[-1] + assert all( + tb[["year", "syndrome", "pathogen", "antibiotic"]].notna() + ), f"missing key information in {file_name}" + tables.append(tb) + + tb = pr.concat(tables) + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "syndrome", "pathogen", "antibiotic"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. 
+ ds_meadow.save() diff --git a/etl/steps/data/meadow/antibiotics/2024-10-23/animuse_year.py b/etl/steps/data/meadow/antibiotics/2024-10-23/animuse_year.py index 357cb41bc56..8792124f04c 100644 --- a/etl/steps/data/meadow/antibiotics/2024-10-23/animuse_year.py +++ b/etl/steps/data/meadow/antibiotics/2024-10-23/animuse_year.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("animuse_year.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/antibiotics/2024-10-23/tracss.py b/etl/steps/data/meadow/antibiotics/2024-10-23/tracss.py index 06adbb1c066..bc5441a7307 100644 --- a/etl/steps/data/meadow/antibiotics/2024-10-23/tracss.py +++ b/etl/steps/data/meadow/antibiotics/2024-10-23/tracss.py @@ -30,7 +30,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("tracss.xlsx") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/antibiotics/2024-11-12/antimicrobial_usage.py b/etl/steps/data/meadow/antibiotics/2024-11-12/antimicrobial_usage.py new file mode 100644 index 00000000000..c05d7c4d719 --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-11-12/antimicrobial_usage.py @@ -0,0 +1,47 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("antimicrobial_usage.xlsx") + + # Load data from snapshot. + tb_class = snap.read(sheet_name="Antimicrobial_Use_ATC4") + tb_aware = snap.read(sheet_name="Antibiotic_Use_AWaRe") + # The sheet containing notes on the data which can go in the metadata + tb_note = snap.read(sheet_name="Notes") + note_dict = dict(zip(tb_note["Notes"], tb_note["Explenation"])) + + tb_class = tb_class.rename(columns={"CountryTerritoryArea": "country", "Year": "year"}) + tb_aware = tb_aware.drop(columns=["COUNTRY"]) + tb_aware = tb_aware.rename(columns={"CountryTerritoryArea": "country", "Year": "year"}) + + tb_class["Notes"] = tb_class["Notes"].map(note_dict) + tb_aware["Notes"] = tb_aware["Notes"].map(note_dict) + + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb_class = tb_class.format( + ["country", "year", "antimicrobialclass", "atc4name", "routeofadministration"], short_name="class" + ) + tb_aware = tb_aware.format(["country", "year", "awarelabel"], short_name="aware") + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset( + dest_dir, tables=[tb_class, tb_aware], check_variables_metadata=True, default_metadata=snap.metadata + ) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/antibiotics/2024-11-15/testing_coverage.py b/etl/steps/data/meadow/antibiotics/2024-11-15/testing_coverage.py new file mode 100644 index 00000000000..c993d0ada08 --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-11-15/testing_coverage.py @@ -0,0 +1,51 @@ +"""Load a snapshot and create a meadow dataset.""" + +from owid.catalog import processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. 
+ # + # Retrieve snapshot. + snap = paths.load_snapshot("testing_coverage.zip") + + # Load data from snapshot. + regions = [ + "African Region", + "Region of the Americas", + "South-East Asia Region", + "European Region", + "Eastern Mediterranean Region", + "Western Pacific Region", + "All", + ] + tables = [] + for region in regions: + tb = snap.read_in_archive( + filename=f"who_glass_testing_coverage/Testing coverage by infectious syndrome_{region}.csv", skiprows=4 + ) + tb["country"] = region + tables.append(tb) + + tb = pr.concat(tables) + tb = tb.rename(columns={"Year": "year"}) + # + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "specimen"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/antibiotics/2024-11-20/microbe.py b/etl/steps/data/meadow/antibiotics/2024-11-20/microbe.py new file mode 100644 index 00000000000..406587008d8 --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-11-20/microbe.py @@ -0,0 +1,38 @@ +"""Load a snapshot and create a meadow dataset.""" + +from owid.catalog import processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +YEARS = range(1990, 2022) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("microbe.zip") + tables = [] + for year in YEARS: + # Load data from snapshot. + tb = snap.read_in_archive(filename=f"neonatal/pathogen_{year}.csv") + tables.append(tb) + tb = pr.concat(tables) + # + # Process data. + tb = tb.rename(columns={"Location": "country", "Year": "year"}) + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "pathogen"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/antibiotics/2024-11-20/pathogen_bloodstream.py b/etl/steps/data/meadow/antibiotics/2024-11-20/pathogen_bloodstream.py new file mode 100644 index 00000000000..8fcfab4386d --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-11-20/pathogen_bloodstream.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("pathogen_bloodstream.csv") + + # Load data from snapshot. + tb = snap.read() + + # + # Process data. + tb = tb.rename(columns={"Location": "country", "Year": "year"}) + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "pathogen"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. 
+ ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/antibiotics/2024-11-24/total_syndrome.py b/etl/steps/data/meadow/antibiotics/2024-11-24/total_syndrome.py new file mode 100644 index 00000000000..9449bd70a0a --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-11-24/total_syndrome.py @@ -0,0 +1,31 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("total_syndrome.csv") + + # Load data from snapshot. + tb = snap.read() + + tb = tb.drop(columns=["Age", "Sex", "Measure", "Metric", "Pathogen", "Counterfactual"]) + tb = tb.rename(columns={"Location": "country", "Year": "year"}) + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "infectious_syndrome"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/antibiotics/2024-12-02/microbe_amr.py b/etl/steps/data/meadow/antibiotics/2024-12-02/microbe_amr.py new file mode 100644 index 00000000000..6dd02b0b3a3 --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-12-02/microbe_amr.py @@ -0,0 +1,36 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("microbe_amr.csv") + + # Load data from snapshot. + tb = snap.read() + assert all(tb["Age"] == "Neonatal") + + # + # Process data. + # + tb = tb.drop(columns=["Age", "Sex", "Measure", "Metric", "Pathogen"]) + tb = tb.rename(columns={"Location": "country", "Year": "year"}) + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "infectious_syndrome", "counterfactual"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/antibiotics/2024-12-02/microbe_neonatal_amr.py b/etl/steps/data/meadow/antibiotics/2024-12-02/microbe_neonatal_amr.py new file mode 100644 index 00000000000..a1126c3a231 --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-12-02/microbe_neonatal_amr.py @@ -0,0 +1,40 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("microbe_neonatal_amr.csv") + + # Load data from snapshot. 
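+ # The asserts below pin down the exported slice (neonatal age group, a single counterfactual, + # bloodstream infections only), so a re-export with different filters fails loudly.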
+ tb = snap.read() + assert all(tb["Age"] == "Neonatal") + assert len(tb["Counterfactual"].unique()) == 1 + assert all(tb["Infectious syndrome"] == "Bloodstream infections") + # + # Process data. + # + tb = tb.drop(columns=["Age", "Sex", "Measure", "Metric", "Infectious syndrome", "Pathogen Type", "Counterfactual"]) + tb = tb.rename(columns={"Location": "country", "Year": "year"}) + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "pathogen"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save()
diff --git a/etl/steps/data/meadow/antibiotics/2024-12-02/total_pathogen_bloodstream.py b/etl/steps/data/meadow/antibiotics/2024-12-02/total_pathogen_bloodstream.py new file mode 100644 index 00000000000..63ef94b626b --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-12-02/total_pathogen_bloodstream.py @@ -0,0 +1,40 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("total_pathogen_bloodstream.csv") + + # Load data from snapshot. + tb = snap.read() + # Check that the right slice of data has been uploaded. + assert all(tb["Age"] == "All Ages") + assert all(tb["Sex"] == "Both sexes") + assert all(tb["Measure"] == "Deaths") + assert all(tb["Metric"] == "Number") + assert all(tb["Counterfactual"] == "Total") + assert all(tb["Infectious syndrome"] == "Bloodstream infections") + + # + # Process data. + tb = tb.drop(columns=["Age", "Sex", "Measure", "Metric", "Infectious syndrome", "Counterfactual"]) + tb = tb.rename(columns={"Location": "country", "Year": "year"}) + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "pathogen"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save()
diff --git a/etl/steps/data/meadow/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.py b/etl/steps/data/meadow/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.py new file mode 100644 index 00000000000..80b21a69747 --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.py @@ -0,0 +1,42 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("total_pathogen_bloodstream_amr.csv") + + # Load data from snapshot. + tb = snap.read() + # Check that the right slice of data has been uploaded. + assert all(tb["Age"] == "All Ages") + assert all(tb["Sex"] == "Both sexes") + assert all(tb["Measure"] == "Deaths") + assert all(tb["Metric"] == "Number") + assert all(tb["Counterfactual"] == "Attributable") + assert all(tb["Infectious syndrome"] == "Bloodstream infections") + + # + # Process data. + # + tb = tb.drop(columns=["Age", "Sex", "Measure", "Metric", "Infectious syndrome", "Counterfactual"]) + tb = tb.rename(columns={"Location": "country", "Year": "year"}) + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "pathogen"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save()
diff --git a/etl/steps/data/meadow/antibiotics/2024-12-03/glass_enrolment.py b/etl/steps/data/meadow/antibiotics/2024-12-03/glass_enrolment.py new file mode 100644 index 00000000000..9c94caea506 --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-12-03/glass_enrolment.py @@ -0,0 +1,38 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("glass_enrolment.xlsx") + + # Load data from snapshot. + tb = snap.read() + # Drop the rows where there isn't a country name. + tb = tb.dropna(subset=["Code"]) + + # Check the number of countries. + assert len(tb["Country"]) == 197 + # Rename columns. + tb = tb.drop(columns=["Country"]).rename(columns={"Label": "country"}) + tb["year"] = snap.metadata.origin.date_published.split("-")[0] + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save()
diff --git a/etl/steps/data/meadow/antibiotics/2024-12-04/microbe_total_pathogens.py b/etl/steps/data/meadow/antibiotics/2024-12-04/microbe_total_pathogens.py new file mode 100644 index 00000000000..f03016ef003 --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-12-04/microbe_total_pathogens.py @@ -0,0 +1,39 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("microbe_total_pathogens.csv") + + # Load data from snapshot. + tb = snap.read() + assert all(tb["Age"] == "All Ages") + assert all(tb["Sex"] == "Both sexes") + assert all(tb["Measure"] == "Deaths") + assert all(tb["Metric"] == "Number") + assert all(tb["Counterfactual"] == "Total") + assert all(tb["Infectious syndrome"] == "All infectious syndromes") + + # + # Process data.
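+ # Only country, year and pathogen vary in this export; every other column was + # asserted constant above, so it can be dropped safely below.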
+ tb = tb.drop(columns=["Age", "Sex", "Measure", "Metric", "Infectious syndrome", "Counterfactual"]) + tb = tb.rename(columns={"Location": "country", "Year": "year", "Pathogen": "pathogen"}) + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "pathogen"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/antibiotics/2024-12-04/microbe_total_pathogens_amr.py b/etl/steps/data/meadow/antibiotics/2024-12-04/microbe_total_pathogens_amr.py new file mode 100644 index 00000000000..0e88870141e --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-12-04/microbe_total_pathogens_amr.py @@ -0,0 +1,39 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("microbe_total_pathogens_amr.csv") + + # Load data from snapshot. + tb = snap.read() + assert all(tb["Age"] == "All Ages") + assert all(tb["Sex"] == "Both sexes") + assert all(tb["Measure"] == "Deaths") + assert all(tb["Metric"] == "Number") + assert all(tb["Counterfactual"] == "Attributable") + assert all(tb["Infectious syndrome"] == "All infectious syndromes") + + # + # Process data. + tb = tb.drop(columns=["Age", "Sex", "Measure", "Metric", "Infectious syndrome", "Counterfactual"]) + tb = tb.rename(columns={"Location": "country", "Year": "year", "Pathogen": "pathogen"}) + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "pathogen"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/antibiotics/2024-12-05/microbe_neonatal_total_amr.py b/etl/steps/data/meadow/antibiotics/2024-12-05/microbe_neonatal_total_amr.py new file mode 100644 index 00000000000..b79dae1416b --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-12-05/microbe_neonatal_total_amr.py @@ -0,0 +1,39 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("microbe_neonatal_total_amr.csv") + + # Load data from snapshot. + tb = snap.read() + assert all(tb["Age"] == "Neonatal") + assert all(tb["Sex"] == "Both sexes") + assert all(tb["Measure"] == "Deaths") + assert all(tb["Metric"] == "Number") + assert all(tb["Counterfactual"] == "Attributable") + assert all(tb["Infectious syndrome"] == "All infectious syndromes") + + # + # Process data. + tb = tb.drop(columns=["Age", "Sex", "Measure", "Metric", "Infectious syndrome", "Counterfactual"]) + tb = tb.rename(columns={"Location": "country", "Year": "year", "Pathogen": "pathogen"}) + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. 
+ tb = tb.format(["country", "year", "pathogen"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.py b/etl/steps/data/meadow/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.py new file mode 100644 index 00000000000..1f3c017f086 --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.py @@ -0,0 +1,39 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("microbe_total_deaths_by_syndrome.csv") + + # Load data from snapshot. + tb = snap.read() + assert all(tb["Age"] == "All Ages") + assert all(tb["Sex"] == "Both sexes") + assert all(tb["Measure"] == "Deaths") + assert all(tb["Metric"] == "Number") + assert all(tb["Pathogen"] == "All pathogens") + assert all(tb["Counterfactual"] == "Total") + + # + # Process data. + tb = tb.drop(columns=["Age", "Sex", "Measure", "Metric", "Pathogen", "Counterfactual"]) + tb = tb.rename(columns={"Location": "country", "Year": "year"}) + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "infectious_syndrome"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.py b/etl/steps/data/meadow/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.py new file mode 100644 index 00000000000..476dfee1a85 --- /dev/null +++ b/etl/steps/data/meadow/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.py @@ -0,0 +1,39 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("microbe_total_deaths_by_syndrome_amr.csv") + + # Load data from snapshot. + tb = snap.read() + assert all(tb["Age"] == "All Ages") + assert all(tb["Sex"] == "Both sexes") + assert all(tb["Measure"] == "Deaths") + assert all(tb["Metric"] == "Number") + assert all(tb["Pathogen"] == "All pathogens") + assert all(tb["Counterfactual"] == "Attributable") + + # + # Process data. + tb = tb.drop(columns=["Age", "Sex", "Measure", "Metric", "Pathogen", "Counterfactual"]) + tb = tb.rename(columns={"Location": "country", "Year": "year"}) + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "infectious_syndrome"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. 
+ ds_meadow.save() diff --git a/etl/steps/data/meadow/artificial_intelligence/2023-06-21/epoch.py b/etl/steps/data/meadow/artificial_intelligence/2023-06-21/epoch.py index a2f4d64924d..68f7786b5bf 100644 --- a/etl/steps/data/meadow/artificial_intelligence/2023-06-21/epoch.py +++ b/etl/steps/data/meadow/artificial_intelligence/2023-06-21/epoch.py @@ -24,7 +24,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("epoch.csv") # Read snapshot - df = snap.read() + df = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-04-02/dynabench.py b/etl/steps/data/meadow/artificial_intelligence/2024-04-02/dynabench.py index 0f82bab1401..ee9ee751dc9 100644 --- a/etl/steps/data/meadow/artificial_intelligence/2024-04-02/dynabench.py +++ b/etl/steps/data/meadow/artificial_intelligence/2024-04-02/dynabench.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: # Load data from snapshot. column_names = ["benchmark", "date", "performance"] - tb = snap.read(sheet_name="Chart Data", header=None, names=column_names) + tb = snap.read(safe_types=False, sheet_name="Chart Data", header=None, names=column_names) tb["date"] = tb["date"].astype(str) # Convert to string for extracting year tb["date"] = tb["date"].str[:4] # Extract year from date diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-06-03/epoch.py b/etl/steps/data/meadow/artificial_intelligence/2024-06-03/epoch.py index 0a18c82226a..6d6640eb26c 100644 --- a/etl/steps/data/meadow/artificial_intelligence/2024-06-03/epoch.py +++ b/etl/steps/data/meadow/artificial_intelligence/2024-06-03/epoch.py @@ -18,7 +18,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("epoch.csv") # Read snapshot - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-06-06/epoch_compute_cost.py b/etl/steps/data/meadow/artificial_intelligence/2024-06-06/epoch_compute_cost.py index 8b6240d2b37..a933f095d30 100644 --- a/etl/steps/data/meadow/artificial_intelligence/2024-06-06/epoch_compute_cost.py +++ b/etl/steps/data/meadow/artificial_intelligence/2024-06-06/epoch_compute_cost.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("epoch_compute_cost.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-06-19/epoch_compute_intensive.py b/etl/steps/data/meadow/artificial_intelligence/2024-06-19/epoch_compute_intensive.py index 6f5dbf32892..aeee8bd6f60 100644 --- a/etl/steps/data/meadow/artificial_intelligence/2024-06-19/epoch_compute_intensive.py +++ b/etl/steps/data/meadow/artificial_intelligence/2024-06-19/epoch_compute_intensive.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("epoch_compute_intensive.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-07-10/epoch.py b/etl/steps/data/meadow/artificial_intelligence/2024-07-10/epoch.py index 0a18c82226a..6d6640eb26c 100644 --- a/etl/steps/data/meadow/artificial_intelligence/2024-07-10/epoch.py +++ b/etl/steps/data/meadow/artificial_intelligence/2024-07-10/epoch.py @@ -18,7 +18,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("epoch.csv") # Read snapshot - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-07-11/epoch_gpus.py b/etl/steps/data/meadow/artificial_intelligence/2024-07-11/epoch_gpus.py index c17c3518f59..0720239a90a 100644 --- a/etl/steps/data/meadow/artificial_intelligence/2024-07-11/epoch_gpus.py +++ b/etl/steps/data/meadow/artificial_intelligence/2024-07-11/epoch_gpus.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("epoch_gpus.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-07-16/cset.py b/etl/steps/data/meadow/artificial_intelligence/2024-07-16/cset.py index 6273fe89b55..b4912f2b12a 100644 --- a/etl/steps/data/meadow/artificial_intelligence/2024-07-16/cset.py +++ b/etl/steps/data/meadow/artificial_intelligence/2024-07-16/cset.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("cset.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-08-05/epoch.py b/etl/steps/data/meadow/artificial_intelligence/2024-08-05/epoch.py index 0a18c82226a..6d6640eb26c 100644 --- a/etl/steps/data/meadow/artificial_intelligence/2024-08-05/epoch.py +++ b/etl/steps/data/meadow/artificial_intelligence/2024-08-05/epoch.py @@ -18,7 +18,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("epoch.csv") # Read snapshot - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-08-05/epoch_compute_intensive.py b/etl/steps/data/meadow/artificial_intelligence/2024-08-05/epoch_compute_intensive.py index bfb27c7bb46..1626df7ac13 100644 --- a/etl/steps/data/meadow/artificial_intelligence/2024-08-05/epoch_compute_intensive.py +++ b/etl/steps/data/meadow/artificial_intelligence/2024-08-05/epoch_compute_intensive.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("epoch_compute_intensive.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-09-09/epoch.py b/etl/steps/data/meadow/artificial_intelligence/2024-09-09/epoch.py index 0a18c82226a..6d6640eb26c 100644 --- a/etl/steps/data/meadow/artificial_intelligence/2024-09-09/epoch.py +++ b/etl/steps/data/meadow/artificial_intelligence/2024-09-09/epoch.py @@ -18,7 +18,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("epoch.csv") # Read snapshot - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive.py b/etl/steps/data/meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive.py index bfb27c7bb46..1626df7ac13 100644 --- a/etl/steps/data/meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive.py +++ b/etl/steps/data/meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("epoch_compute_intensive.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
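Aside: most hunks in this batch make the same one-line change, `snap.read()` to `snap.read(safe_types=False)`. The flag is defined by the repo's snapshot reader, which is not shown here; judging from the dtype-sensitive fixes elsewhere in this diff (such as the string cast in gco_infections.py below), a plausible reading is that "safe" reads upcast columns to nullable, pyarrow-backed dtypes, and `safe_types=False` opts out so existing steps keep plain pandas dtypes. A minimal sketch of that dtype difference using only pandas, offered as an illustration of the assumption rather than the reader's actual implementation:

import pandas as pd

df = pd.DataFrame({"country": ["France", "Chad"], "year": [2000, 2001]})
# What a "safe" read plausibly returns: nullable, pyarrow-backed dtypes.
safe = df.convert_dtypes(dtype_backend="pyarrow")
print(df.dtypes)    # country: object, year: int64
print(safe.dtypes)  # country: string[pyarrow], year: int64[pyarrow]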
diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-10-01/epoch.py b/etl/steps/data/meadow/artificial_intelligence/2024-10-01/epoch.py index 0a18c82226a..6d6640eb26c 100644 --- a/etl/steps/data/meadow/artificial_intelligence/2024-10-01/epoch.py +++ b/etl/steps/data/meadow/artificial_intelligence/2024-10-01/epoch.py @@ -18,7 +18,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("epoch.csv") # Read snapshot - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive.py b/etl/steps/data/meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive.py index bfb27c7bb46..1626df7ac13 100644 --- a/etl/steps/data/meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive.py +++ b/etl/steps/data/meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("epoch_compute_intensive.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-11-03/epoch.py b/etl/steps/data/meadow/artificial_intelligence/2024-11-03/epoch.py new file mode 100644 index 00000000000..0a18c82226a --- /dev/null +++ b/etl/steps/data/meadow/artificial_intelligence/2024-11-03/epoch.py @@ -0,0 +1,73 @@ +"""Load a snapshot and create a meadow dataset.""" + +import numpy as np + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch.start") + + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("epoch.csv") + + # Read snapshot + tb = snap.read() + + # + # Process data. + # + # Define columns of interest. + cols = [ + "System", + "Domain", + "Authors", + "Country (from Organization)", + "Organization", + "Organization categorization", + "Publication date", + "Parameters", + "Training compute (FLOP)", + "Training dataset size (datapoints)", + "Notability criteria", + ] + + # Check that the columns of interest are present + for col in cols: + assert col in tb.columns, f"Column '{col}' is missing from the dataframe." + + # Select the columns of interest + tb = tb[cols] + # Replace empty strings with NaN values + tb = tb.replace("", np.nan) + # Remove rows where all values are NaN + tb = tb.dropna(how="all") + + # Convert the training compute column to float + tb["Training compute (FLOP)"] = tb["Training compute (FLOP)"].astype(float) + + # Fill missing values in the system column with the organization column; if that is also NaN, fall back to the authors column + tb["System"] = tb["System"].fillna(tb["Organization"]).fillna(tb["Authors"]) + # Check that there are no NaN values in the system column + assert not tb["System"].isna().any(), "NaN values found in 'System' column after processing." + # + # Create a new table. + # + tb = tb.format(["system", "publication_date"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # Save changes in the new meadow dataset.
+ ds_meadow.save() + + paths.log.info("epoch.end") diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-11-03/epoch_compute_intensive.py b/etl/steps/data/meadow/artificial_intelligence/2024-11-03/epoch_compute_intensive.py new file mode 100644 index 00000000000..bfb27c7bb46 --- /dev/null +++ b/etl/steps/data/meadow/artificial_intelligence/2024-11-03/epoch_compute_intensive.py @@ -0,0 +1,66 @@ +"""Load a snapshot and create a meadow dataset.""" + +import numpy as np + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("epoch_compute_intensive.csv") + + # Load data from snapshot. + tb = snap.read() + + # + # Process data. + # + # Define columns of interest. + cols = [ + "System", + "Domain", + "Authors", + "Country (from Organization)", + "Organization", + "Publication date", + "Parameters", + "Training compute (FLOP)", + "Training dataset size (datapoints)", + ] + + # Check that the columns of interest are present + for col in cols: + assert col in tb.columns, f"Column '{col}' is missing from the dataframe." + + # Select the columns of interest + tb = tb[cols] + # Replace empty strings with NaN values + tb = tb.replace("", np.nan) + # Remove rows where all values are NaN + tb = tb.dropna(how="all") + + # Convert the training compute column to float + tb["Training compute (FLOP)"] = tb["Training compute (FLOP)"].astype(float) + + # Fill missing values in the system column with the organization column; if that is also NaN, fall back to the authors column + tb["System"] = tb["System"].fillna(tb["Organization"]).fillna(tb["Authors"]) + # Check that there are no NaN values in the system column + assert not tb["System"].isna().any(), "NaN values found in 'System' column after processing." + # + # Create a new table. + # + tb = tb.format(["system", "publication_date"]) + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-12-05/epoch.py b/etl/steps/data/meadow/artificial_intelligence/2024-12-05/epoch.py new file mode 100644 index 00000000000..916eff1f4e7 --- /dev/null +++ b/etl/steps/data/meadow/artificial_intelligence/2024-12-05/epoch.py @@ -0,0 +1,73 @@ +"""Load a snapshot and create a meadow dataset.""" + +import numpy as np + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch.start") + + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("epoch.csv") + + # Read snapshot + tb = snap.read() + + # + # Process data. + # + # Define columns of interest. + cols = [ + "Model", + "Domain", + "Authors", + "Country (from Organization)", + "Organization", + "Organization categorization", + "Publication date", + "Parameters", + "Training compute (FLOP)", + "Training dataset size (datapoints)", + "Notability criteria", + ] + + # Check that the columns of interest are present + for col in cols: + assert col in tb.columns, f"Column '{col}' is missing from the dataframe."
+ + # Select the columns of interest + tb = tb[cols] + # Replace empty strings with NaN values + tb = tb.replace("", np.nan) + # Remove rows where all values are NaN + tb = tb.dropna(how="all") + + # Convert the training compute column to float + tb["Training compute (FLOP)"] = tb["Training compute (FLOP)"].astype(float) + + # Fill missing values in the model column with the organization column; if that is also NaN, fall back to the authors column + tb["Model"] = tb["Model"].fillna(tb["Organization"]).fillna(tb["Authors"]) + # Check that there are no NaN values in the model column + assert not tb["Model"].isna().any(), "NaN values found in 'Model' column after processing." + # + # Create a new table. + # + tb = tb.format(["model", "publication_date"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() + + paths.log.info("epoch.end") diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-12-05/epoch_compute_intensive.py b/etl/steps/data/meadow/artificial_intelligence/2024-12-05/epoch_compute_intensive.py new file mode 100644 index 00000000000..a8509aef960 --- /dev/null +++ b/etl/steps/data/meadow/artificial_intelligence/2024-12-05/epoch_compute_intensive.py @@ -0,0 +1,66 @@ +"""Load a snapshot and create a meadow dataset.""" + +import numpy as np + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("epoch_compute_intensive.csv") + + # Load data from snapshot. + tb = snap.read() + + # + # Process data. + # + # Define columns of interest. + cols = [ + "Model", + "Domain", + "Authors", + "Country (from Organization)", + "Organization", + "Publication date", + "Parameters", + "Training compute (FLOP)", + "Training dataset size (datapoints)", + ] + + # Check that the columns of interest are present + for col in cols: + assert col in tb.columns, f"Column '{col}' is missing from the dataframe." + + # Select the columns of interest + tb = tb[cols] + # Replace empty strings with NaN values + tb = tb.replace("", np.nan) + # Remove rows where all values are NaN + tb = tb.dropna(how="all") + + # Convert the training compute column to float + tb["Training compute (FLOP)"] = tb["Training compute (FLOP)"].astype(float) + + # Fill missing values in the model column with the organization column; if that is also NaN, fall back to the authors column + tb["Model"] = tb["Model"].fillna(tb["Organization"]).fillna(tb["Authors"]) + # Check that there are no NaN values in the model column + assert not tb["Model"].isna().any(), "NaN values found in 'Model' column after processing." + # + # Create a new table. + # + tb = tb.format(["model", "publication_date"]) + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset.
+ ds_meadow.save() diff --git a/etl/steps/data/meadow/artificial_intelligence/latest/epoch.py b/etl/steps/data/meadow/artificial_intelligence/latest/epoch.py index 383928d042a..69a0be83f78 100644 --- a/etl/steps/data/meadow/artificial_intelligence/latest/epoch.py +++ b/etl/steps/data/meadow/artificial_intelligence/latest/epoch.py @@ -22,7 +22,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("epoch.csv") # Read snapshot - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/aviation_safety_network/2024-06-05/aviation_statistics.py b/etl/steps/data/meadow/aviation_safety_network/2024-06-05/aviation_statistics.py index bbc39f7554c..18be6848558 100644 --- a/etl/steps/data/meadow/aviation_safety_network/2024-06-05/aviation_statistics.py +++ b/etl/steps/data/meadow/aviation_safety_network/2024-06-05/aviation_statistics.py @@ -38,7 +38,7 @@ def run(dest_dir: str) -> None: snap_by_nature = paths.load_snapshot("aviation_statistics_by_nature.csv") # Load data from snapshots. - tb = snap.read() + tb = snap.read(safe_types=False) tb_by_period = snap_by_period.read() tb_by_nature = snap_by_nature.read() diff --git a/etl/steps/data/meadow/biodiversity/2023-01-11/cherry_blossom.py b/etl/steps/data/meadow/biodiversity/2023-01-11/cherry_blossom.py index 9c2be6c086d..73d54b7e381 100644 --- a/etl/steps/data/meadow/biodiversity/2023-01-11/cherry_blossom.py +++ b/etl/steps/data/meadow/biodiversity/2023-01-11/cherry_blossom.py @@ -17,7 +17,7 @@ def run(dest_dir: str) -> None: # retrieve snapshot snap = Snapshot("biodiversity/2023-01-11/cherry_blossom.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # clean and transform data tb = clean_data(tb) diff --git a/etl/steps/data/meadow/biodiversity/2024-08-12/invasive_species.py b/etl/steps/data/meadow/biodiversity/2024-08-12/invasive_species.py index bc13fbd6cff..79f36ae96c0 100644 --- a/etl/steps/data/meadow/biodiversity/2024-08-12/invasive_species.py +++ b/etl/steps/data/meadow/biodiversity/2024-08-12/invasive_species.py @@ -18,11 +18,11 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("invasive_species.xlsx") origins = [snap.metadata.origin] - tb_cont = snap.read(sheet_name="ContinentalTrends") + tb_cont = snap.read(safe_types=False, sheet_name="ContinentalTrends") tb_cont = format_continental_trends(tb_cont) tb_cont = tb_cont.format(["year", "continent"], short_name="continental") - tb_glob = snap.read(sheet_name="GlobalTrends") + tb_glob = snap.read(safe_types=False, sheet_name="GlobalTrends") tb_glob["country"] = "World" tb_glob = tb_glob.format(["country", "year"], short_name="global") # Adding the origins in diff --git a/etl/steps/data/meadow/biodiversity/2024-09-30/living_planet_index.py b/etl/steps/data/meadow/biodiversity/2024-09-30/living_planet_index.py index 7b67d59ea74..405d171e7ea 100644 --- a/etl/steps/data/meadow/biodiversity/2024-09-30/living_planet_index.py +++ b/etl/steps/data/meadow/biodiversity/2024-09-30/living_planet_index.py @@ -27,7 +27,7 @@ def run(dest_dir: str) -> None: all_tbs = Table() # Load data from snapshot. 
for sheet_name in sheet_names: - tb = snap.read(sheet_name=sheet_name) + tb = snap.read(safe_types=False, sheet_name=sheet_name) tb = tb[ [ "Unnamed: 0", diff --git a/etl/steps/data/meadow/bls/2024-05-16/us_consumer_prices.py b/etl/steps/data/meadow/bls/2024-05-16/us_consumer_prices.py index fc4ab83e808..e1215e15160 100644 --- a/etl/steps/data/meadow/bls/2024-05-16/us_consumer_prices.py +++ b/etl/steps/data/meadow/bls/2024-05-16/us_consumer_prices.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("us_consumer_prices.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # Process data. tb = tb.set_index(["Series ID", "Year", "Period"], verify_integrity=True) diff --git a/etl/steps/data/meadow/cancer/2024-08-30/gco_alcohol.py b/etl/steps/data/meadow/cancer/2024-08-30/gco_alcohol.py index 91fa0b37c47..5e97919d9ec 100644 --- a/etl/steps/data/meadow/cancer/2024-08-30/gco_alcohol.py +++ b/etl/steps/data/meadow/cancer/2024-08-30/gco_alcohol.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gco_alcohol.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/cancer/2024-09-06/gco_infections.py b/etl/steps/data/meadow/cancer/2024-09-06/gco_infections.py index 6b0c970bd36..dd7275f1c0c 100644 --- a/etl/steps/data/meadow/cancer/2024-09-06/gco_infections.py +++ b/etl/steps/data/meadow/cancer/2024-09-06/gco_infections.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gco_infections.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. @@ -22,7 +22,7 @@ def run(dest_dir: str) -> None: # Drop unnecessary columns. tb = tb.drop(columns=["id", "code", "ncases_sites", "ncases_all", "ir_att", "ir", "asr"]) - tb["sex"] = tb["sex"].replace({0: "both", 1: "males", 2: "females"}) + tb["sex"] = tb["sex"].astype(str).replace({"0": "both", "1": "males", "2": "females"}) # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. tb = tb.format(["country", "year", "sex", "agent", "cancer"]) diff --git a/etl/steps/data/meadow/cancer/2024-09-13/diagnosis_routes_by_route.py b/etl/steps/data/meadow/cancer/2024-09-13/diagnosis_routes_by_route.py index be90a73a276..2289fc78699 100644 --- a/etl/steps/data/meadow/cancer/2024-09-13/diagnosis_routes_by_route.py +++ b/etl/steps/data/meadow/cancer/2024-09-13/diagnosis_routes_by_route.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("diagnosis_routes_by_route.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) tb = tb[["Year", "Site", "Stage", "Route", "Count", "Percentage"]] tb = tb.rename(columns={"Count": "count_by_route", "Percentage": "percentage_by_route"}) diff --git a/etl/steps/data/meadow/cancer/2024-09-13/diagnosis_routes_by_stage.py b/etl/steps/data/meadow/cancer/2024-09-13/diagnosis_routes_by_stage.py index b555c3462bb..396cd860e8d 100644 --- a/etl/steps/data/meadow/cancer/2024-09-13/diagnosis_routes_by_stage.py +++ b/etl/steps/data/meadow/cancer/2024-09-13/diagnosis_routes_by_stage.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("diagnosis_routes_by_stage.csv") # Load data from snapshot. 
- tb = snap.read() + tb = snap.read(safe_types=False) tb = tb[["Year", "Site", "Stage", "Route", "Count", "Percentage"]] tb = tb.rename(columns={"Count": "count_by_stage", "Percentage": "percentage_by_stage"}) diff --git a/etl/steps/data/meadow/cancer/2024-09-13/diagnosis_routes_survival.py b/etl/steps/data/meadow/cancer/2024-09-13/diagnosis_routes_survival.py index f479a623e18..35deae8b702 100644 --- a/etl/steps/data/meadow/cancer/2024-09-13/diagnosis_routes_survival.py +++ b/etl/steps/data/meadow/cancer/2024-09-13/diagnosis_routes_survival.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("diagnosis_routes_survival.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) tb = tb[["Year", "Site", "Geography", "Gender", "Length", "Route", "Patients", "Survival"]] tb = tb.rename(columns={"Geography": "country"}) diff --git a/etl/steps/data/meadow/cancer/2024-10-13/gco_cancer_over_time_cervical.py b/etl/steps/data/meadow/cancer/2024-10-13/gco_cancer_over_time_cervical.py index e18e7f03026..835470cfa77 100644 --- a/etl/steps/data/meadow/cancer/2024-10-13/gco_cancer_over_time_cervical.py +++ b/etl/steps/data/meadow/cancer/2024-10-13/gco_cancer_over_time_cervical.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gco_cancer_over_time_cervical.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/cancer/2024-10-13/gco_cancer_today_cervical.py b/etl/steps/data/meadow/cancer/2024-10-13/gco_cancer_today_cervical.py index 3bcf35bca1f..d6608894900 100644 --- a/etl/steps/data/meadow/cancer/2024-10-13/gco_cancer_today_cervical.py +++ b/etl/steps/data/meadow/cancer/2024-10-13/gco_cancer_today_cervical.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gco_cancer_today_cervical.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
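Aside: the gco_infections.py hunk earlier in this batch shows why the dtype change is not cosmetic. Once a column is backed by strings, integer keys in `replace` silently stop matching, hence the explicit `astype(str)` before the mapping. A small sketch of the failure mode, assuming the column arrives as a pyarrow-backed string (as a "safe" read would plausibly produce):

import pandas as pd

sex = pd.Series([0, 1, 2], dtype="string[pyarrow]")  # values are stored as "0", "1", "2"
print(sex.replace({0: "both"}).tolist())  # unchanged: int keys do not match string values
print(sex.astype(str).replace({"0": "both", "1": "males", "2": "females"}).tolist())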
diff --git a/etl/steps/data/meadow/cedlas/2024-03-08/sedlac.py b/etl/steps/data/meadow/cedlas/2024-03-08/sedlac.py index 0407942d781..e41a4b6524a 100644 --- a/etl/steps/data/meadow/cedlas/2024-03-08/sedlac.py +++ b/etl/steps/data/meadow/cedlas/2024-03-08/sedlac.py @@ -199,7 +199,7 @@ def load_tables_from_snapshot(snap: Snapshot, sheets: Dict) -> List[Table]: """Load all the sheets from the snapshot.""" tables = [] for sheet in sheets: - tb = snap.read(sheet_name=sheet, header=sheets[sheet]["header"]) + tb = snap.read(safe_types=False, sheet_name=sheet, header=sheets[sheet]["header"]) tb.metadata.short_name = sheets[sheet]["short_name"] tables.append(tb) diff --git a/etl/steps/data/meadow/cedlas/2024-07-31/sedlac_poverty_2016.py b/etl/steps/data/meadow/cedlas/2024-07-31/sedlac_poverty_2016.py index f5e2159047f..e4517d8dc17 100644 --- a/etl/steps/data/meadow/cedlas/2024-07-31/sedlac_poverty_2016.py +++ b/etl/steps/data/meadow/cedlas/2024-07-31/sedlac_poverty_2016.py @@ -106,7 +106,7 @@ def load_tables_from_snapshot(snap: Snapshot, sheets: Dict) -> List[Table]: """Load all the sheets from the snapshot.""" tables = [] for sheet in sheets: - tb = snap.read(sheet_name=sheet, header=sheets[sheet]["header"]) + tb = snap.read(safe_types=False, sheet_name=sheet, header=sheets[sheet]["header"]) tb.metadata.short_name = sheets[sheet]["short_name"] tables.append(tb) diff --git a/etl/steps/data/meadow/cedlas/2024-07-31/sedlac_poverty_2018.py b/etl/steps/data/meadow/cedlas/2024-07-31/sedlac_poverty_2018.py index e59037895f4..30a70f5871a 100644 --- a/etl/steps/data/meadow/cedlas/2024-07-31/sedlac_poverty_2018.py +++ b/etl/steps/data/meadow/cedlas/2024-07-31/sedlac_poverty_2018.py @@ -106,7 +106,7 @@ def load_tables_from_snapshot(snap: Snapshot, sheets: Dict) -> List[Table]: """Load all the sheets from the snapshot.""" tables = [] for sheet in sheets: - tb = snap.read(sheet_name=sheet, header=sheets[sheet]["header"]) + tb = snap.read(safe_types=False, sheet_name=sheet, header=sheets[sheet]["header"]) tb.metadata.short_name = sheets[sheet]["short_name"] tables.append(tb) diff --git a/etl/steps/data/meadow/chartbook/2024-03-21/altimir_1986.py b/etl/steps/data/meadow/chartbook/2024-03-21/altimir_1986.py index 91a3c28b4ab..a12b6dabe41 100644 --- a/etl/steps/data/meadow/chartbook/2024-03-21/altimir_1986.py +++ b/etl/steps/data/meadow/chartbook/2024-03-21/altimir_1986.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("altimir_1986.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/chartbook/2024-04-22/concialdi.py b/etl/steps/data/meadow/chartbook/2024-04-22/concialdi.py index ae67f6415cb..89e795e51f9 100644 --- a/etl/steps/data/meadow/chartbook/2024-04-22/concialdi.py +++ b/etl/steps/data/meadow/chartbook/2024-04-22/concialdi.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("concialdi.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. 
tb = tb.format(["country", "year"]) diff --git a/etl/steps/data/meadow/chartbook/2024-05-23/wealth_france.py b/etl/steps/data/meadow/chartbook/2024-05-23/wealth_france.py index 79b7d5abd00..436a3d4f51d 100644 --- a/etl/steps/data/meadow/chartbook/2024-05-23/wealth_france.py +++ b/etl/steps/data/meadow/chartbook/2024-05-23/wealth_france.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("wealth_france.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # Add country tb["country"] = "France" diff --git a/etl/steps/data/meadow/chartbook/2024-08-01/hancock_1971_australia.py b/etl/steps/data/meadow/chartbook/2024-08-01/hancock_1971_australia.py index ae0711580a6..0af432124bf 100644 --- a/etl/steps/data/meadow/chartbook/2024-08-01/hancock_1971_australia.py +++ b/etl/steps/data/meadow/chartbook/2024-08-01/hancock_1971_australia.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("hancock_1971_australia.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/chartbook/2024-08-02/ingles_1981_australia.py b/etl/steps/data/meadow/chartbook/2024-08-02/ingles_1981_australia.py index 652dc04bb26..46a08cbff4c 100644 --- a/etl/steps/data/meadow/chartbook/2024-08-02/ingles_1981_australia.py +++ b/etl/steps/data/meadow/chartbook/2024-08-02/ingles_1981_australia.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("ingles_1981_australia.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/chartbook/2024-08-05/atkinson_2008_australia.py b/etl/steps/data/meadow/chartbook/2024-08-05/atkinson_2008_australia.py index 8f291861436..d359ff52eed 100644 --- a/etl/steps/data/meadow/chartbook/2024-08-05/atkinson_2008_australia.py +++ b/etl/steps/data/meadow/chartbook/2024-08-05/atkinson_2008_australia.py @@ -16,8 +16,8 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("atkinson_2008_australia.xls") # Load data from snapshot. - tb_oecd_lms = snap.read(sheet_name="Table A.3 (OECD LMS)", usecols="C:I", skiprows=2) - tb_eeh = snap.read(sheet_name="Table A.5 (EEH)", usecols="C:I", skiprows=3) + tb_oecd_lms = snap.read(safe_types=False, sheet_name="Table A.3 (OECD LMS)", usecols="C:I", skiprows=2) + tb_eeh = snap.read(safe_types=False, sheet_name="Table A.5 (EEH)", usecols="C:I", skiprows=3) # # Process data. diff --git a/etl/steps/data/meadow/chartbook/2024-08-05/katic_leigh_2015_australia.py b/etl/steps/data/meadow/chartbook/2024-08-05/katic_leigh_2015_australia.py index 6718170c87e..469a75e4002 100644 --- a/etl/steps/data/meadow/chartbook/2024-08-05/katic_leigh_2015_australia.py +++ b/etl/steps/data/meadow/chartbook/2024-08-05/katic_leigh_2015_australia.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("katic_leigh_2015_australia.xlsx") # Load data from snapshot. - tb = snap.read(sheet_name="Top wealth data", usecols="B,I", skiprows=4) + tb = snap.read(safe_types=False, sheet_name="Top wealth data", usecols="B,I", skiprows=4) # # Process data. 
diff --git a/etl/steps/data/meadow/chartbook/2024-08-08/langoni_1972_brazil.py b/etl/steps/data/meadow/chartbook/2024-08-08/langoni_1972_brazil.py index b6bb954c702..57cb1cc5a43 100644 --- a/etl/steps/data/meadow/chartbook/2024-08-08/langoni_1972_brazil.py +++ b/etl/steps/data/meadow/chartbook/2024-08-08/langoni_1972_brazil.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("langoni_1972_brazil.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/chartbook/2024-08-09/love_1979_canada.py b/etl/steps/data/meadow/chartbook/2024-08-09/love_1979_canada.py index 2b82fd467ad..d2df8190f4c 100644 --- a/etl/steps/data/meadow/chartbook/2024-08-09/love_1979_canada.py +++ b/etl/steps/data/meadow/chartbook/2024-08-09/love_1979_canada.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("love_1979_canada.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/chartbook/2024-08-09/wolfson_1986_canada.py b/etl/steps/data/meadow/chartbook/2024-08-09/wolfson_1986_canada.py index 7c1f8cd42f6..91a3045d898 100644 --- a/etl/steps/data/meadow/chartbook/2024-08-09/wolfson_1986_canada.py +++ b/etl/steps/data/meadow/chartbook/2024-08-09/wolfson_1986_canada.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("wolfson_1986_canada.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/chartbook/2024-08-14/atkinson_2008_canada.py b/etl/steps/data/meadow/chartbook/2024-08-14/atkinson_2008_canada.py index ce6d2c80661..f7166224326 100644 --- a/etl/steps/data/meadow/chartbook/2024-08-14/atkinson_2008_canada.py +++ b/etl/steps/data/meadow/chartbook/2024-08-14/atkinson_2008_canada.py @@ -16,9 +16,9 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("atkinson_2008_canada.xls") # Load data from snapshot. - tb_oecd_lms = snap.read(sheet_name="Table C.3 (OECD LMS)", usecols="C,G", skiprows=2) - tb_census = snap.read(sheet_name="Table C.4 (Census)", usecols="C,Y", skiprows=3) - tb_manufacturing = snap.read(sheet_name="Table C.5 (Manf)", usecols="C,I", skiprows=2) + tb_oecd_lms = snap.read(safe_types=False, sheet_name="Table C.3 (OECD LMS)", usecols="C,G", skiprows=2) + tb_census = snap.read(safe_types=False, sheet_name="Table C.4 (Census)", usecols="C,Y", skiprows=3) + tb_manufacturing = snap.read(safe_types=False, sheet_name="Table C.5 (Manf)", usecols="C,I", skiprows=2) # # Process data. diff --git a/etl/steps/data/meadow/chartbook/2024-08-15/davies_di_matteo_2020_canada.py b/etl/steps/data/meadow/chartbook/2024-08-15/davies_di_matteo_2020_canada.py index caf0cf60113..d4b087d58e7 100644 --- a/etl/steps/data/meadow/chartbook/2024-08-15/davies_di_matteo_2020_canada.py +++ b/etl/steps/data/meadow/chartbook/2024-08-15/davies_di_matteo_2020_canada.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("davies_di_matteo_2020_canada.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
diff --git a/etl/steps/data/meadow/chartbook/2024-08-15/jantti_2010_finland.py b/etl/steps/data/meadow/chartbook/2024-08-15/jantti_2010_finland.py index 7b1f231a8af..d63dc916ea1 100644 --- a/etl/steps/data/meadow/chartbook/2024-08-15/jantti_2010_finland.py +++ b/etl/steps/data/meadow/chartbook/2024-08-15/jantti_2010_finland.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("jantti_2010_finland.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/chartbook/2024-08-16/atkinson_2008_finland.py b/etl/steps/data/meadow/chartbook/2024-08-16/atkinson_2008_finland.py index c881c3fc759..dab42d1ac03 100644 --- a/etl/steps/data/meadow/chartbook/2024-08-16/atkinson_2008_finland.py +++ b/etl/steps/data/meadow/chartbook/2024-08-16/atkinson_2008_finland.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("atkinson_2008_finland.xls") # Load data from snapshot - tb = snap.read(sheet_name="Table F.3 (E and J", usecols="C,E", skiprows=2) + tb = snap.read(safe_types=False, sheet_name="Table F.3 (E and J", usecols="C,E", skiprows=2) # # Process data. diff --git a/etl/steps/data/meadow/chartbook/2024-08-19/becker_1997_germany.py b/etl/steps/data/meadow/chartbook/2024-08-19/becker_1997_germany.py index 0a30ea6eb3e..6f8005d6366 100644 --- a/etl/steps/data/meadow/chartbook/2024-08-19/becker_1997_germany.py +++ b/etl/steps/data/meadow/chartbook/2024-08-19/becker_1997_germany.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("becker_1997_germany.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/chartbook/2024-08-19/riihela_et_al_2003_finland.py b/etl/steps/data/meadow/chartbook/2024-08-19/riihela_et_al_2003_finland.py index 9661a3c7c8f..bd25130cc03 100644 --- a/etl/steps/data/meadow/chartbook/2024-08-19/riihela_et_al_2003_finland.py +++ b/etl/steps/data/meadow/chartbook/2024-08-19/riihela_et_al_2003_finland.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("riihela_et_al_2003_finland.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/chartbook/2024-08-19/roine_waldenstrom_2015.py b/etl/steps/data/meadow/chartbook/2024-08-19/roine_waldenstrom_2015.py index 5e89483cd35..ec9c4c5a448 100644 --- a/etl/steps/data/meadow/chartbook/2024-08-19/roine_waldenstrom_2015.py +++ b/etl/steps/data/meadow/chartbook/2024-08-19/roine_waldenstrom_2015.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("roine_waldenstrom_2015.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/chartbook/2024-10-24/atkinson_2008_germany.py b/etl/steps/data/meadow/chartbook/2024-10-24/atkinson_2008_germany.py index 76fbafaf609..7a34a16086f 100644 --- a/etl/steps/data/meadow/chartbook/2024-10-24/atkinson_2008_germany.py +++ b/etl/steps/data/meadow/chartbook/2024-10-24/atkinson_2008_germany.py @@ -19,7 +19,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("atkinson_2008_germany.xls") # Load data from snapshot - tb = snap.read(sheet_name="Table H.4", usecols="C,I", skiprows=2) + tb = snap.read(safe_types=False, sheet_name="Table H.4", usecols="C,I", skiprows=2) # # Process data. 
diff --git a/etl/steps/data/meadow/chartbook/2024-10-24/becker_1997_germany_relative_poverty.py b/etl/steps/data/meadow/chartbook/2024-10-24/becker_1997_germany_relative_poverty.py index 9c07c91af19..d4059e72f7a 100644 --- a/etl/steps/data/meadow/chartbook/2024-10-24/becker_1997_germany_relative_poverty.py +++ b/etl/steps/data/meadow/chartbook/2024-10-24/becker_1997_germany_relative_poverty.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("becker_1997_germany_relative_poverty.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/chartbook/2024-10-25/albers_et_al_2020_germany.py b/etl/steps/data/meadow/chartbook/2024-10-25/albers_et_al_2020_germany.py index b7acb0b381a..5087c2c5917 100644 --- a/etl/steps/data/meadow/chartbook/2024-10-25/albers_et_al_2020_germany.py +++ b/etl/steps/data/meadow/chartbook/2024-10-25/albers_et_al_2020_germany.py @@ -17,7 +17,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("albers_et_al_2020_germany.xlsx") # Load data from snapshot. - tb = snap.read(sheet_name="Tabelle1") + tb = snap.read(safe_types=False, sheet_name="Tabelle1") # # Process data. diff --git a/etl/steps/data/meadow/climate/2023-12-20/surface_temperature.py b/etl/steps/data/meadow/climate/2023-12-20/surface_temperature.py index 2a4096dec51..6e0fa1f3be6 100644 --- a/etl/steps/data/meadow/climate/2023-12-20/surface_temperature.py +++ b/etl/steps/data/meadow/climate/2023-12-20/surface_temperature.py @@ -64,7 +64,7 @@ def run(dest_dir: str) -> None: # Read surface temperature data from snapshot da = _load_data_array(snap) - # Read the shapefile to extract country informaiton + # Read the shapefile to extract country information snap_geo = paths.load_snapshot("world_bank.zip") shapefile_name = "WB_countries_Admin0_10m/WB_countries_Admin0_10m.shp" diff --git a/etl/steps/data/meadow/climate/2024-01-26/antarctic_ice_core_co2_concentration.py b/etl/steps/data/meadow/climate/2024-01-26/antarctic_ice_core_co2_concentration.py index 136cb194d7f..5099295b0dd 100644 --- a/etl/steps/data/meadow/climate/2024-01-26/antarctic_ice_core_co2_concentration.py +++ b/etl/steps/data/meadow/climate/2024-01-26/antarctic_ice_core_co2_concentration.py @@ -12,7 +12,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and load data. snap = paths.load_snapshot("antarctic_ice_core_co2_concentration.xls") - tb = snap.read(sheet_name="CO2 Composite", skiprows=14) + tb = snap.read(safe_types=False, sheet_name="CO2 Composite", skiprows=14) # # Process data. diff --git a/etl/steps/data/meadow/climate/2024-11-18/ghg_concentration.py b/etl/steps/data/meadow/climate/2024-11-18/ghg_concentration.py new file mode 100644 index 00000000000..1ca24557052 --- /dev/null +++ b/etl/steps/data/meadow/climate/2024-11-18/ghg_concentration.py @@ -0,0 +1,42 @@ +"""Load a snapshot and create a meadow dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Names of snapshot files to load and process. +FILES = [ + "co2_concentration_monthly", + "ch4_concentration_monthly", + "n2o_concentration_monthly", +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Initialize dictionary to store raw tables. + tables = {} + for file_name in FILES: + # Retrieve snapshot. + snap = paths.load_snapshot(f"{file_name}.csv") + + # Load data from snapshot. 
+ tables[file_name] = snap.read(comment="#", na_values="-9.99") + + # + # Process data. + # + for file_name, tb in tables.items(): + # Set an appropriate index and sort conveniently. + tables[file_name] = tb.set_index(["year", "month"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset with one table for each gas. + ds_meadow = create_dataset(dest_dir, tables=tables.values(), check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/climate/2024-11-18/hawaii_ocean_time_series.py b/etl/steps/data/meadow/climate/2024-11-18/hawaii_ocean_time_series.py new file mode 100644 index 00000000000..0544b0cb638 --- /dev/null +++ b/etl/steps/data/meadow/climate/2024-11-18/hawaii_ocean_time_series.py @@ -0,0 +1,29 @@ +"""Load a snapshot and create a meadow dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load snapshot and read its data. + tb = paths.load_snapshot("hawaii_ocean_time_series.csv").read(skiprows=8, sep="\t", na_values=[-999]) + + # + # Process data. + # + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["date"], verify_integrity=True).sort_index().sort_index(axis=1) + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/climate/2024-11-18/ocean_heat_content.py b/etl/steps/data/meadow/climate/2024-11-18/ocean_heat_content.py new file mode 100644 index 00000000000..844f5d34220 --- /dev/null +++ b/etl/steps/data/meadow/climate/2024-11-18/ocean_heat_content.py @@ -0,0 +1,75 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Names of snapshot files to load and process. +FILES = [ + "ocean_heat_content_monthly_world_700m", + "ocean_heat_content_monthly_world_2000m", + "ocean_heat_content_annual_world_700m", + "ocean_heat_content_annual_world_2000m", +] + +# Columns to select from annual data, and how to rename them. +COLUMNS_ANNUAL = { + "YEAR": "date", + "WO": "ocean_heat_content", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load data from snapshots. + tables_monthly = [] + tables_annual = [] + for file_name in FILES: + # Extract depth and location from file name. + depth = int(file_name.split("_")[-1].replace("m", "")) + location = file_name.split("_")[-2].title() + if "monthly" in file_name: + # Read data. + new_table = paths.load_snapshot(f"{file_name}.csv").read(names=["date", "ocean_heat_content"]) + # Add columns for location and depth. + new_table = new_table.assign(**{"depth": depth, "location": location}) + # Add monthly table to list. + tables_monthly.append(new_table) + elif "annual" in file_name: + # Read data, select and rename columns. + new_table = ( + paths.load_snapshot(f"{file_name}.csv") + .read_fwf()[list(COLUMNS_ANNUAL)] + .rename(columns=COLUMNS_ANNUAL, errors="raise") + ) + # Add columns for location and depth. + new_table = new_table.assign(**{"depth": depth, "location": location}) + # Add annual table to list. + tables_annual.append(new_table) + else: + raise ValueError(f"Unexpected file name: {file_name}") + + # + # Process data. 
+ # + # Combine monthly data and add a column for location. + tb_monthly = pr.concat(tables_monthly, short_name="ocean_heat_content_monthly") + + # Combine annual data. + tb_annual = pr.concat(tables_annual, short_name="ocean_heat_content_annual") + + # Set an appropriate index and sort conveniently. + tb_monthly = tb_monthly.set_index(["location", "depth", "date"], verify_integrity=True).sort_index() + tb_annual = tb_annual.set_index(["location", "depth", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb_annual, tb_monthly], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/climate/2024-11-18/sea_ice_index.py b/etl/steps/data/meadow/climate/2024-11-18/sea_ice_index.py new file mode 100644 index 00000000000..d4ded1a7859 --- /dev/null +++ b/etl/steps/data/meadow/climate/2024-11-18/sea_ice_index.py @@ -0,0 +1,51 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("sea_ice_index.xlsx") + + # Read data from snapshot. + data = snap.ExcelFile() + + # + # Process data. + # + # Load sheet of northern hemisphere sea ice extent. + tb_nh = data.parse("NH-Extent").assign(**{"location": "Northern Hemisphere"}) + tb_sh = data.parse("SH-Extent").assign(**{"location": "Southern Hemisphere"}) + + # Sanity check. + assert tb_nh.iloc[0, 0] == 1978, "First cell in NH spreadsheet was expected to be 1978. Data has changed." + assert tb_sh.iloc[0, 0] == 1978, "First cell in SH spreadsheet was expected to be 1978. Data has changed." + + # Concatenate both tables. + tb = pr.concat([tb_sh, tb_nh], ignore_index=True, short_name=paths.short_name) + + # Fix column names. + tb = tb.rename(columns={tb.columns[0]: "year"}) + + # Drop empty rows and columns. + tb = tb.dropna(how="all").dropna(axis=1, how="all").reset_index(drop=True) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/climate/2024-11-18/sea_surface_temperature.py b/etl/steps/data/meadow/climate/2024-11-18/sea_surface_temperature.py new file mode 100644 index 00000000000..50623be8b7a --- /dev/null +++ b/etl/steps/data/meadow/climate/2024-11-18/sea_surface_temperature.py @@ -0,0 +1,49 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Names of snapshot files to load and process. +FILES = [ + "sea_surface_temperature_world", + "sea_surface_temperature_northern_hemisphere", + "sea_surface_temperature_southern_hemisphere", +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load data from each of the snapshots, and add a column with the region name. 
+ tables = [ + paths.load_snapshot(f"{file_name}.csv") + .read() + .assign(**{"location": file_name.split("sea_surface_temperature_")[-1].replace("_", " ").title()}) + for file_name in FILES + ] + + # + # Process data. + # + # Concatenate all tables. + tb = pr.concat(tables) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "year", "month"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Rename table. + tb.metadata.short_name = paths.short_name + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/climate/2024-11-18/snow_cover_extent.py b/etl/steps/data/meadow/climate/2024-11-18/snow_cover_extent.py new file mode 100644 index 00000000000..86e0d707a8b --- /dev/null +++ b/etl/steps/data/meadow/climate/2024-11-18/snow_cover_extent.py @@ -0,0 +1,50 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Names of snapshot files to load and process. +FILES = [ + "snow_cover_extent_north_america", + "snow_cover_extent_northern_hemisphere", +] + +# Names of columns in the data. +COLUMNS = ["year", "month", "snow_cover_extent"] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load snapshot and read its data. + tables = [] + for file_name in FILES: + tb = paths.load_snapshot(f"{file_name}.csv").read_fwf(names=COLUMNS) + # Add a column for location. + tb["location"] = file_name.split("snow_cover_extent_")[-1].replace("_", " ").title() + # Add table to list. + tables.append(tb) + + # + # Process data. + # + # Combine data from all tables. + tb = pr.concat(tables) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "year", "month"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Update table name. + tb.metadata.short_name = paths.short_name + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/climate/2024-11-18/surface_temperature_analysis.py b/etl/steps/data/meadow/climate/2024-11-18/surface_temperature_analysis.py new file mode 100644 index 00000000000..88791a644b7 --- /dev/null +++ b/etl/steps/data/meadow/climate/2024-11-18/surface_temperature_analysis.py @@ -0,0 +1,62 @@ +"""Load a snapshot and create a meadow dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Names of snapshot files to load and process. +FILES = [ + "surface_temperature_analysis_world", + "surface_temperature_analysis_northern_hemisphere", + "surface_temperature_analysis_southern_hemisphere", +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Initialize dictionary to store raw tables. + tables = {} + for file_name in FILES: + # Retrieve snapshot. + snap = paths.load_snapshot(f"{file_name}.csv") + + # Load data from snapshot. 
+ tables[file_name] = snap.read( + skiprows=1, + na_values="***", + usecols=[ + "Year", + "Jan", + "Feb", + "Mar", + "Apr", + "May", + "Jun", + "Jul", + "Aug", + "Sep", + "Oct", + "Nov", + "Dec", + ], + ) + + # + # Process data. + # + for file_name, tb in tables.items(): + # Set an appropriate index and sort conveniently. + tables[file_name] = tb.set_index(["Year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=tables.values(), check_variables_metadata=True) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/climate/2024-11-19/total_precipitation.py b/etl/steps/data/meadow/climate/2024-11-19/total_precipitation.py new file mode 100644 index 00000000000..46d314e7d2c --- /dev/null +++ b/etl/steps/data/meadow/climate/2024-11-19/total_precipitation.py @@ -0,0 +1,166 @@ +"""Load a snapshot and create a meadow dataset.""" + +import io +import zipfile + +import geopandas as gpd +import numpy as np +import pandas as pd +import pyproj +import xarray as xr +from owid.catalog import Table +from rioxarray.exceptions import NoDataInBounds, OneDimensionalRaster +from shapely.geometry import mapping +from structlog import get_logger +from tqdm import tqdm +
+from etl.helpers import PathFinder, create_dataset +from etl.snapshot import Snapshot + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) +# Initialize logger. +log = get_logger() + + +def _load_data_array(snap: Snapshot) -> xr.DataArray: + log.info("load_data_array.start") + # Load data from snapshot. + with zipfile.ZipFile(snap.path, "r") as zip_file: + # Iterate through all files in the zip archive + for file_info in zip_file.infolist(): + with zip_file.open(file_info) as file: + file_content = file.read() + # Create an in-memory bytes file and load the dataset + with io.BytesIO(file_content) as memfile: + da = xr.open_dataset(memfile).load() # .load() ensures data is eagerly loaded + da = da["tp"] + # Set the coordinate reference system for the precipitation data to EPSG 4326. + da = da.rio.write_crs("epsg:4326") + + return da + + +def _load_shapefile(file_path: str) -> gpd.GeoDataFrame: + log.info("load_shapefile.start") + shapefile = gpd.read_file(file_path) + return shapefile[["geometry", "WB_NAME"]] + + +def run(dest_dir: str) -> None: + # Activates the usage of the global context. Using this option can enhance the performance + # of initializing objects in single-threaded applications. + pyproj.set_use_global_context(True) # type: ignore + + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("total_precipitation.zip") + + # Read data from snapshot + da = _load_data_array(snap) + + # Read the shapefile to extract country information + snap_geo = paths.load_snapshot("world_bank.zip") + shapefile_name = "WB_countries_Admin0_10m/WB_countries_Admin0_10m.shp" + + # Check if the shapefile exists in the ZIP archive + with zipfile.ZipFile(snap_geo.path, "r"): + # Construct the correct path for Geopandas + file_path = f"zip://{snap_geo.path}!/{shapefile_name}" + + # Read the shapefile directly from the ZIP archive + shapefile = _load_shapefile(file_path) + + # + # Process data. + # + + # Initialize an empty dictionary to store the country-wise average precipitation.
+ temp_country = {} + + # Add Global mean precipitation + weights = np.cos(np.deg2rad(da.latitude)) + weights.name = "weights" + clim_month_weighted = da.weighted(weights) + global_mean = clim_month_weighted.mean(["longitude", "latitude"]) + temp_country["World"] = global_mean + + # Initialize a list to keep track of small countries where precipitation data extraction fails. + small_countries = [] + + # Iterate over each row in the shapefile data. + for i in tqdm(range(shapefile.shape[0])): + # Extract the data for the current row. + geometry = shapefile.iloc[i]["geometry"] + country_name = shapefile.iloc[i]["WB_NAME"] + + try: + # Clip to the bounding box for the country's shape to significantly improve performance. + xmin, ymin, xmax, ymax = geometry.bounds + clip = da.rio.clip_box(minx=xmin, miny=ymin, maxx=xmax, maxy=ymax) + + # Clip data to the country's shape. + # NOTE: if memory is an issue, we could use `from_disk=True` arg + clip = clip.rio.clip([mapping(geometry)], shapefile.crs) + + # Calculate weights based on latitude to account for area distortion in latitude-longitude grids. + weights = np.cos(np.deg2rad(clip.latitude)) + weights.name = "weights" + + # Apply the weights to the clipped precipitation data. + clim_month_weighted = clip.weighted(weights) + + # Calculate the weighted mean precipitation for the country. + country_weighted_mean = clim_month_weighted.mean(dim=["longitude", "latitude"]).values + + # Store the calculated mean precipitation in the dictionary with the country's name as the key. + temp_country[country_name] = country_weighted_mean + + # Clean up the memory + del clip + del weights + del clim_month_weighted + + except (NoDataInBounds, OneDimensionalRaster): + # If extraction fails (usually because the country is too small for the data's resolution), add the country's name to the small_countries list. + log.info( + f"No data was found in the specified bounds for {country_name}." + ) + small_countries.append(shapefile.iloc[i]["WB_NAME"]) + + # Log information about countries for which precipitation data could not be extracted. + log.info( + f"It wasn't possible to extract precipitation data for {len(small_countries)} small countries as they are too small for the resolution of the Copernicus data." + ) + # Convert the date coordinate to datetime so the 'dt' accessor can be used below. + da["date"] = pd.to_datetime(da["date"].astype(str), format="%Y%m%d") + + # Define the start and end dates. + start_time = da["date"].min().dt.date.astype(str).item() + end_time = da["date"].max().dt.date.astype(str).item() + + # Generate the middle day (the 15th) of each month from start_time to end_time. + month_middles = pd.date_range(start=start_time, end=end_time, freq="MS") + pd.offsets.Day(14) + + # month_middles is a DatetimeIndex; convert it to a list for the column assignment below. + month_middles_list = month_middles.tolist() + + df_temp = pd.DataFrame(temp_country) + df_temp["time"] = month_middles_list + + melted_df = df_temp.melt(id_vars=["time"], var_name="country", value_name="total_precipitation") + + # Create a new table, ensure all columns are snake-case, and add relevant metadata. + tb = Table(melted_df, short_name=paths.short_name, underscore=True) + tb = tb.format(["time", "country"]) + + tb["total_precipitation"].metadata.origins = [snap.metadata.origin] + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot.
+ ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/climate_watch/2024-11-21/emissions_by_sector.py b/etl/steps/data/meadow/climate_watch/2024-11-21/emissions_by_sector.py new file mode 100644 index 00000000000..69072db0a22 --- /dev/null +++ b/etl/steps/data/meadow/climate_watch/2024-11-21/emissions_by_sector.py @@ -0,0 +1,47 @@ +"""Load a snapshot and create a meadow dataset.""" +import gzip +import json + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("emissions_by_sector.gz") + + # Load data from snapshot. + with gzip.open(snap.path) as _file: + data = json.loads(_file.read()) + + # Create table with data and metadata. + tb = snap.read_from_dict(data, underscore=True) + + # + # Process data. + # + # Extract data from column "emissions", which is given as a list of dictionaries with year and value. + tb = tb.explode("emissions").reset_index(drop=True) + + # Extract the year and value fields, and add the original metadata to the newly created columns. + for column in ["year", "value"]: + tb[column] = [emissions[column] for emissions in tb["emissions"]] + tb[column] = tb[column].copy_metadata(tb["emissions"]) + + # Drop unnecessary columns. + tb = tb.drop(columns="emissions", errors="raise") + + # Improve table format. + tb = tb.format(["country", "year", "gas", "sector", "data_source"]) + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/countries/2023-09-25/isd.py b/etl/steps/data/meadow/countries/2023-09-25/isd.py index 0408270bec6..ac0fc4f07f2 100644 --- a/etl/steps/data/meadow/countries/2023-09-25/isd.py +++ b/etl/steps/data/meadow/countries/2023-09-25/isd.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("isd.xlsx") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/covid/2024-11-05/github_stats.py b/etl/steps/data/meadow/covid/2024-11-05/github_stats.py new file mode 100644 index 00000000000..7e75dca35db --- /dev/null +++ b/etl/steps/data/meadow/covid/2024-11-05/github_stats.py @@ -0,0 +1,61 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot.
+ # Issues + tb_issues = paths.read_snap_table("github_stats_issues.csv", safe_types=False).sort_values("date_created") + tb_issues_com = paths.read_snap_table("github_stats_issues_comments.csv", safe_types=False).sort_values( + "date_created" + ) + tb_issues_usr = paths.read_snap_table("github_stats_issues_users.csv", safe_types=False).rename( + columns={ + "index": "user_id", + } + ) + + # PRs + tb_pr = paths.read_snap_table("github_stats_pr.csv", safe_types=False).sort_values("date_created") + tb_pr_com = paths.read_snap_table("github_stats_pr_comments.csv", safe_types=False).sort_values("date_created") + tb_pr_usr = paths.read_snap_table("github_stats_pr_users.csv", safe_types=False).rename( + columns={ + "index": "user_id", + } + ) + + # Commits + tb_commits = paths.read_snap_table("github_stats_commits.csv", safe_types=False).sort_values("date") + tb_commits_usr = paths.read_snap_table("github_stats_commits_users.csv", safe_types=False).rename( + columns={ + "index": "user_id", + } + ) + + # Join tables + tables = [ + tb_issues.format(["issue_id"]), + tb_issues_com.format(["comment_id"]), + tb_issues_usr.format(["user_id"]), + tb_pr.format(["issue_id"]), + tb_pr_com.format(["comment_id"]), + tb_pr_usr.format(["user_id"]), + tb_commits.format(["sha"]).sort_values("date"), + tb_commits_usr.format(["user_id"]), + ] + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=tables, check_variables_metadata=True) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/covid/latest/cases_deaths.py b/etl/steps/data/meadow/covid/latest/cases_deaths.py index 271db8d0c75..586d73e5063 100644 --- a/etl/steps/data/meadow/covid/latest/cases_deaths.py +++ b/etl/steps/data/meadow/covid/latest/cases_deaths.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("cases_deaths.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/covid/latest/countries_reporting.py b/etl/steps/data/meadow/covid/latest/countries_reporting.py new file mode 100644 index 00000000000..38373dc146a --- /dev/null +++ b/etl/steps/data/meadow/covid/latest/countries_reporting.py @@ -0,0 +1,56 @@ +"""Load a snapshot and create a meadow dataset.""" + +import os + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("github_stats_vax_reporting.csv") + + # Load data from snapshot. + tb = snap.read() + + # + # Process data. + # + tb["country"] = tb["country"].apply(extract_filename_without_extension) + tb["date"] = tb["date_first_reported"] + tb = tb[["country", "date", "date_first_reported", "date_first_value"]] + + # Dtypes + tb = tb.astype( + { + "country": "string", + "date": "datetime64[ns]", + "date_first_reported": "datetime64[ns]", + "date_first_value": "datetime64[ns]", + } + ) + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "date"], short_name="vaccinations") + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. 
+ ds_meadow.save() + + +def extract_filename_without_extension(file_path): + # Get the base name (filename with extension) + base_name = os.path.basename(file_path) + # Split the base name into name and extension + name, _ = os.path.splitext(base_name) + return name diff --git a/etl/steps/data/meadow/covid/latest/covax.py b/etl/steps/data/meadow/covid/latest/covax.py index d8dba11550c..d4a89f20ab9 100644 --- a/etl/steps/data/meadow/covid/latest/covax.py +++ b/etl/steps/data/meadow/covid/latest/covax.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("covax.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/covid/latest/hospital.py b/etl/steps/data/meadow/covid/latest/hospital.py index fd1fa523aad..27fa04ba55d 100644 --- a/etl/steps/data/meadow/covid/latest/hospital.py +++ b/etl/steps/data/meadow/covid/latest/hospital.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("hospital.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/covid/latest/john_hopkins_university.py b/etl/steps/data/meadow/covid/latest/john_hopkins_university.py index 7bfcfa0916c..f43e424360b 100644 --- a/etl/steps/data/meadow/covid/latest/john_hopkins_university.py +++ b/etl/steps/data/meadow/covid/latest/john_hopkins_university.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("john_hopkins_university.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/covid/latest/sweden_covid.py b/etl/steps/data/meadow/covid/latest/sweden_covid.py index 9cb513a8eb8..6a3e3cc7975 100644 --- a/etl/steps/data/meadow/covid/latest/sweden_covid.py +++ b/etl/steps/data/meadow/covid/latest/sweden_covid.py @@ -15,7 +15,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("sweden_covid.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/covid/latest/testing.py b/etl/steps/data/meadow/covid/latest/testing.py index 18f452fe66f..1a9d5dda528 100644 --- a/etl/steps/data/meadow/covid/latest/testing.py +++ b/etl/steps/data/meadow/covid/latest/testing.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("testing.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/covid/latest/tracking_r.py b/etl/steps/data/meadow/covid/latest/tracking_r.py index 634cb1e8407..6ca1b7db8e2 100644 --- a/etl/steps/data/meadow/covid/latest/tracking_r.py +++ b/etl/steps/data/meadow/covid/latest/tracking_r.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("tracking_r.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/covid/latest/uk_covid.py b/etl/steps/data/meadow/covid/latest/uk_covid.py index f7ca759b1f0..713f4f57ab4 100644 --- a/etl/steps/data/meadow/covid/latest/uk_covid.py +++ b/etl/steps/data/meadow/covid/latest/uk_covid.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("uk_covid.csv") # Load data from snapshot. 
- tb = snap.read() + tb = snap.read(safe_types=False) # Format date tb = year_to_date(tb, zero_day="2020-01-01") diff --git a/etl/steps/data/meadow/covid/latest/vaccinations_age.py b/etl/steps/data/meadow/covid/latest/vaccinations_age.py index bbdf2b980e5..125986c49ec 100644 --- a/etl/steps/data/meadow/covid/latest/vaccinations_age.py +++ b/etl/steps/data/meadow/covid/latest/vaccinations_age.py @@ -18,7 +18,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("vaccinations_age.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/covid/latest/vaccinations_global.py b/etl/steps/data/meadow/covid/latest/vaccinations_global.py index 3a73f3e5ffe..834399ae797 100644 --- a/etl/steps/data/meadow/covid/latest/vaccinations_global.py +++ b/etl/steps/data/meadow/covid/latest/vaccinations_global.py @@ -23,7 +23,7 @@ def run(dest_dir: str) -> None: snap_who = paths.load_snapshot("vaccinations_global_who.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) tb_who = snap_who.read() # diff --git a/etl/steps/data/meadow/covid/latest/vaccinations_manufacturer.py b/etl/steps/data/meadow/covid/latest/vaccinations_manufacturer.py index 1bce35227b5..a8cac7782b9 100644 --- a/etl/steps/data/meadow/covid/latest/vaccinations_manufacturer.py +++ b/etl/steps/data/meadow/covid/latest/vaccinations_manufacturer.py @@ -18,7 +18,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("vaccinations_manufacturer.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/covid/latest/vaccinations_us.py b/etl/steps/data/meadow/covid/latest/vaccinations_us.py index bada62c77c8..5be88340ee7 100644 --- a/etl/steps/data/meadow/covid/latest/vaccinations_us.py +++ b/etl/steps/data/meadow/covid/latest/vaccinations_us.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("vaccinations_us.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/covid/latest/yougov.py b/etl/steps/data/meadow/covid/latest/yougov.py index e0dd351af6c..cc9c97ac5c5 100644 --- a/etl/steps/data/meadow/covid/latest/yougov.py +++ b/etl/steps/data/meadow/covid/latest/yougov.py @@ -27,7 +27,7 @@ def run(dest_dir: str) -> None: # # Retrieve aux table snap = paths.load_snapshot("yougov_extra_mapping.csv") - tb_mapping = snap.read() + tb_mapping = snap.read(safe_types=False) tb_mapping = process_mapping_table(tb_mapping) # Retrieve country snapshots. @@ -45,7 +45,7 @@ def run(dest_dir: str) -> None: # Retrieve composite table snap_composite = paths.load_snapshot("yougov_composite.csv") - tb_composite = snap_composite.read() + tb_composite = snap_composite.read(safe_types=False) # # Process data. @@ -58,6 +58,7 @@ def run(dest_dir: str) -> None: tb = tb.dropna(subset=["date"]) # Format + tb = tb.sort_values(["country", "date"]).reset_index(drop=True) tb["identifier"] = tb.index tb = tb.format(["identifier"]) tb_mapping = tb_mapping.format(["code_name"]) diff --git a/etl/steps/data/meadow/democracy/2024-03-07/bmr.py b/etl/steps/data/meadow/democracy/2024-03-07/bmr.py index 96cfdf49947..55ac1efea6c 100644 --- a/etl/steps/data/meadow/democracy/2024-03-07/bmr.py +++ b/etl/steps/data/meadow/democracy/2024-03-07/bmr.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("bmr.csv") # Load data from snapshot. 
- tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/democracy/2024-05-01/ert.py b/etl/steps/data/meadow/democracy/2024-05-01/ert.py index c97630c0028..7866f82a575 100644 --- a/etl/steps/data/meadow/democracy/2024-05-01/ert.py +++ b/etl/steps/data/meadow/democracy/2024-05-01/ert.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("ert.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/democracy/2024-05-09/lexical_index.py b/etl/steps/data/meadow/democracy/2024-05-09/lexical_index.py index cb047dee3e2..ac3631838ec 100644 --- a/etl/steps/data/meadow/democracy/2024-05-09/lexical_index.py +++ b/etl/steps/data/meadow/democracy/2024-05-09/lexical_index.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("lexical_index.xlsx") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/democracy/2024-05-13/polity.py b/etl/steps/data/meadow/democracy/2024-05-13/polity.py index c9b66a5ba14..250a38f6183 100644 --- a/etl/steps/data/meadow/democracy/2024-05-13/polity.py +++ b/etl/steps/data/meadow/democracy/2024-05-13/polity.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("polity.xlsx") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/democracy/2024-05-16/fh.py b/etl/steps/data/meadow/democracy/2024-05-16/fh.py index e7fa96d301f..7cf2c323388 100644 --- a/etl/steps/data/meadow/democracy/2024-05-16/fh.py +++ b/etl/steps/data/meadow/democracy/2024-05-16/fh.py @@ -18,12 +18,12 @@ def run(dest_dir: str) -> None: # # Load ratings snapshot as a table snap = paths.load_snapshot("fh_ratings.xlsx") - tb_ratings_countries = snap.read(sheet_name="Country Ratings, Statuses ", header=[1, 2]) - tb_ratings_territories = snap.read(sheet_name="Territory Ratings, Statuses", header=[1, 2]) + tb_ratings_countries = snap.read(safe_types=False, sheet_name="Country Ratings, Statuses ", header=[1, 2]) + tb_ratings_territories = snap.read(safe_types=False, sheet_name="Territory Ratings, Statuses", header=[1, 2]) # Load scores snapshot as a table snap = paths.load_snapshot("fh_scores.xlsx") - tb_scores = snap.read(sheet_name="FIW06-24") + tb_scores = snap.read(safe_types=False, sheet_name="FIW06-24") # # Process data. diff --git a/etl/steps/data/meadow/democracy/2024-05-21/bti.py b/etl/steps/data/meadow/democracy/2024-05-21/bti.py index 99f7aff0aec..81e1fbbfd7b 100644 --- a/etl/steps/data/meadow/democracy/2024-05-21/bti.py +++ b/etl/steps/data/meadow/democracy/2024-05-21/bti.py @@ -64,7 +64,7 @@ def load_data(snap: Snapshot) -> Table: tbs = [] for year in range(2006, YEAR_MAX + 1, 2): # Read - tb_ = snap.read(sheet_name=f"BTI {year}") + tb_ = snap.read(safe_types=False, sheet_name=f"BTI {year}") # Column check columns_missing = set(COLUMNS) - set(tb_.columns) if columns_missing: diff --git a/etl/steps/data/meadow/democracy/2024-05-22/claassen_mood.py b/etl/steps/data/meadow/democracy/2024-05-22/claassen_mood.py index b825dec8628..5216995ab6d 100644 --- a/etl/steps/data/meadow/democracy/2024-05-22/claassen_mood.py +++ b/etl/steps/data/meadow/democracy/2024-05-22/claassen_mood.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("claassen_mood.csv") # Load data from snapshot. 
- tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/democracy/2024-05-22/claassen_satisfaction.py b/etl/steps/data/meadow/democracy/2024-05-22/claassen_satisfaction.py index 081bc45eabf..1af7f145efd 100644 --- a/etl/steps/data/meadow/democracy/2024-05-22/claassen_satisfaction.py +++ b/etl/steps/data/meadow/democracy/2024-05-22/claassen_satisfaction.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("claassen_satisfaction.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/democracy/2024-05-22/eiu.py b/etl/steps/data/meadow/democracy/2024-05-22/eiu.py index f582e288d79..5c1513451f4 100644 --- a/etl/steps/data/meadow/democracy/2024-05-22/eiu.py +++ b/etl/steps/data/meadow/democracy/2024-05-22/eiu.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: # Retieve data from Gapminder snap = paths.load_snapshot("eiu_gapminder.csv") - tb_gm = snap.read() + tb_gm = snap.read(safe_types=False) # Retrieve data from EIU (single year reports) shortnames = [ @@ -26,7 +26,7 @@ def run(dest_dir: str) -> None: tbs = [] for name in shortnames: snap = paths.load_snapshot(f"{name}.csv") - tb = snap.read() + tb = snap.read(safe_types=False) tbs.append(tb) # Correct data by Gapminder diff --git a/etl/steps/data/meadow/demography/2023-10-10/zijdeman_et_al_2015.py b/etl/steps/data/meadow/demography/2023-10-10/zijdeman_et_al_2015.py index ce7891819ae..7d85509518a 100644 --- a/etl/steps/data/meadow/demography/2023-10-10/zijdeman_et_al_2015.py +++ b/etl/steps/data/meadow/demography/2023-10-10/zijdeman_et_al_2015.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("zijdeman_et_al_2015.xlsx") # Load data from snapshot. - tb = snap.read(sheet_name="Data Long Format") + tb = snap.read(safe_types=False, sheet_name="Data Long Format") # # Process data. diff --git a/etl/steps/data/meadow/demography/2023-11-08/modal_age_death.py b/etl/steps/data/meadow/demography/2023-11-08/modal_age_death.py index a09ebea25fb..73a2e2d1fbf 100644 --- a/etl/steps/data/meadow/demography/2023-11-08/modal_age_death.py +++ b/etl/steps/data/meadow/demography/2023-11-08/modal_age_death.py @@ -19,7 +19,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("modal_age_death.xlsx") # Load data from snapshot. - tb = snap.read(header=1) + tb = snap.read(safe_types=False, header=1) # # Process data. diff --git a/etl/steps/data/meadow/demography/2023-12-20/population_fariss.py b/etl/steps/data/meadow/demography/2023-12-20/population_fariss.py index 4183a90c584..f47d8dacfab 100644 --- a/etl/steps/data/meadow/demography/2023-12-20/population_fariss.py +++ b/etl/steps/data/meadow/demography/2023-12-20/population_fariss.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("population_fariss.rds") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/demography/2024-11-26/multiple_births.py b/etl/steps/data/meadow/demography/2024-11-26/multiple_births.py new file mode 100644 index 00000000000..460a983484a --- /dev/null +++ b/etl/steps/data/meadow/demography/2024-11-26/multiple_births.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. 
+ # + # Retrieve snapshot. + snap = paths.load_snapshot("multiple_births.7z") + + # Load data from snapshot. + tb = snap.read_in_archive("HMBD_pooled_data_30.09.2024.csv") + + # + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "stillbirths"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/dummy/2020-01-01/dummy.py b/etl/steps/data/meadow/dummy/2020-01-01/dummy.py index 8a372b666bf..75c3e02bcd0 100644 --- a/etl/steps/data/meadow/dummy/2020-01-01/dummy.py +++ b/etl/steps/data/meadow/dummy/2020-01-01/dummy.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("dummy.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/dummy/2020-01-01/dummy_full.py b/etl/steps/data/meadow/dummy/2020-01-01/dummy_full.py index 65b128afa61..a6404d7dd08 100644 --- a/etl/steps/data/meadow/dummy/2020-01-01/dummy_full.py +++ b/etl/steps/data/meadow/dummy/2020-01-01/dummy_full.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("dummy_full.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/ember/2024-05-08/yearly_electricity.py b/etl/steps/data/meadow/ember/2024-05-08/yearly_electricity.py index d9c266730b4..11ccaed2eac 100644 --- a/etl/steps/data/meadow/ember/2024-05-08/yearly_electricity.py +++ b/etl/steps/data/meadow/ember/2024-05-08/yearly_electricity.py @@ -13,7 +13,7 @@ def run(dest_dir: str) -> None: # # Load snapshot and read its data. snap = paths.load_snapshot("yearly_electricity.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/ember/2024-11-20/european_wholesale_electricity_prices.py b/etl/steps/data/meadow/ember/2024-11-20/european_wholesale_electricity_prices.py new file mode 100644 index 00000000000..f8ae1613de9 --- /dev/null +++ b/etl/steps/data/meadow/ember/2024-11-20/european_wholesale_electricity_prices.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("european_wholesale_electricity_prices.csv") + + # Load data from snapshot. + tb = snap.read() + + # + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "date"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. 
+ ds_meadow.save() diff --git a/etl/steps/data/meadow/emdat/2023-09-20/natural_disasters.py b/etl/steps/data/meadow/emdat/2023-09-20/natural_disasters.py index 8f387734418..5ef3c057326 100644 --- a/etl/steps/data/meadow/emdat/2023-09-20/natural_disasters.py +++ b/etl/steps/data/meadow/emdat/2023-09-20/natural_disasters.py @@ -44,7 +44,7 @@ def run(dest_dir: str) -> None: # Load snapshot. snap = paths.load_snapshot("natural_disasters.xlsx") with warnings.catch_warnings(record=True): - tb = snap.read(sheet_name="emdat data", skiprows=6) + tb = snap.read(safe_types=False, sheet_name="emdat data", skiprows=6) # # Process data. diff --git a/etl/steps/data/meadow/emdat/2024-04-11/natural_disasters.py b/etl/steps/data/meadow/emdat/2024-04-11/natural_disasters.py index 3b1275e8992..f7bf13921d0 100644 --- a/etl/steps/data/meadow/emdat/2024-04-11/natural_disasters.py +++ b/etl/steps/data/meadow/emdat/2024-04-11/natural_disasters.py @@ -42,7 +42,7 @@ def run(dest_dir: str) -> None: # # Load snapshot. snap = paths.load_snapshot("natural_disasters.xlsx") - tb = snap.read(sheet_name="EM-DAT Data") + tb = snap.read(safe_types=False, sheet_name="EM-DAT Data") # # Process data. diff --git a/etl/steps/data/meadow/emissions/2023-10-24/emission_factors.py b/etl/steps/data/meadow/emissions/2023-10-24/emission_factors.py index df67342637a..fbb25c2918e 100644 --- a/etl/steps/data/meadow/emissions/2023-10-24/emission_factors.py +++ b/etl/steps/data/meadow/emissions/2023-10-24/emission_factors.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("emission_factors.xls") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/emissions/2023-11-06/global_warming_potential_factors.py b/etl/steps/data/meadow/emissions/2023-11-06/global_warming_potential_factors.py index d1078f41037..e85d1917cf1 100644 --- a/etl/steps/data/meadow/emissions/2023-11-06/global_warming_potential_factors.py +++ b/etl/steps/data/meadow/emissions/2023-11-06/global_warming_potential_factors.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("global_warming_potential_factors.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/emissions/2024-11-21/national_contributions.py b/etl/steps/data/meadow/emissions/2024-11-21/national_contributions.py new file mode 100644 index 00000000000..df58d26b5f6 --- /dev/null +++ b/etl/steps/data/meadow/emissions/2024-11-21/national_contributions.py @@ -0,0 +1,50 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve all snapshots of the dataset. + snap_annual = paths.load_snapshot("national_contributions_annual_emissions.csv") + snap_cumulative = paths.load_snapshot("national_contributions_cumulative_emissions.csv") + snap_temperature = paths.load_snapshot("national_contributions_temperature_response.csv") + + # Load data from snapshots. + tb_annual = snap_annual.read(underscore=True) + tb_cumulative = snap_cumulative.read(underscore=True) + tb_temperature = snap_temperature.read(underscore=True) + + # + # Process data. + # + # Combine all data into one table. 
+ tb = pr.concat( + [ + tb_annual.assign(**{"file": "annual_emissions"}), + tb_cumulative.assign(**{"file": "cumulative_emissions"}), + tb_temperature.assign(**{"file": "temperature_response"}), + ], + ignore_index=True, + short_name=paths.short_name, + ) + + # Rename columns conveniently. + tb = tb.rename(columns={"cntr_name": "country"}, errors="raise") + + # Set an appropriate index and sort conveniently. + tb = tb.format(keys=["country", "year", "file", "gas", "component"]) + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/ess/2023-08-02/ess_trust.py b/etl/steps/data/meadow/ess/2023-08-02/ess_trust.py index 6f356f55a4f..27b0aba56a1 100644 --- a/etl/steps/data/meadow/ess/2023-08-02/ess_trust.py +++ b/etl/steps/data/meadow/ess/2023-08-02/ess_trust.py @@ -87,7 +87,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("ess_trust.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/eurostat/2024-11-05/gas_and_electricity_prices.py b/etl/steps/data/meadow/eurostat/2024-11-05/gas_and_electricity_prices.py new file mode 100644 index 00000000000..071b4143ba3 --- /dev/null +++ b/etl/steps/data/meadow/eurostat/2024-11-05/gas_and_electricity_prices.py @@ -0,0 +1,78 @@ +"""Load a snapshot and create a meadow dataset.""" + +import zipfile + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("gas_and_electricity_prices.zip") + + # Create a list to store each table. + tables = [] + # Open the ZIP file and read each TSV file. + with zipfile.ZipFile(snap.path, "r") as zip_file: + for file_name in zip_file.namelist(): + # Read each TSV file into a table. + with zip_file.open(file_name) as file: + dataset_code = file_name.split(".")[0] + # Each data file starts with comma-separated index columns, followed by tab-separated time data. + # Example: + # freq,product,nrg_cons,unit,tax,currency,geo\TIME_PERIOD 2007 2008... + # And for some datasets, there is annual data, and for others bi-annual data, e.g. 2007-S1 2007-S2 2008-S1... + # First, load this file as a table. + _tb = pr.read_csv( + file, sep=r",|\t", engine="python", metadata=snap.to_table_metadata(), origin=snap.metadata.origin + ) + # Identify index columns. + index_columns = [column for column in _tb.columns if not column[0].isdigit()] + # Melt the table to have a single "time" column. + _tb = _tb.melt(id_vars=index_columns, var_name="time", value_name="value") + # Remove spurious "TIME_PERIOD" from one of the columns. + _tb = _tb.rename(columns={column: column.replace("\\TIME_PERIOD", "") for column in _tb.columns}) + # Add the dataset code as a column. + _tb = _tb.assign(**{"dataset_code": dataset_code}) + # Append current table to the list. + tables.append(_tb) + + # Concatenate all tables. + tb = pr.concat(tables, ignore_index=True) + + # + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format( + [ + "freq", + "product", + "nrg_cons", + "unit", + "tax", + "currency", + "geo", + "time", + "dataset_code", + "nrg_prc", + "customer", + "consom", + ] + ) + + # + # Save outputs. 
+ # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/forests/2024-05-08/ifl.py b/etl/steps/data/meadow/forests/2024-05-08/ifl.py index 3c5469b2af2..755e5212b85 100644 --- a/etl/steps/data/meadow/forests/2024-05-08/ifl.py +++ b/etl/steps/data/meadow/forests/2024-05-08/ifl.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("ifl.xlsx") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/forests/2024-07-10/tree_cover_loss_by_driver.py b/etl/steps/data/meadow/forests/2024-07-10/tree_cover_loss_by_driver.py index 35f10312805..bb843447fc2 100644 --- a/etl/steps/data/meadow/forests/2024-07-10/tree_cover_loss_by_driver.py +++ b/etl/steps/data/meadow/forests/2024-07-10/tree_cover_loss_by_driver.py @@ -17,7 +17,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot. snap = paths.load_snapshot("dominant_driver.xlsx") - tb = snap.read(sheet_name="data") + tb = snap.read(safe_types=False, sheet_name="data") tb = tb.drop(columns=["iso"]) tb = tb.rename(columns={"loss year": "year", "Driver of loss": "category", "Tree cover loss (ha)": "area"}) # Some large countries are broken down into smaller regions in the dataset, so we need to aggregate them here diff --git a/etl/steps/data/meadow/gapminder/2023-09-18/under_five_mortality.py b/etl/steps/data/meadow/gapminder/2023-09-18/under_five_mortality.py index 6853e0d73da..a4411af39d7 100644 --- a/etl/steps/data/meadow/gapminder/2023-09-18/under_five_mortality.py +++ b/etl/steps/data/meadow/gapminder/2023-09-18/under_five_mortality.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("under_five_mortality.xlsx") # Load data from snapshot. - tb = snap.read(sheet_name="Data & sources by observation") + tb = snap.read(safe_types=False, sheet_name="Data & sources by observation") # # Process data. diff --git a/etl/steps/data/meadow/gapminder/2023-09-21/under_five_mortality.py b/etl/steps/data/meadow/gapminder/2023-09-21/under_five_mortality.py index 35d035c01a2..c00e066719d 100644 --- a/etl/steps/data/meadow/gapminder/2023-09-21/under_five_mortality.py +++ b/etl/steps/data/meadow/gapminder/2023-09-21/under_five_mortality.py @@ -21,7 +21,7 @@ def run(dest_dir: str) -> None: tb = Table() # Load data from snapshot. for sheet in sheet_names: - tb_sheet = snap.read(sheet_name=sheet) + tb_sheet = snap.read(safe_types=False, sheet_name=sheet) tb = pr.concat([tb, tb_sheet]) tb.metadata = snap.to_table_metadata() diff --git a/etl/steps/data/meadow/gapminder/2023-09-22/total_fertility_rate.py b/etl/steps/data/meadow/gapminder/2023-09-22/total_fertility_rate.py index 1ae337e80b0..f9b5118f0fd 100644 --- a/etl/steps/data/meadow/gapminder/2023-09-22/total_fertility_rate.py +++ b/etl/steps/data/meadow/gapminder/2023-09-22/total_fertility_rate.py @@ -21,7 +21,7 @@ def run(dest_dir: str) -> None: tb = Table() # Load data from snapshot. 
for sheet in sheet_names: - tb_sheet = snap.read(sheet_name=sheet) + tb_sheet = snap.read(safe_types=False, sheet_name=sheet) tb = pr.concat([tb, tb_sheet]) tb.metadata = snap.to_table_metadata() diff --git a/etl/steps/data/meadow/gapminder/2024-07-08/maternal_mortality.py b/etl/steps/data/meadow/gapminder/2024-07-08/maternal_mortality.py index 817208128aa..6bd017d45c6 100644 --- a/etl/steps/data/meadow/gapminder/2024-07-08/maternal_mortality.py +++ b/etl/steps/data/meadow/gapminder/2024-07-08/maternal_mortality.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("maternal_mortality.xlsx") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # drop source & comment columns tb = tb.drop( diff --git a/etl/steps/data/meadow/gcp/2024-11-13/global_carbon_budget.py b/etl/steps/data/meadow/gcp/2024-11-13/global_carbon_budget.py new file mode 100644 index 00000000000..1289f28ad70 --- /dev/null +++ b/etl/steps/data/meadow/gcp/2024-11-13/global_carbon_budget.py @@ -0,0 +1,212 @@ +"""Load a snapshot and create a meadow dataset. + +It combines the following snapshots: +- GCP's Fossil CO2 emissions (long-format csv). +- GCP's official GCB global emissions (excel file) containing global bunker fuel and land-use change emissions. +- GCP's official GCB national emissions (excel file) containing consumption-based emissions for each country. + - Production-based emissions from this file are also used, but just to include total emissions of regions + according to GCP (e.g. "Africa (GCP)") and for sanity checks. +- GCP's official GCB national land-use change emissions (excel file) with land-use change emissions for each country. + +""" + +from owid.catalog import Table +from structlog import get_logger + +from etl.helpers import PathFinder, create_dataset + +# Initialize logger. +log = get_logger() + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def prepare_fossil_co2(tb_fossil_co2: Table) -> Table: + # Set an appropriate index and sort conveniently. + tb_fossil_co2 = tb_fossil_co2.set_index(["Country", "Year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Ensure all columns are snake-case. + tb_fossil_co2 = tb_fossil_co2.underscore() + + return tb_fossil_co2 + + +def prepare_historical_budget(tb_historical: Table) -> Table: + """Select variables and prepare the historical budget sheet of GCB's raw global data file. + + Parameters + ---------- + tb_historical : Table + Historical budget sheet of GCB's raw global data file. + + Returns + ------- + tb_historical : Table + Historical budget after selecting variables and processing them. + + """ + # Sanity check. + error = "'Historical Budget' sheet in global data file has changed (consider changing 'skiprows')." + assert tb_historical.columns[0] == "Year", error + + # Columns to select in historical budget and how to rename them. + columns = { + "Year": "year", + "fossil emissions excluding carbonation": "global_fossil_emissions", + "land-use change emissions": "global_land_use_change_emissions", + } + tb_historical = tb_historical[list(columns)].rename(columns=columns) + + # Add column for country (to be able to combine this with the national data). + tb_historical["country"] = "World" + + # Set an index and sort row and columns conveniently. + tb_historical = tb_historical.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Rename table. 
+ tb_historical.metadata.short_name = "global_carbon_budget_historical_budget" + + return tb_historical + + +def prepare_land_use_emissions(tb_land_use: Table) -> Table: + """Prepare data from a specific sheet of the land-use change data file. + + Parameters + ---------- + tb_land_use : Table + Data from a specific sheet of the land-use change emissions data file. + + Returns + ------- + tb_land_use : Table + Processed land-use change emissions data. + + """ + tb_land_use = tb_land_use.copy() + + # Sanity check. + error = "'BLUE' sheet in national land-use change data file has changed (consider changing 'skiprows')." + assert tb_land_use.columns[1] == "Afghanistan", error + + # Rename year column. + tb_land_use = tb_land_use.rename(columns={tb_land_use.columns[0]: "year"}) + + # Ignore countries that have no data. + tb_land_use = tb_land_use.dropna(axis=1, how="all") + + # Remove rows that are either empty, or have some other additional operation (e.g. 2013-2022). + tb_land_use = tb_land_use[tb_land_use["year"].astype(str).str.match(r"^\d{4}$")].reset_index(drop=True) + + # Restructure data to have a column for country and another for emissions. + tb_land_use = tb_land_use.melt(id_vars="year", var_name="country", value_name="emissions") + + # Set an index and sort row and columns conveniently. + tb_land_use = tb_land_use.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Rename table. + tb_land_use.metadata.short_name = "global_carbon_budget_land_use_change" + + return tb_land_use + + +def prepare_national_emissions(tb: Table, column_name: str) -> Table: + """Select variables and prepare the territorial emissions (or the consumption emissions) sheet of GCB's raw national + data file. + + Parameters + ---------- + tb : Table + Territorial emissions (or consumption emissions) sheet of GCB's raw national data file. + column_name : str + Name to assign to emissions column to be generated. + + Returns + ------- + tb_national : Table + Processed territorial (or consumption) emissions sheet of GCB's raw national data file. + + """ + tb = tb.copy() + + error = f"Sheet in national data file for {column_name} has changed (consider changing 'skiprows')." + assert tb.columns[1] == "Afghanistan", error + + # The zeroth column is expected to be year. + tb = tb.rename(columns={tb.columns[0]: "year"}) + + # Each column represents a country; then the final columns are regions, "Bunkers", and "Statistical Difference". + # Keep "Bunkers", but remove "Statistical Difference" (which is almost completely empty). + # In fact "Bunkers" is a global variable (I don't know why it is included at the national level), but this will be + # handled at the garden step. + + # Remove unnecessary column. + tb = tb.drop(columns=["Statistical Difference"]) + + # Convert from wide to long format dataframe. + tb = tb.melt(id_vars=["year"]).rename(columns={"variable": "country", "value": column_name}) + + # Set an index and sort row and columns conveniently. + tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Rename table. + tb.metadata.short_name = f"global_carbon_budget_{column_name}" + + return tb + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshots. 
+ snap_fossil_co2 = paths.load_snapshot("global_carbon_budget_fossil_co2_emissions.csv") + snap_global = paths.load_snapshot("global_carbon_budget_global_emissions.xlsx") + snap_national = paths.load_snapshot("global_carbon_budget_national_emissions.xlsx") + snap_land_use = paths.load_snapshot("global_carbon_budget_land_use_change_emissions.xlsx") + + # Load data from fossil CO2 emissions. + tb_fossil_co2 = snap_fossil_co2.read() + + # Load historical budget from the global emissions file. + tb_historical = snap_global.read(sheet_name="Historical Budget", skiprows=15) + + # Load land-use emissions. + tb_land_use = snap_land_use.read(sheet_name="BLUE", skiprows=7) + + # Load production-based national emissions. + tb_production = snap_national.read(sheet_name="Territorial Emissions", skiprows=11) + + # Load consumption-based national emissions. + tb_consumption = snap_national.read(sheet_name="Consumption Emissions", skiprows=8) + + # + # Process data. + # + # Prepare data for fossil CO2 emissions. + tb_fossil_co2 = prepare_fossil_co2(tb_fossil_co2=tb_fossil_co2) + + # Prepare data for historical emissions. + tb_historical = prepare_historical_budget(tb_historical=tb_historical) + + # Prepare data for land-use emissions. + tb_land_use = prepare_land_use_emissions(tb_land_use=tb_land_use) + + # Prepare data for production-based emissions, from the file of national emissions. + tb_production = prepare_national_emissions(tb=tb_production, column_name="production_emissions") + + # Prepare data for consumption-based emissions, from the file of national emissions. + tb_consumption = prepare_national_emissions(tb=tb_consumption, column_name="consumption_emissions") + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset( + dest_dir, + tables=[tb_fossil_co2, tb_historical, tb_land_use, tb_production, tb_consumption], + default_metadata=snap_fossil_co2.metadata, + check_variables_metadata=True, + ) + ds_meadow.save() diff --git a/etl/steps/data/meadow/gcp/2024-11-21/global_carbon_budget.py b/etl/steps/data/meadow/gcp/2024-11-21/global_carbon_budget.py new file mode 100644 index 00000000000..1289f28ad70 --- /dev/null +++ b/etl/steps/data/meadow/gcp/2024-11-21/global_carbon_budget.py @@ -0,0 +1,212 @@ +"""Load a snapshot and create a meadow dataset. + +It combines the following snapshots: +- GCP's Fossil CO2 emissions (long-format csv). +- GCP's official GCB global emissions (excel file) containing global bunker fuel and land-use change emissions. +- GCP's official GCB national emissions (excel file) containing consumption-based emissions for each country. + - Production-based emissions from this file are also used, but just to include total emissions of regions + according to GCP (e.g. "Africa (GCP)") and for sanity checks. +- GCP's official GCB national land-use change emissions (excel file) with land-use change emissions for each country. + +""" + +from owid.catalog import Table +from structlog import get_logger + +from etl.helpers import PathFinder, create_dataset + +# Initialize logger. +log = get_logger() + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def prepare_fossil_co2(tb_fossil_co2: Table) -> Table: + # Set an appropriate index and sort conveniently. + tb_fossil_co2 = tb_fossil_co2.set_index(["Country", "Year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Ensure all columns are snake-case. 
+ tb_fossil_co2 = tb_fossil_co2.underscore() + + return tb_fossil_co2 + + +def prepare_historical_budget(tb_historical: Table) -> Table: + """Select variables and prepare the historical budget sheet of GCB's raw global data file. + + Parameters + ---------- + tb_historical : Table + Historical budget sheet of GCB's raw global data file. + + Returns + ------- + tb_historical : Table + Historical budget after selecting variables and processing them. + + """ + # Sanity check. + error = "'Historical Budget' sheet in global data file has changed (consider changing 'skiprows')." + assert tb_historical.columns[0] == "Year", error + + # Columns to select in historical budget and how to rename them. + columns = { + "Year": "year", + "fossil emissions excluding carbonation": "global_fossil_emissions", + "land-use change emissions": "global_land_use_change_emissions", + } + tb_historical = tb_historical[list(columns)].rename(columns=columns) + + # Add column for country (to be able to combine this with the national data). + tb_historical["country"] = "World" + + # Set an index and sort row and columns conveniently. + tb_historical = tb_historical.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Rename table. + tb_historical.metadata.short_name = "global_carbon_budget_historical_budget" + + return tb_historical + + +def prepare_land_use_emissions(tb_land_use: Table) -> Table: + """Prepare data from a specific sheet of the land-use change data file. + + Parameters + ---------- + tb_land_use : Table + Data from a specific sheet of the land-use change emissions data file. + + Returns + ------- + tb_land_use : Table + Processed land-use change emissions data. + + """ + tb_land_use = tb_land_use.copy() + + # Sanity check. + error = "'BLUE' sheet in national land-use change data file has changed (consider changing 'skiprows')." + assert tb_land_use.columns[1] == "Afghanistan", error + + # Rename year column. + tb_land_use = tb_land_use.rename(columns={tb_land_use.columns[0]: "year"}) + + # Ignore countries that have no data. + tb_land_use = tb_land_use.dropna(axis=1, how="all") + + # Remove rows that are either empty, or have some other additional operation (e.g. 2013-2022). + tb_land_use = tb_land_use[tb_land_use["year"].astype(str).str.match(r"^\d{4}$")].reset_index(drop=True) + + # Restructure data to have a column for country and another for emissions. + tb_land_use = tb_land_use.melt(id_vars="year", var_name="country", value_name="emissions") + + # Set an index and sort row and columns conveniently. + tb_land_use = tb_land_use.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Rename table. + tb_land_use.metadata.short_name = "global_carbon_budget_land_use_change" + + return tb_land_use + + +def prepare_national_emissions(tb: Table, column_name: str) -> Table: + """Select variables and prepare the territorial emissions (or the consumption emissions) sheet of GCB's raw national + data file. + + Parameters + ---------- + tb : Table + Territorial emissions (or consumption emissions) sheet of GCB's raw national data file. + column_name : str + Name to assign to emissions column to be generated. + + Returns + ------- + tb_national : Table + Processed territorial (or consumption) emissions sheet of GCB's raw national data file. + + """ + tb = tb.copy() + + error = f"Sheet in national data file for {column_name} has changed (consider changing 'skiprows')." 
+ assert tb.columns[1] == "Afghanistan", error + + # The zeroth column is expected to be year. + tb = tb.rename(columns={tb.columns[0]: "year"}) + + # Each column represents a country; then the final columns are regions, "Bunkers", and "Statistical Difference". + # Keep "Bunkers", but remove "Statistical Difference" (which is almost completely empty). + # In fact "Bunkers" is a global variable (I don't know why it is included at the national level), but this will be + # handled at the garden step. + + # Remove unnecessary column. + tb = tb.drop(columns=["Statistical Difference"]) + + # Convert from wide to long format dataframe. + tb = tb.melt(id_vars=["year"]).rename(columns={"variable": "country", "value": column_name}) + + # Set an index and sort row and columns conveniently. + tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Rename table. + tb.metadata.short_name = f"global_carbon_budget_{column_name}" + + return tb + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshots. + snap_fossil_co2 = paths.load_snapshot("global_carbon_budget_fossil_co2_emissions.csv") + snap_global = paths.load_snapshot("global_carbon_budget_global_emissions.xlsx") + snap_national = paths.load_snapshot("global_carbon_budget_national_emissions.xlsx") + snap_land_use = paths.load_snapshot("global_carbon_budget_land_use_change_emissions.xlsx") + + # Load data from fossil CO2 emissions. + tb_fossil_co2 = snap_fossil_co2.read() + + # Load historical budget from the global emissions file. + tb_historical = snap_global.read(sheet_name="Historical Budget", skiprows=15) + + # Load land-use emissions. + tb_land_use = snap_land_use.read(sheet_name="BLUE", skiprows=7) + + # Load production-based national emissions. + tb_production = snap_national.read(sheet_name="Territorial Emissions", skiprows=11) + + # Load consumption-based national emissions. + tb_consumption = snap_national.read(sheet_name="Consumption Emissions", skiprows=8) + + # + # Process data. + # + # Prepare data for fossil CO2 emissions. + tb_fossil_co2 = prepare_fossil_co2(tb_fossil_co2=tb_fossil_co2) + + # Prepare data for historical emissions. + tb_historical = prepare_historical_budget(tb_historical=tb_historical) + + # Prepare data for land-use emissions. + tb_land_use = prepare_land_use_emissions(tb_land_use=tb_land_use) + + # Prepare data for production-based emissions, from the file of national emissions. + tb_production = prepare_national_emissions(tb=tb_production, column_name="production_emissions") + + # Prepare data for consumption-based emissions, from the file of national emissions. + tb_consumption = prepare_national_emissions(tb=tb_consumption, column_name="consumption_emissions") + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset( + dest_dir, + tables=[tb_fossil_co2, tb_historical, tb_land_use, tb_production, tb_consumption], + default_metadata=snap_fossil_co2.metadata, + check_variables_metadata=True, + ) + ds_meadow.save() diff --git a/etl/steps/data/meadow/ggdc/2022-12-23/maddison_database.py b/etl/steps/data/meadow/ggdc/2022-12-23/maddison_database.py index 54f5097a4e1..e6c471374f3 100644 --- a/etl/steps/data/meadow/ggdc/2022-12-23/maddison_database.py +++ b/etl/steps/data/meadow/ggdc/2022-12-23/maddison_database.py @@ -14,9 +14,9 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("maddison_database.xlsx") # Load data from snapshot. 
- tb_pop = snap.read(sheet_name="Population", skiprows=2) - tb_gdp = snap.read(sheet_name="GDP", skiprows=2) - tb_gdppc = snap.read(sheet_name="PerCapita GDP", skiprows=2) + tb_pop = snap.read(safe_types=False, sheet_name="Population", skiprows=2) + tb_gdp = snap.read(safe_types=False, sheet_name="GDP", skiprows=2) + tb_gdppc = snap.read(safe_types=False, sheet_name="PerCapita GDP", skiprows=2) # # Process data. diff --git a/etl/steps/data/meadow/ggdc/2024-01-19/maddison_federico_paper.py b/etl/steps/data/meadow/ggdc/2024-01-19/maddison_federico_paper.py index 717befc130f..adb3869e9f9 100644 --- a/etl/steps/data/meadow/ggdc/2024-01-19/maddison_federico_paper.py +++ b/etl/steps/data/meadow/ggdc/2024-01-19/maddison_federico_paper.py @@ -17,11 +17,11 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("maddison_federico_paper.xlsx") # Load data from snapshot. - tb_africa = snap.read(sheet_name="Africa", skiprows=3) - tb_americas = snap.read(sheet_name="Americas", skiprows=3) - tb_asia = snap.read(sheet_name="Asia", skiprows=3) - tb_europe = snap.read(sheet_name="Europe", skiprows=3) - tb_oceania = snap.read(sheet_name="Oceania", skiprows=3) + tb_africa = snap.read(safe_types=False, sheet_name="Africa", skiprows=3) + tb_americas = snap.read(safe_types=False, sheet_name="Americas", skiprows=3) + tb_asia = snap.read(safe_types=False, sheet_name="Asia", skiprows=3) + tb_europe = snap.read(safe_types=False, sheet_name="Europe", skiprows=3) + tb_oceania = snap.read(safe_types=False, sheet_name="Oceania", skiprows=3) # # Process data. diff --git a/etl/steps/data/meadow/ggdc/2024-04-26/maddison_project_database.py b/etl/steps/data/meadow/ggdc/2024-04-26/maddison_project_database.py index 0f6d18ac706..3d27ffcd7bf 100644 --- a/etl/steps/data/meadow/ggdc/2024-04-26/maddison_project_database.py +++ b/etl/steps/data/meadow/ggdc/2024-04-26/maddison_project_database.py @@ -36,8 +36,8 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("maddison_project_database.xlsx") # Load data from snapshot. - tb = snap.read(sheet_name="Full data") - tb_regions = snap.read(sheet_name="Regional data", skiprows=1) + tb = snap.read(safe_types=False, sheet_name="Full data") + tb_regions = snap.read(safe_types=False, sheet_name="Regional data", skiprows=1) # # Process data. diff --git a/etl/steps/data/meadow/happiness/2024-06-09/happiness.py b/etl/steps/data/meadow/happiness/2024-06-09/happiness.py index 4a72258cd8a..87077a6ba5c 100644 --- a/etl/steps/data/meadow/happiness/2024-06-09/happiness.py +++ b/etl/steps/data/meadow/happiness/2024-06-09/happiness.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("happiness.xls") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # add year and country columns tb["year"] = 2023 # 2024 report -> 2023 data diff --git a/etl/steps/data/meadow/health/2023-08-09/unaids.py b/etl/steps/data/meadow/health/2023-08-09/unaids.py index 1e4dc1f554c..1aba1e3fbdb 100644 --- a/etl/steps/data/meadow/health/2023-08-09/unaids.py +++ b/etl/steps/data/meadow/health/2023-08-09/unaids.py @@ -22,7 +22,7 @@ def run(dest_dir: str) -> None: # Load data from snapshot. log.info("health.unaids: loading data from snapshot") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
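A note on the `snap.read(safe_types=False)` change repeated throughout these meadow steps: the exact semantics of `safe_types` live in owid-catalog's snapshot-reading code, which is not part of this diff. As a rough, hypothetical sketch of the trade-off it presumably controls (names and behavior assumed, not taken from the library):

import pandas as pd


def read_snapshot(path: str, safe_types: bool = True) -> pd.DataFrame:
    # Hypothetical sketch, not owid-catalog's actual implementation.
    df = pd.read_csv(path)
    if safe_types:
        # Nullable dtypes (Int64, string, boolean) represent missing values
        # without silently casting integers to float or text to object, at
        # some cost in speed and memory on large tables.
        df = df.convert_dtypes()
    return df

Under that reading, disabling the conversion for large, frequently re-read tables (as done here) trades dtype strictness for faster loads.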
diff --git a/etl/steps/data/meadow/health/2024-03-21/gmh_countdown.py b/etl/steps/data/meadow/health/2024-03-21/gmh_countdown.py index ad886d20853..b6ef55f5740 100644 --- a/etl/steps/data/meadow/health/2024-03-21/gmh_countdown.py +++ b/etl/steps/data/meadow/health/2024-03-21/gmh_countdown.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gmh_countdown.xlsx") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/health/2024-04-02/organ_donation_and_transplantation.py b/etl/steps/data/meadow/health/2024-04-02/organ_donation_and_transplantation.py index 13ae7e42859..e3563ac2625 100644 --- a/etl/steps/data/meadow/health/2024-04-02/organ_donation_and_transplantation.py +++ b/etl/steps/data/meadow/health/2024-04-02/organ_donation_and_transplantation.py @@ -12,7 +12,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and read its data. snap = paths.load_snapshot("organ_donation_and_transplantation.xlsx") - tb = snap.read() + tb = snap.read(safe_types=False) # Population column has a bad formatting, so, for now, store it as a string. tb = tb.astype({"POPULATION": str}) diff --git a/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py b/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py index 9d808e0ffef..15747cb0d33 100644 --- a/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py +++ b/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py @@ -12,7 +12,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and read its data. snap = paths.load_snapshot("polio_free_countries.csv") - tb = snap.read() + tb = snap.read(safe_types=False) tb = tb.format() # # Save outputs. diff --git a/etl/steps/data/meadow/health/2024-04-12/polio_status.py b/etl/steps/data/meadow/health/2024-04-12/polio_status.py index b44471e4445..b5e2ad9c0ed 100644 --- a/etl/steps/data/meadow/health/2024-04-12/polio_status.py +++ b/etl/steps/data/meadow/health/2024-04-12/polio_status.py @@ -12,7 +12,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and read its data. snap = paths.load_snapshot("polio_status.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # Process data. # diff --git a/etl/steps/data/meadow/health/2024-06-11/isaps_plastic_surgery.py b/etl/steps/data/meadow/health/2024-06-11/isaps_plastic_surgery.py index 90ff298e8a6..082bdd219d2 100644 --- a/etl/steps/data/meadow/health/2024-06-11/isaps_plastic_surgery.py +++ b/etl/steps/data/meadow/health/2024-06-11/isaps_plastic_surgery.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("isaps_plastic_surgery.xlsx") # Load data from snapshot. - tb = snap.read(sheet_name="Global") + tb = snap.read(safe_types=False, sheet_name="Global") # Add entity World tb["country"] = "World" diff --git a/etl/steps/data/meadow/health/2024-08-23/eurostat_cancer.py b/etl/steps/data/meadow/health/2024-08-23/eurostat_cancer.py index 7a540d747ca..411ae521693 100644 --- a/etl/steps/data/meadow/health/2024-08-23/eurostat_cancer.py +++ b/etl/steps/data/meadow/health/2024-08-23/eurostat_cancer.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("eurostat_cancer.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
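Most steps in this batch funnel through `tb.format([...])` before saving. The convention it bundles is spelled out manually in `prepare_fossil_co2` above (set a verified index, then sort rows and columns); a minimal pandas sketch of that contract, assuming `underscore` is a plain snake-casing of column names:

import re

import pandas as pd


def format_table(df: pd.DataFrame, keys: list) -> pd.DataFrame:
    # Sketch of the Table.format() convention; the real implementation lives
    # in owid-catalog and also propagates table and variable metadata.
    # Snake-case all column names.
    df = df.rename(columns=lambda c: re.sub(r"\W+", "_", str(c)).strip("_").lower())
    # Set a verified index and sort rows and columns deterministically.
    return df.set_index(keys, verify_integrity=True).sort_index().sort_index(axis=1)

`verify_integrity=True` is what turns duplicate key rows (e.g. a repeated country-year pair) into an immediate error instead of a silently duplicated index.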
diff --git a/etl/steps/data/meadow/health/2024-09-05/seattle_pathogens.py b/etl/steps/data/meadow/health/2024-09-05/seattle_pathogens.py index 4cbc5a33923..bc7d4ca04b1 100644 --- a/etl/steps/data/meadow/health/2024-09-05/seattle_pathogens.py +++ b/etl/steps/data/meadow/health/2024-09-05/seattle_pathogens.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("seattle_pathogens.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/hmd/2024-11-19/hfd.py b/etl/steps/data/meadow/hmd/2024-11-19/hfd.py new file mode 100644 index 00000000000..b785f4c433a --- /dev/null +++ b/etl/steps/data/meadow/hmd/2024-11-19/hfd.py @@ -0,0 +1,343 @@ +"""Load a snapshot and create a meadow dataset. + +Each table has different dimensions. I had to explore them and decide which columns to use as index. Find below the list of columns, tagged according to the columns used to index them: + +2y adjtfrRR +2y adjtfrRRbo + +3y asfrRR +3y asfrRRbo +4y asfrTR +4y asfrTRbo +3c asfrVH +3c asfrVHbo +4A asfrVV +4A asfrVVbo + +3y birthsRR +3y birthsRRbo +4y birthsTR +4y birthsTRbo +3c birthsVH +3c birthsVHbo +4A birthsVV +4A birthsVVbo + +2y cbrRR +2y cbrRRbo + +3c ccfrVH +3c ccfrVHbo + +3x cft + +3y cpfrRR +3y cpfrRRbo +4A cpfrVV +4A cpfrVVbo + +3y exposRR +3y exposRRpa +3y exposRRpac +4y exposTR +3c exposVH +4A exposVV + +2y mabRR +2y mabRRbo +2c mabVH +2c mabVHbo + +3y mi +3y mic + +2y patfr +2y patfrc + +3X pft +3X pftc + +2y pmab +2y pmabc + +2c pprVHbo + +2y sdmabRR +2y sdmabRRbo +2c sdmabVH +2c sdmabVHbo + +2y tfrRR +2y tfrRRbo +2c tfrVH +2c tfrVHbo + +2y totbirthsRR +2y totbirthsRRbo + + +where: + +2y: code, year + + adjtfrRR + Tempo-adjusted total fertility rates, Bongaarts-Feeney method + adjTFR + adjtfrRRbo + Tempo-adjusted total fertility rates by birth order, Bongaarts-Feeney method + adjTFR adjTFR1 adjTFR2 adjTFR3 adjTFR4 adjTFR5p + + + cbrRR + Crude birth rate + CBR + cbrRRbo + Crude birth rate by birth order + CBR CBR1 CBR2 CBR3 CBR4 CBR5p + + mabRR + Period mean ages at birth and period mean ages at birth by age 40 + MAB MAB40 + mabRRbo + Period mean ages at birth by birth order and period mean ages at birth by birth order by age 40 + MAB1 MAB2 MAB3 MAB4 MAB5p MAB40_1 MAB40_2 MAB40_3 MAB40_4 MAB40_5p + + patfr + Parity- and age-adjusted total fertility rate + PATFR PATFR1 PATFR2 PATFR3 PATFR4 PATFR5p + patfrc + Parity- and age-adjusted total fertility rate (based on parity distribution from population census) + PATFR PATFR1 PATFR2 PATFR3 PATFR4 PATFR5p + + pmab + Period table mean ages at birth by birth order + TMAB TMAB1 TMAB2 TMAB3 TMAB4 TMAB5p + pmabc + Period table mean ages at birth by birth order (based on parity distribution from population census) + TMAB TMAB1 TMAB2 TMAB3 TMAB4 TMAB5p + + sdmabRR + Standard deviation in period mean ages at birth and standard deviation in period mean ages at birth by age 40 + sdMAB sdMAB40 + sdmabRRbo + Standard deviation in period mean ages at birth by birth order and standard deviation in period mean ages at birth by birth order by age 40 + sdMAB sdMAB1 sdMAB2 sdMAB3 sdMAB4 sdMAB5p sdMAB40 sdMAB40_1 sdMAB40_2 sdMAB40_3 sdMAB40_4 sdMAB40_5p + + tfrRR + Period total fertility rates and period total fertility rates by age 40 + TFR TFR40 + tfrRRbo + Period total fertility rates by birth order and period total fertility rates by birth order by age 40 + TFR TFR1 TFR2 TFR3 TFR4 TFR5p TFR40 TFR40_1 TFR40_2 TFR40_3 TFR40_4 TFR40_5p + + + => OUTPUT + + columns + adjTFR 
+    CBR
+    MAB
+    MAB40
+    sdMAB
+    sdMAB40
+    TFR
+    TFR40
+    PATFR
+    PATFR_c
+    TMAB
+    TMAB_c
+  dimensions
+    code, year, birth_order
+
+3y: code, year, age
+4y: code, year, age, cohort
+2c: code, cohort
+3c: code, cohort, age
+3x: code, cohort, x
+3X: code, year, x
+4A: code, year, cohort, ardy
+----
+
+3y asfrRR
+3y asfrRRbo
+4y asfrTR
+4y asfrTRbo
+3c asfrVH
+3c asfrVHbo
+4A asfrVV
+4A asfrVVbo
+
+3y birthsRR
+3y birthsRRbo
+4y birthsTR
+4y birthsTRbo
+3c birthsVH
+3c birthsVHbo
+4A birthsVV
+4A birthsVVbo
+
+
+3c ccfrVH
+3c ccfrVHbo
+
+3x cft
+
+3y cpfrRR
+3y cpfrRRbo
+4A cpfrVV
+4A cpfrVVbo
+
+3y exposRR
+3y exposRRpa
+3y exposRRpac
+4y exposTR
+3c exposVH
+4A exposVV
+
+2c mabVH
+2c mabVHbo
+
+3y mi
+3y mic
+
+3X pft
+3X pftc
+
+2c pprVHbo
+
+2c sdmabVH
+2c sdmabVHbo
+
+2c tfrVH
+2c tfrVHbo
+
+2y totbirthsRR
+2y totbirthsRRbo
+
+"""
+from pathlib import Path
+
+import owid.catalog.processing as pr
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+# Behavior 1:
+# Mostly use ["code", "year", "age"], except:
+# - When the table name ends with 'TR' or 'TRbo': use ["code", "year", "age", "cohort"]
+# - When the table name ends with 'VH' or 'VHbo': use ["code", "cohort", "age"]
+# - When the table name ends with 'VV' or 'VVbo': use ["code", "year", "cohort", "ardy"]
+cols_1 = (
+    "asfr",
+    "births",
+    "ccfr",
+    "cpfr",
+    "expos",
+    "mi",
+)
+# Behavior 2: Use ["code", "cohort"] always
+cols_2 = ("pprVHbo",)
+# Behavior 3: Use ["code", "cohort", "x"] (cft) or ["code", "year", "x"] (pft, pftc)
+cols_3 = (
+    "cft",
+    "pft",
+)
+
+
+def get_cols_format(tb, short_name):
+    cols_index = None  # Default value
+
+    # 2y: code, year
+    if (short_name.endswith(("RR", "RRbo")) and "Age" not in tb.columns) or (
+        short_name in {"patfr", "patfrc", "pmab", "pmabc"}
+    ):
+        cols_index = ["code", "year"]
+
+    # 3y: code, year, age
+    elif (short_name.endswith(("RR", "RRbo", "RRpa", "RRpac")) and "Age" in tb.columns) or (
+        short_name in {"mi", "mic"}
+    ):
+        cols_index = ["code", "year", "age"]
+
+    # 4y: code, year, age, cohort
+    elif short_name.endswith(("TR", "TRbo")):
+        cols_index = ["code", "year", "age", "cohort"]
+
+    # 2c: code, cohort
+    elif short_name.endswith(("VH", "VHbo")) and "Age" not in tb.columns:
+        cols_index = ["code", "cohort"]
+
+    # 3c: code, cohort, age
+    elif short_name.endswith(("VH", "VHbo")) and "Age" in tb.columns:
+        cols_index = ["code", "cohort", "age"]
+
+    # 3x: code, cohort, x
+    elif short_name == "cft":
+        cols_index = ["code", "cohort", "x"]
+
+    # 3X: code, year, x
+    elif short_name in {"pft", "pftc"}:
+        cols_index = ["code", "year", "x"]
+
+    # 4A: code, year, cohort, ardy
+    elif short_name.endswith(("VV", "VVbo")):
+        cols_index = ["code", "year", "cohort", "ardy"]
+
+    else:
+        raise Exception(f"No index columns defined for this table! {short_name}")
+    return cols_index
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot.
+    snap = paths.load_snapshot("hfd.zip")
+
+    # Load data from snapshot.
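+    # For illustration, the mapping above resolves file short names as follows (dimension
+    # codes as in the module docstring): "adjtfrRR" -> ["code", "year"] (2y);
+    # "asfrVV" -> ["code", "year", "cohort", "ardy"] (4A); and "ccfrVHbo", which has an
+    # "Age" column -> ["code", "cohort", "age"] (3c).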
+    tbs = []
+    with snap.extract_to_tempdir() as tmp_dir:
+        p = Path(tmp_dir)
+        files = sorted(p.glob("Files/zip_w/*.txt"))
+        for f in files:
+            # Read the content of the text file
+            tb_ = pr.read_csv(
+                f,
+                sep=r"\s+",
+                skiprows=2,
+                na_values=["."],
+                metadata=snap.to_table_metadata(),
+                origin=snap.m.origin,
+            )
+            short_name = f.stem
+
+            # Detect the columns to use to index the table (accounting for dimensions)
+            cols_format = get_cols_format(tb_, short_name)
+
+            # Rename columns that differ only by case (e.g. "L0x" vs "l0x"), so that
+            # snake-casing does not make them collide.
+            if short_name in {"pft", "pftc"}:
+                tb_ = tb_.rename(
+                    columns={
+                        "L0x": "cap_l0x",
+                        "L1x": "cap_l1x",
+                        "L2x": "cap_l2x",
+                        "L3x": "cap_l3x",
+                        "L4x": "cap_l4x",
+                    }
+                )
+            # Format
+            tb_ = tb_.format(cols_format, short_name=short_name)
+            tbs.append(tb_)
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with the same metadata as the snapshot.
+    ds_meadow = create_dataset(dest_dir, tables=tbs, check_variables_metadata=True, default_metadata=snap.metadata)
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
diff --git a/etl/steps/data/meadow/hmd/2024-12-01/hmd.py b/etl/steps/data/meadow/hmd/2024-12-01/hmd.py
new file mode 100644
index 00000000000..d4a56860adf
--- /dev/null
+++ b/etl/steps/data/meadow/hmd/2024-12-01/hmd.py
@@ -0,0 +1,325 @@
+"""Load a snapshot and create a meadow dataset."""
+
+import re
+from io import StringIO
+from pathlib import Path
+from typing import List
+
+import owid.catalog.processing as pr
+from owid.catalog import Table
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+# Life tables
+FOLDERS_LT = [
+    "lt_male",
+    "lt_female",
+    "lt_both",
+    "c_lt_male",
+    "c_lt_female",
+    "c_lt_both",
+]
+REGEX_LT = (
+    r"(?P<country>[a-zA-Z\-\s,]+), Life tables \((?P<type>[a-zA-Z]+) (?P<format>\d+x\d+)\), (?P<sex>[a-zA-Z]+)"
+    r"\tLast modified: (?P<last_modified>\d+ [a-zA-Z]{3} \d+); Methods Protocol: v\d+ \(\d+\)\n\n(?P<data>(?s:.)*)"
+)
+COLUMNS_RENAME_LT = {
+    "mx": "central_death_rate",
+    "qx": "probability_of_death",
+    "ax": "average_survival_length",
+    "lx": "number_survivors",
+    "dx": "number_deaths",
+    "Lx": "number_person_years_lived",
+    "Tx": "number_person_years_remaining",
+    "ex": "life_expectancy",
+}
+
+# Exposures
+FOLDERS_EXPOSURES = [
+    "c_exposures",
+    "exposures",
+]
+REGEX_EXP = (
+    r"(?P<country>[a-zA-Z\-\s,]+), (?P<name>Exposure) to risk \((?P<type>[a-zA-Z]+) (?P<format>\d+x\d+)\),\s\tLast modified: "
+    r"(?P<last_modified>\d+ [a-zA-Z]{3} \d+); Methods Protocol: v\d+ \(\d+\)\n\n(?P<data>(?s:.)*)"
+)
+
+# Mortality
+FOLDERS_MOR = [
+    "deaths",
+]
+REGEX_MOR = (
+    r"(?P<country>[a-zA-Z\-\s,]+), (?P<name>Deaths) \((?P<type>[a-zA-Z]+) (?P<format>\d+x\d+|Lexis triangle)\),\s\tLast modified: "
+    r"(?P<last_modified>\d+ [a-zA-Z]{3} \d+); Methods Protocol: v\d+ \(\d+\)\n\n(?P<data>(?s:.)*)"
+)
+# Population
+FOLDERS_POP = [
+    "population",
+]
+REGEX_POP = (
+    r"(?P<country>[a-zA-Z\-\s,]+?),?\s?(?P<name>Population) size \((?P<format>1\-year|abridged)\)\s+Last modified: "
+    r"(?P<last_modified>\d+ [a-zA-Z]{3} \d+)(; Methods Protocol: v\d+ \(\d+\)|,MPv\d \(in development\))\n\n(?P<data>(?s:.)*)"
+)
+# Births
+FOLDERS_BIRTHS = [
+    "births",
+]
+REGEX_BIRTHS = (
+    r"(?P<country>[a-zA-Z\-\s,]+),\s+(?P<name>Births) \((?P<format>1\-year)\)\s+Last modified: "
+    r"(?P<last_modified>\d+ [a-zA-Z]{3} \d+); Methods Protocol: v\d+ \(\d+\)\n\n(?P<data>(?s:.)*)"
+)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot.
+    snap = paths.load_snapshot("hmd.zip")
+
+    # Load data from snapshot.
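+    # Each source TXT file opens with a one-line header that the regexes above parse into
+    # named groups. A hypothetical life-table header, shaped to match REGEX_LT:
+    #   Sweden, Life tables (period 1x1), Females\tLast modified: 05 Aug 2024; Methods Protocol: v6 (2017)
+    # It is followed by a blank line and the whitespace-separated data block captured as "data".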
+    with snap.extract_to_tempdir() as tmpdir:
+        # Population
+        tb_pop = make_tb(
+            path=Path(tmpdir),
+            main_folders=FOLDERS_POP,
+            regex=REGEX_POP,
+            snap=snap,
+        )
+
+        # Life tables
+        tb_lt = make_tb(
+            path=Path(tmpdir),
+            main_folders=FOLDERS_LT,
+            regex=REGEX_LT,
+            snap=snap,
+        )
+        # Exposure
+        tb_exp = make_tb(
+            path=Path(tmpdir),
+            main_folders=FOLDERS_EXPOSURES,
+            regex=REGEX_EXP,
+            snap=snap,
+        )
+        # Mortality
+        tb_m = make_tb(
+            path=Path(tmpdir),
+            main_folders=FOLDERS_MOR,
+            regex=REGEX_MOR,
+            snap=snap,
+        )
+
+        # Births
+        tb_bi = make_tb(
+            path=Path(tmpdir),
+            main_folders=FOLDERS_BIRTHS,
+            regex=REGEX_BIRTHS,
+            snap=snap,
+        )
+
+    # Life tables
+    ## Rename columns: snake-casing would otherwise map both "Lx" and "lx" to "lx", which
+    ## would cause an error when setting the index.
+    tb_lt = tb_lt.rename(columns=COLUMNS_RENAME_LT)
+
+    # Population
+    ## Invert 'abridged' <-> '1-year' in the format column
+    message = "Types 'abridged' and '1-year' might not be reversed anymore!"
+    assert not tb_pop.loc[tb_pop["format"] == "abridged", "Age"].str.contains("-").any(), message
+    assert tb_pop.loc[tb_pop["format"] == "1-year", "Age"].str.contains("80-84").any(), message
+    tb_pop["format"] = tb_pop["format"].map(
+        lambda x: "1-year" if x == "abridged" else "abridged" if x == "1-year" else x
+    )
+
+    # Check missing values
+    _check_nas(tb_lt, 0.01, 14)
+    _check_nas(tb_exp, 0.23, 47)
+    _check_nas(tb_m, 0.001, 1)
+    _check_nas(tb_pop, 0.001, 1)
+
+    # Ensure correct year dtype
+    tb_lt = _clean_year(tb_lt)
+    tb_exp = _clean_year(tb_exp)
+    tb_m = _clean_year(tb_m)
+    tb_bi = _clean_year(tb_bi)
+    tb_pop = _clean_population_type(tb_pop)
+
+    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
+    tables = [
+        tb_lt.format(["country", "year", "sex", "age", "type", "format"], short_name="life_tables"),
+        tb_exp.format(["country", "year", "sex", "age", "type", "format"], short_name="exposures"),
+        tb_m.format(["country", "year", "sex", "age", "type", "format"], short_name="deaths"),
+        tb_pop.format(["country", "year", "sex", "age", "format"], short_name="population"),
+        tb_bi.format(["country", "year", "sex", "format"], short_name="births"),
+    ]
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with the same metadata as the snapshot.
+    ds_meadow = create_dataset(
+        dest_dir,
+        tables=tables,
+        check_variables_metadata=True,
+        default_metadata=snap.metadata,
+    )
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
+
+
+def make_tb(path: Path, main_folders: List[str], regex: str, snap) -> Table:
+    """Create table from multiple category folders.
+
+    It inspects the content in `main_folders` (should be in `path`), and looks for TXT files to parse into tables.
+
+    The output is a table with the relevant indicators and dimensions for all the categories.
+
+    Arguments:
+        path: Path where the HMD export is located.
+        main_folders: List of folders to consider in `path`. These should typically be categories, which
+            group different individual indicators.
+        regex: Regex to extract the metadata for a set of TXT files found in main_folders. We need this
+            because the structure of the header in the TXT files slightly varies depending on
+            the indicator.
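+        snap: Snapshot object, used to propagate metadata (and origin) to the parsed tables.
+
+    For example, a hypothetical call `make_tb(path, ["deaths"], REGEX_MOR, snap)` reads every
+    TXT file under `path/deaths/<indicator>/`, parses each file's header with REGEX_MOR, and
+    concatenates the results into a single table with country, sex, type and format dimensions.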
+ """ + # List with all relevant tables + tbs = [] + # Iterate over each top-level folder + for category_folder in main_folders: + main_folder_path = path / category_folder + if not main_folder_path.is_dir(): + raise FileNotFoundError(f"Folder {main_folder_path} not found in {path}") + # Iterate over each indicator folder + for indicator_path in main_folder_path.iterdir(): + if "lexis" in indicator_path.name: + continue + if indicator_path.is_dir(): + # Read all TXT files in the indicator folder, and put them as a single table + paths.log.info(f"Creating list of tables from available files in {path}...") + files = list(indicator_path.glob("*.txt")) + tbs_ = [make_tb_from_txt(f, regex, snap) for f in files] + tbs.extend(tbs_) + # Concatenate all dataframes + tb = pr.concat(tbs, ignore_index=True) + return tb + + +def make_tb_from_txt(text_path: Path, regex: str, snap) -> Table: + """Create a table from a TXT file.""" + # print(text_path) + # Extract fields + groups = extract_fields(regex, text_path) + + # Build df + tb = parse_table(groups["data"], snap) + + # Optional melt + if ("Female" in tb.columns) and ("Male" in tb.columns): + id_vars = [col for col in ["Age", "Year"] if col in tb.columns] + if "name" not in groups: + raise ValueError( + f"Indicator name not found in {text_path}! Please revise that source files' content matches FILE_REGEX." + ) + tb = tb.melt(id_vars=id_vars, var_name="sex", value_name=groups["name"]) + + # Add dimensions + tb = tb.assign( + country=groups["country"], + ) + + # Optional sex column + if "sex" in groups: + tb["sex"] = groups["sex"] + if "format" in groups: + tb["format"] = groups["format"] + if "type" in groups: + tb["type"] = groups["type"] + return tb + + +def extract_fields(regex: str, path: Path) -> dict: + """Structure the fields in the raw TXT file.""" + # Read single file + with open(path, "r") as f: + text = f.read() + # Get relevant fields + match = re.search(regex, text) + if match is not None: + groups = match.groupdict() + else: + raise ValueError(f"No match found in {f}! Please revise that source files' content matches FILE_REGEX.") + return groups + + +def parse_table(data_raw: str, snap): + """Given the raw data from the TXT file (as string) map it to a table.""" + tb_str = data_raw.strip() + tb_str = re.sub(r"\n\s+", "\n", tb_str) + tb_str = re.sub(r"[^\S\r\n]+", "\t", string=tb_str) + tb = pr.read_csv( + StringIO(tb_str), + sep="\t", + na_values=["."], + metadata=snap.to_table_metadata(), + origin=snap.m.origin, + ) + + return tb + + +def _check_nas(tb, missing_row_max, missing_countries_max): + """Check missing values & countries in data.""" + row_nans = tb.isna().any(axis=1) + assert ( + row_nans.sum() / len(tb) < missing_row_max + ), f"Too many missing values in life tables: {row_nans.sum()/len(tb)}" + + # Countries missing + countries_missing_data = tb.loc[row_nans, "country"].unique() + assert ( + len(countries_missing_data) / len(tb) < missing_countries_max + ), f"Too many missing values in life tables: {len(countries_missing_data)}" + + +def _clean_population_type(tb): + """Data provider notes the following: + + For populations with territorial changes, two sets of population estimates are given for years in which a territorial change occurred. The first set of estimates (identified as year "19xx-") refers to the population just before the territorial change, whereas the second set (identified as year "19xx+") refers to the population just after the change. 
+    For example, in France, the data for "1914-" cover the previous territory (i.e., as of December 31, 1913), whereas the data for "1914+" reflect the territorial boundaries as of January 1, 1914.
+
+    To avoid confusion and duplicates, whenever there are multiple entries for a year, we keep the "YYYY+" entry (i.e. the one reflecting the new territorial boundaries).
+    """
+    # Create a new column with the year.
+    regex = r"\b\d{4}\b"
+    tb["year"] = tb["Year"].astype("string").str.extract(f"({regex})", expand=False)
+    assert tb["year"].notna().all(), "Year extraction failed for some rows!"
+    tb["year"] = tb["year"].astype(int)
+
+    # Ensure raw year is as expected
+    assert (
+        tb.groupby(["country", "year", "Age", "sex", "format"]).Year.nunique().max() == 2
+    ), "Unexpected number of years (+/-)"
+
+    # Drop duplicate years, keeping YYYY+.
+    tb["Year"] = tb["Year"].astype("string")
+    tb = tb.sort_values("Year")
+    tb = tb.drop_duplicates(subset=["year", "Age", "sex", "country", "format"], keep="first").drop(columns="Year")
+
+    tb = tb.rename(columns={"year": "Year"})
+
+    # Additionally, remove year periods
+    tb = _clean_year(tb)
+
+    return tb
+
+
+def _clean_year(tb):
+    # Remove year ranges, and convert to int
+    flag = tb["Year"].astype("string").str.contains("-")
+    tb = tb.loc[~flag]
+    tb["Year"] = tb["Year"].astype("int")
+    return tb
diff --git a/etl/steps/data/meadow/hmd/2024-12-03/hmd_country.py b/etl/steps/data/meadow/hmd/2024-12-03/hmd_country.py
new file mode 100644
index 00000000000..ef50429e65c
--- /dev/null
+++ b/etl/steps/data/meadow/hmd/2024-12-03/hmd_country.py
@@ -0,0 +1,69 @@
+"""Load a snapshot and create a meadow dataset."""
+
+from pathlib import Path
+
+import owid.catalog.processing as pr
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot.
+    snap = paths.load_snapshot("hmd_country.zip")
+
+    # Load data from snapshot.
+    paths.log.info("Loading data from snapshot.")
+    tbs = []
+    with snap.extract_to_tempdir() as tmp_dir:
+        p = Path(tmp_dir)
+        files = p.glob("**/InputDB/*month.txt")
+        for f in files:
+            tb_ = pr.read_csv(
+                f,
+                na_values=["."],
+                metadata=snap.to_table_metadata(),
+                origin=snap.m.origin,
+            )
+            tb_.columns = tb_.columns.str.strip()
+            tb_ = tb_.rename(
+                columns={
+                    "NoteCode1": "Note1",
+                    "NoteCode2": "Note2",
+                    "NoteCode3": "Note3",
+                }
+            )
+            tbs.append(tb_)
+
+    # Concatenate
+    paths.log.info("Concatenating tables.")
+    tb = pr.concat(tbs, ignore_index=True)
+    tb = tb.rename(columns={"PopName": "country"})
+
+    #
+    # Process data.
+    #
+    paths.log.info("Processing data.")
+    # Average births over duplicate country-year-month records (e.g. when several input files cover the same period).
+    tb = tb.groupby(["country", "Year", "Month"], as_index=False)["Births"].mean()
+    tb = tb.astype(
+        {
+            "country": "string",
+            "Month": "string",
+        }
+    )
+    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
+    tb = tb.format(["country", "year", "month"], short_name="monthly")
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with the same metadata as the snapshot.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
diff --git a/etl/steps/data/meadow/homicide/2024-10-30/unodc.py b/etl/steps/data/meadow/homicide/2024-10-30/unodc.py
new file mode 100644
index 00000000000..a396fd20d60
--- /dev/null
+++ b/etl/steps/data/meadow/homicide/2024-10-30/unodc.py
@@ -0,0 +1,49 @@
+from owid.catalog import Table
+from structlog import get_logger
+
+from etl.helpers import PathFinder, create_dataset
+
+log = get_logger()
+
+# naming conventions
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    log.info("unodc.start")
+
+    # retrieve snapshot
+    snap = paths.load_snapshot("unodc.xlsx")
+
+    tb = snap.read(skiprows=2)
+
+    # clean and transform data
+    tb = clean_data(tb)
+
+    # set an appropriate index so the data can be saved in feather format
+    tb = tb.format(["country", "year", "indicator", "dimension", "category", "sex", "age", "unit_of_measurement"])
+
+    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
+
+
+def clean_data(tb: Table) -> Table:
+    tb = tb[
+        (tb["Dimension"].isin(["Total", "by mechanisms", "by relationship to perpetrator", "by situational context"]))
+        & (
+            tb["Indicator"].isin(
+                ["Victims of intentional homicide", "Victims of Intentional Homicide - Regional Estimate"]
+            )
+        )
+    ]
+    tb = tb.rename(
+        columns={
+            "Country": "country",
+            "Year": "year",
+        },
+        errors="raise",
+    )
+    tb = tb.drop(columns=["Iso3_code", "Region", "Subregion"])
+    return tb
diff --git a/etl/steps/data/meadow/idmc/2024-08-02/internal_displacement.py b/etl/steps/data/meadow/idmc/2024-08-02/internal_displacement.py
index 23a527b30ae..2d28a60dbcc 100644
--- a/etl/steps/data/meadow/idmc/2024-08-02/internal_displacement.py
+++ b/etl/steps/data/meadow/idmc/2024-08-02/internal_displacement.py
@@ -33,7 +33,7 @@ def run(dest_dir: str) -> None:
     ds_pop = paths.load_dataset("population")
 
     # Load data from snapshot.
-    tb = snap.read()
+    tb = snap.read(safe_types=False)
 
     # rename and drop columns
     tb = tb.drop(columns=COLUMNS_TO_DROP, errors="raise")
diff --git a/etl/steps/data/meadow/iea/2024-11-20/fossil_fuel_subsidies.py b/etl/steps/data/meadow/iea/2024-11-20/fossil_fuel_subsidies.py
new file mode 100644
index 00000000000..85eb9491ead
--- /dev/null
+++ b/etl/steps/data/meadow/iea/2024-11-20/fossil_fuel_subsidies.py
@@ -0,0 +1,128 @@
+"""Load a snapshot and create a meadow dataset."""
+
+import owid.catalog.processing as pr
+from owid.catalog import Table
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+# Assumed USD year.
+DOLLAR_YEAR = 2023
+
+
+def prepare_subsidies_by_country_table(tb_subsidies: Table) -> Table:
+    # The sheet is split into two subtables: above, global data; below, country data. They are separated by an empty row.
+    columns_total = tb_subsidies.loc[0].tolist()
+    table_country_start_index = tb_subsidies[tb_subsidies[tb_subsidies.columns[0]] == "Country"].index[0]
+    columns_countries = tb_subsidies.loc[table_country_start_index].tolist()
+    error = "Subsidies by country sheet has changed."
+    assert tb_subsidies.columns[0] == f"Unit: Real {DOLLAR_YEAR} million USD", error
+    # Check that tables are aligned.
+    assert columns_total[2:] == columns_countries[2:], error
+    # Rename columns.
+    columns = columns_countries[0:2] + [str(int(year)) for year in columns_countries[2:]]
+    # Extract global subtable and assign country "World".
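+    # (Schematically: row 0 holds the global subtable's header, rows 1..N-1 the global rows
+    # by product, then an empty separator row, a "Country" header row at N, and the country
+    # rows below it, where N is table_country_start_index.)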
+    tb_global = tb_subsidies.loc[1 : table_country_start_index - 1].dropna(how="all").reset_index(drop=True)
+    tb_global = tb_global.rename(
+        columns={old_column: new_column for old_column, new_column in zip(tb_global.columns, columns)}, errors="raise"
+    )
+    tb_global["Country"] = "World"
+    tb_global["Product"] = tb_global["Product"].replace({"All Products": "Total", "Natural Gas": "Gas"})
+    # Extract countries subtable.
+    tb_countries = tb_subsidies.loc[table_country_start_index + 1 :].reset_index(drop=True)
+    tb_countries = tb_countries.rename(
+        columns={old_column: new_column for old_column, new_column in zip(tb_countries.columns, columns)},
+        errors="raise",
+    )
+    # Combine both tables.
+    tb = pr.concat([tb_global, tb_countries], ignore_index=True)
+    # Transpose table.
+    tb = tb.melt(id_vars=["Country", "Product"], var_name="Year", value_name="subsidy")
+
+    # Improve format.
+    tb = tb.format(["country", "year", "product"])
+
+    return tb
+
+
+def prepare_indicators_by_country_table(tb_indicators: Table) -> Table:
+    # The year of the data is given in the very first cell. The actual table starts a few rows below.
+    error = "Indicators by country sheet has changed."
+    assert tb_indicators.columns[0] == f"Indicators for year {DOLLAR_YEAR}", error
+    columns = {
+        "Country": "country",
+        "Average subsidisation rate (%)": "subsidization_rate",
+        "Subsidy per capita ($/person)": "subsidy_per_capita",
+        "Total subsidy as share of GDP (%)": "subsidy_as_share_of_gdp",
+    }
+    assert tb_indicators.loc[2].tolist() == list(columns), error
+    tb_indicators = tb_indicators.loc[3:].reset_index(drop=True)
+    tb_indicators = tb_indicators.rename(
+        columns={old_column: new_column for old_column, new_column in zip(tb_indicators.columns, columns.values())},
+        errors="raise",
+    )
+    # Add a year column.
+    tb_indicators = tb_indicators.assign(**{"year": DOLLAR_YEAR})
+    # Improve format.
+    tb_indicators = tb_indicators.format(short_name="fossil_fuel_subsidies_indicators")
+
+    return tb_indicators
+
+
+def prepare_transport_oil_table(tb_transport: Table) -> Table:
+    columns = ["country"] + [str(int(year)) for year in tb_transport.loc[0][1:].tolist()]
+    error = "Transport Oil Subsidies sheet has changed."
+    assert tb_transport.columns[0] == f"Unit: Real {DOLLAR_YEAR} million USD", error
+    assert all(column.isdigit() for column in columns[1:]), error
+    tb_transport = tb_transport.loc[1:].reset_index(drop=True)
+    tb_transport = tb_transport.rename(
+        columns={old_column: new_column for old_column, new_column in zip(tb_transport.columns, columns)},
+        errors="raise",
+    )
+    # Transpose table.
+    tb_transport = tb_transport.melt(id_vars=["country"], var_name="year", value_name="transport_oil_subsidy")
+    # Improve format.
+    tb_transport = tb_transport.format(short_name="fossil_fuel_subsidies_transport_oil")
+
+    return tb_transport
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot.
+    snap = paths.load_snapshot("fossil_fuel_subsidies.xlsx")
+
+    # Load data from all relevant sheets in the snapshot file.
+    tb_subsidies = snap.read(sheet_name="Subsidies by country", skiprows=3)
+    tb_indicators = snap.read(sheet_name="Indicators by country")
+    tb_transport = snap.read(sheet_name="Transport Oil Subsidies", skiprows=3)
+
+    #
+    # Process data.
+    #
+    # Prepare "Subsidies by country" table.
+    tb_subsidies = prepare_subsidies_by_country_table(tb_subsidies=tb_subsidies)
+
+    # Prepare "Indicators by country" table.
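+    # (This sheet covers a single year, DOLLAR_YEAR, so the helper assigns the year column
+    # explicitly instead of melting year columns as the other two helpers do.)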
+ tb_indicators = prepare_indicators_by_country_table(tb_indicators=tb_indicators) + + # Prepare "Transport Oil Subsidies" table. + tb_transport = prepare_transport_oil_table(tb_transport=tb_transport) + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset( + dest_dir, + tables=[tb_subsidies, tb_indicators, tb_transport], + check_variables_metadata=True, + default_metadata=snap.metadata, + ) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/igh/2024-07-05/better_data_homelessness.py b/etl/steps/data/meadow/igh/2024-07-05/better_data_homelessness.py index 1a84422eb49..6d5ff6be715 100644 --- a/etl/steps/data/meadow/igh/2024-07-05/better_data_homelessness.py +++ b/etl/steps/data/meadow/igh/2024-07-05/better_data_homelessness.py @@ -46,7 +46,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("better_data_homelessness.xlsx") # Load data from snapshot. - tb = snap.read(sheet_name="BDP 2024") + tb = snap.read(safe_types=False, sheet_name="BDP 2024") # # Process data. diff --git a/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_cause.py b/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_cause.py index 011e8780a49..13a5e217db9 100644 --- a/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_cause.py +++ b/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_cause.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gbd_cause.feather") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # standardize column names tb = clean_data(tb) # fix percent values - they aren't consistently presented as either 0-1, or 0-100. diff --git a/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_child_mortality.py b/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_child_mortality.py index fb4d867dd63..d15a7c3063e 100644 --- a/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_child_mortality.py +++ b/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_child_mortality.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gbd_child_mortality.feather") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # standardize column names tb = clean_data(tb) # fix percent values - they aren't consistently presented as either 0-1, or 0-100. diff --git a/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_drug_risk.py b/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_drug_risk.py index a930ccfcce4..bdf02be19a8 100644 --- a/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_drug_risk.py +++ b/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_drug_risk.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gbd_drug_risk.feather") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # standardize column names tb = clean_data(tb) # fix percent values - they aren't consistently presented as either 0-1, or 0-100. diff --git a/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_mental_health.py b/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_mental_health.py index 97666ebab4f..ecc9aff54ad 100644 --- a/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_mental_health.py +++ b/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_mental_health.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gbd_mental_health.feather") # Load data from snapshot. 
- tb = snap.read() + tb = snap.read(safe_types=False) # standardize column names tb = clean_data(tb) diff --git a/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_mental_health_burden.py b/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_mental_health_burden.py index c88075bdd4b..b656c43e60e 100644 --- a/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_mental_health_burden.py +++ b/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_mental_health_burden.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gbd_mental_health_burden.feather") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # standardize column names tb = clean_data(tb) diff --git a/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_prevalence.py b/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_prevalence.py index abb1d2404f5..15e87f7f9f2 100644 --- a/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_prevalence.py +++ b/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_prevalence.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gbd_prevalence.feather") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # standardize column names tb = clean_data(tb) # diff --git a/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_risk.py b/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_risk.py index 7753ebe9b78..13a99c10abf 100644 --- a/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_risk.py +++ b/etl/steps/data/meadow/ihme_gbd/2024-05-20/gbd_risk.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gbd_risk.feather") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # standardize column names tb = clean_data(tb) # fix percent values - they aren't consistently presented as either 0-1, or 0-100. diff --git a/etl/steps/data/meadow/ihme_gbd/2024-05-20/impairments.py b/etl/steps/data/meadow/ihme_gbd/2024-05-20/impairments.py index 7f8a41c3fee..0f49990eda2 100644 --- a/etl/steps/data/meadow/ihme_gbd/2024-05-20/impairments.py +++ b/etl/steps/data/meadow/ihme_gbd/2024-05-20/impairments.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("impairments.feather") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/ihme_gbd/2024-08-26/gbd_risk_cancer.py b/etl/steps/data/meadow/ihme_gbd/2024-08-26/gbd_risk_cancer.py index 37f735c9e82..3737b3ce95e 100644 --- a/etl/steps/data/meadow/ihme_gbd/2024-08-26/gbd_risk_cancer.py +++ b/etl/steps/data/meadow/ihme_gbd/2024-08-26/gbd_risk_cancer.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gbd_risk_cancer.feather") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # standardize column names tb = clean_data(tb) # fix percent values - they aren't consistently presented as either 0-1, or 0-100. diff --git a/etl/steps/data/meadow/imf/2024-06-12/public_finances_modern_history.py b/etl/steps/data/meadow/imf/2024-06-12/public_finances_modern_history.py index 719f7b4183a..6e2f3dcfccd 100644 --- a/etl/steps/data/meadow/imf/2024-06-12/public_finances_modern_history.py +++ b/etl/steps/data/meadow/imf/2024-06-12/public_finances_modern_history.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("public_finances_modern_history.xlsx") # Load data from snapshot. 
-    tb = snap.read(sheet_name="data")
+    tb = snap.read(safe_types=False, sheet_name="data")
 
     # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
     tb = tb.format(["country", "year"])
diff --git a/etl/steps/data/meadow/imf/2024-11-25/world_economic_outlook.py b/etl/steps/data/meadow/imf/2024-11-25/world_economic_outlook.py
new file mode 100644
index 00000000000..b534462322e
--- /dev/null
+++ b/etl/steps/data/meadow/imf/2024-11-25/world_economic_outlook.py
@@ -0,0 +1,124 @@
+"""Load a snapshot and create a meadow dataset."""
+
+import pandas as pd
+from owid.catalog import Table
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+VARIABLE_LIST = [
+    "NGDP_RPCH",  # Gross domestic product, constant prices / Percent change
+    "LUR",  # Unemployment rate / Percent of total labor force
+]
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot.
+    snap = paths.load_snapshot("world_economic_outlook.xls")
+
+    # Load data from snapshot. Despite the .xls extension, the file is tab-separated text encoded as UTF-16.
+    tb = snap.read_csv(delimiter="\t", encoding="utf-16-le")
+
+    #
+    # Process data.
+    #
+    tb = select_data(tb)
+    tb = make_variable_names(tb)
+    tb = pick_variables(tb)
+    tb = reshape_and_clean(tb)
+
+    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
+    tb = tb.format(["country", "year"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with the same metadata as the snapshot.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
+
+
+def select_data(tb: Table) -> Table:
+    """
+    Selects the data we want to import from the raw table
+    """
+
+    tb = tb.drop(
+        columns=[
+            "WEO Country Code",
+            "ISO",
+            "Country/Series-specific Notes",
+            "Subject Notes",
+            "Scale",
+        ]
+    ).dropna(subset=["Country"])
+
+    return tb
+
+
+def make_variable_names(tb: Table) -> Table:
+    """
+    Creates a variable name from the Subject Descriptor and Units columns.
+    """
+
+    tb["variable"] = tb["Subject Descriptor"] + " - " + tb["Units"]
+    tb = tb.drop(columns=["Subject Descriptor", "Units"])
+
+    return tb
+
+
+def pick_variables(tb: Table) -> Table:
+    """
+    Selects the variables we want to import from the raw table.
+    """
+
+    # Select only the variables we want to import.
+    tb = tb[tb["WEO Subject Code"].isin(VARIABLE_LIST)].reset_index(drop=True)
+
+    # Drop WEO Subject Code
+    tb = tb.drop(columns="WEO Subject Code")
+
+    return tb
+
+
+def reshape_and_clean(tb: Table) -> Table:
+    """
+    Reshapes the table from wide to long format and cleans the data.
+    """
+
+    # Drop any column with "Unnamed" in the name.
+    tb = tb.drop(columns=tb.columns[tb.columns.str.contains("Unnamed")])
+
+    tb = tb.melt(id_vars=["Country", "variable", "Estimates Start After"], var_name="year")
+
+    # Coerce values to numeric.
+    tb["value"] = tb["value"].replace("--", pd.NA).astype("Float64")
+    tb["year"] = tb["year"].astype("Int64")
+
+    # Split between observations and forecasts: values after the "Estimates Start After" year
+    # are IMF projections (e.g. if it is 2023, the 2024 value of a series is tagged "_forecast").
+    tb.loc[tb.year > tb["Estimates Start After"], "variable"] += "_forecast"
+    tb.loc[tb.year <= tb["Estimates Start After"], "variable"] += "_observation"
+
+    # Drop rows with missing values.
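+    # (These NAs are the "--" placeholders coerced above plus genuinely empty cells; dropping
+    # them keeps only informed country-years.)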
+ tb = tb.dropna(subset=["value"]) + + # Drop Estimates Start After + tb = tb.drop(columns="Estimates Start After") + + tb = tb.pivot( + index=["Country", "year"], + columns="variable", + values="value", + join_column_levels_with="_", + ) + + return tb diff --git a/etl/steps/data/meadow/insee/2024-03-21/inequality_france_1999.py b/etl/steps/data/meadow/insee/2024-03-21/inequality_france_1999.py index af1241b8517..69f950cbc5a 100644 --- a/etl/steps/data/meadow/insee/2024-03-21/inequality_france_1999.py +++ b/etl/steps/data/meadow/insee/2024-03-21/inequality_france_1999.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("inequality_france_1999.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/insee/2024-04-05/inequality_france.py b/etl/steps/data/meadow/insee/2024-04-05/inequality_france.py index bbef1e787ab..0f71472eeed 100644 --- a/etl/steps/data/meadow/insee/2024-04-05/inequality_france.py +++ b/etl/steps/data/meadow/insee/2024-04-05/inequality_france.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("inequality_france.xlsx") # Load data from snapshot. - tb = snap.read(header=2) + tb = snap.read(safe_types=False, header=2) # # Process data. diff --git a/etl/steps/data/meadow/insee/2024-04-25/insee_premiere_1875.py b/etl/steps/data/meadow/insee/2024-04-25/insee_premiere_1875.py index 9cf292a8b75..47b8ca7d513 100644 --- a/etl/steps/data/meadow/insee/2024-04-25/insee_premiere_1875.py +++ b/etl/steps/data/meadow/insee/2024-04-25/insee_premiere_1875.py @@ -37,8 +37,8 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("insee_premiere_1875.xlsx") # Load data from snapshot. - tb_inequality = snap.read(sheet_name="Figure 2", skiprows=2) - tb_poverty = snap.read(sheet_name="Figure 3", skiprows=2) + tb_inequality = snap.read(safe_types=False, sheet_name="Figure 2", skiprows=2) + tb_poverty = snap.read(safe_types=False, sheet_name="Figure 3", skiprows=2) # Process data. tb_inequality = process_inequality_data(tb=tb_inequality, columns=COLUMNS_INEQUALITY, short_name="inequality") diff --git a/etl/steps/data/meadow/insee/2024-04-26/relative_poverty_france.py b/etl/steps/data/meadow/insee/2024-04-26/relative_poverty_france.py index 1803b72cfe3..7ae62c96f49 100644 --- a/etl/steps/data/meadow/insee/2024-04-26/relative_poverty_france.py +++ b/etl/steps/data/meadow/insee/2024-04-26/relative_poverty_france.py @@ -24,7 +24,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("relative_poverty_france.xlsx") # Load data from snapshot. - tb = snap.read(sheet_name="Données", skiprows=3) + tb = snap.read(safe_types=False, sheet_name="Données", skiprows=3) # # Process data. diff --git a/etl/steps/data/meadow/irena/2023-12-12/renewable_energy_patents.py b/etl/steps/data/meadow/irena/2023-12-12/renewable_energy_patents.py index 782d1e58f67..44ec7c4668c 100644 --- a/etl/steps/data/meadow/irena/2023-12-12/renewable_energy_patents.py +++ b/etl/steps/data/meadow/irena/2023-12-12/renewable_energy_patents.py @@ -23,7 +23,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot. snap = paths.load_snapshot("renewable_energy_patents.xlsx") - tb = snap.read(sheet_name="INSPIRE_data") + tb = snap.read(safe_types=False, sheet_name="INSPIRE_data") # # Process data. 
diff --git a/etl/steps/data/meadow/irena/2024-10-29/renewable_power_generation_costs.py b/etl/steps/data/meadow/irena/2024-10-29/renewable_power_generation_costs.py index 25bff138259..ce9b9692312 100644 --- a/etl/steps/data/meadow/irena/2024-10-29/renewable_power_generation_costs.py +++ b/etl/steps/data/meadow/irena/2024-10-29/renewable_power_generation_costs.py @@ -25,8 +25,20 @@ EXPECTED_LCOE_UNIT = f"{EXPECTED_DOLLAR_YEAR} USD/kWh" # Expected unit to be found in the solar PV module prices sheet. EXPECTED_SOLAR_PV_MODULE_COST_UNIT = f"{EXPECTED_DOLLAR_YEAR} USD/W" -# Photovoltaic technology to choose for average monthly PV module costs. -PV_TECHNOLOGY = "Thin film a-Si/u-Si or Global Price Index (from Q4 2013)" +# Photovoltaic technologies to consider for average monthly PV module costs. +PV_TECHNOLOGIES = [ + # "Crystalline Europe (Germany)", + # "Crystalline China", + # "Crystalline Japan", + # "Thin film a-Si", + # "Thin film CdS/CdTe", + "Thin film a-Si/u-Si or Global Price Index (from Q4 2013)", + # "Bifacial", + # "High Efficiency", + # "All black", + # "Mainstream", + # "Low Cost", +] def prepare_solar_pv_module_prices(data: pr.ExcelFile) -> Table: @@ -64,22 +76,23 @@ def prepare_solar_pv_module_prices(data: pr.ExcelFile) -> Table: # Select PV technologies. error = "Names of solar PV module technologies have changed." - assert PV_TECHNOLOGY in set(pv_prices["technology"]), error - pv_prices = pv_prices[pv_prices["technology"] == PV_TECHNOLOGY].reset_index(drop=True) + assert set(PV_TECHNOLOGIES) <= set(pv_prices["technology"]), error + pv_prices = pv_prices[pv_prices["technology"].isin(PV_TECHNOLOGIES)].reset_index(drop=True) - # Get year from dates. + # Get month and year from dates. pv_prices["year"] = pd.to_datetime(pv_prices["month"], format="%b %y").dt.year + pv_prices["n_month"] = pd.to_datetime(pv_prices["month"], format="%b %y").dt.month # For each year get the average cost over all months. pv_prices = ( - pv_prices.groupby(["technology", "year"]) - .agg({"cost": "mean", "year": "count"}) - .rename(columns={"year": "n_months"}) + pv_prices.groupby(["year"]) + .agg({"cost": "mean", "n_month": "nunique"}) + .rename(columns={"n_month": "n_months"}) .reset_index() ) - # Remove unnecessary column and add column for region. - pv_prices = pv_prices.drop(columns="technology", errors="raise").assign(**{"country": "World"}) + # Add column for region. + pv_prices = pv_prices.assign(**{"country": "World"}) # Sanity check. error = "Incomplete years (with less than 12 months of data) were expected to be either the first or the last." @@ -97,8 +110,9 @@ def prepare_solar_pv_module_prices(data: pr.ExcelFile) -> Table: pv_prices[ "cost" ].metadata.description_short = "This data is expressed in US dollars per watt, adjusted for inflation." + pv_technologies = ", ".join([f"'{tech}'" for tech in PV_TECHNOLOGIES]) pv_prices["cost"].metadata.description_key = [ - f"IRENA presents solar photovoltaic module prices for a number of different technologies. Here we use the figures for '{PV_TECHNOLOGY}'." + f"IRENA presents solar photovoltaic module prices for a number of different technologies. Here we use the average yearly price for technologies {pv_technologies}." 
] return pv_prices diff --git a/etl/steps/data/meadow/irena/2024-11-01/renewable_capacity_statistics.py b/etl/steps/data/meadow/irena/2024-11-01/renewable_capacity_statistics.py new file mode 100644 index 00000000000..da7bcb70ce9 --- /dev/null +++ b/etl/steps/data/meadow/irena/2024-11-01/renewable_capacity_statistics.py @@ -0,0 +1,104 @@ +"""Load a snapshot and create a meadow dataset.""" + +from typing import Dict + +import owid.catalog.processing as pr +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) +# Expected sheet names. +EXPECTED_SHEET_NAMES = ["Country", "All Data", "Regional", "Global", "About"] +# Expected columns in the All Data sheet. +EXPECTED_COLUMNS = [ + "Region", + "Sub-region", + "Country", + "ISO3 code", + "M49 code", + "RE or Non-RE", + "Group Technology", + "Technology", + "Sub-Technology", + "Producer Type", + "Year", + "Electricity Generation (GWh)", + "Electricity Installed Capacity (MW)", + "Heat Generation (TJ)", + "Off-grid Biogas for Cooking (1,000 inhabitants)", + "Off-grid Biogas Production (1,000 m3)", + "Off-grid Electricity Access (1,000 inhabitants)", + "Public Flows (2021 USD M)", + "SDG 7a1 Intl. Public Flows (2021 USD M)", + "SDG 7b1 RE capacity per capita (W/inhabitant)", +] + + +def sanity_check_inputs(tables: Dict[str, Table]) -> None: + # Sanity checks. + error = "Sheet names have changed." + assert set(tables) == set(EXPECTED_SHEET_NAMES), error + + error = "Columns have changed in the 'All Data' sheet." + assert set(EXPECTED_COLUMNS) == set(tables["All Data"].columns), error + + # Ensure data from "Country" and "All Data" sheets agree. + check = tables["All Data"].merge( + tables["Country"].rename(columns={"Electricity Installed Capacity (MW)": "check"})[ + ["Country", "Year", "Sub-Technology", "Producer Type", "check"] + ], + how="inner", + on=["Country", "Year", "Sub-Technology", "Producer Type"], + ) + check = ( + check[["Country", "Year", "Sub-Technology", "Electricity Installed Capacity (MW)", "check"]] + .dropna() + .reset_index(drop=True) + ) + error = "Unexpected mismatch between data from 'Country' and 'All Data' sheets." + assert check[check["Electricity Installed Capacity (MW)"] != check["check"]].empty, error + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load snapshot and read data from all its sheets. + snap = paths.load_snapshot("renewable_capacity_statistics.xlsx") + data = snap.ExcelFile() + tables = {sheet: data.parse(sheet) for sheet in data.sheet_names} + + # Sanity checks. + sanity_check_inputs(tables) + + # Combine global, regional, and country-level data. + tb_global = ( + tables["Global"] + .assign(**{"Country": "World"}) + .rename( + columns={"Sum of Electricity Installed Capacity (MW)": "Electricity Installed Capacity (MW)"}, + errors="raise", + ) + ) + tb_regional = ( + tables["Regional"] + .rename(columns={"Region": "Country"}, errors="raise") + .rename( + columns={"Sum of Electricity Installed Capacity (MW)": "Electricity Installed Capacity (MW)"}, + errors="raise", + ) + ) + tb_all_data = tables["All Data"].drop(columns=["Region", "Sub-region", "ISO3 code", "M49 code"], errors="raise") + tb = pr.concat([tb_all_data, tb_global, tb_regional], ignore_index=True) + + # Format table. + tb = tb.format(keys=["country", "year", "group_technology", "technology", "sub_technology", "producer_type"]) + + # + # Save outputs. + # + # Create a new meadow dataset. 
+    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
+    ds_meadow.save()
diff --git a/etl/steps/data/meadow/irena/2024-11-15/renewable_power_generation_costs.py b/etl/steps/data/meadow/irena/2024-11-15/renewable_power_generation_costs.py
new file mode 100644
index 00000000000..ce9b9692312
--- /dev/null
+++ b/etl/steps/data/meadow/irena/2024-11-15/renewable_power_generation_costs.py
@@ -0,0 +1,377 @@
+"""Extract global (as well as at the country level for some countries) weighted-average levelized cost of electricity
+(LCOE) for all energy sources from IRENA's Renewable Power Generation Costs 2023 dataset.
+
+Extract solar photovoltaic module prices too.
+
+NOTE: The original data is poorly formatted. Each energy source is given as a separate sheet, with a different
+structure. So it's likely that, on the next update, this script will not work.
+
+"""
+
+import owid.catalog.processing as pr
+import pandas as pd
+from owid.catalog import Table
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+# Expected USD year.
+# NOTE: We could get this from the version, but, if later on we create a minor upgrade with a different year, this will fail.
+# So, instead, hardcode the year and change it on next update.
+EXPECTED_DOLLAR_YEAR = 2023
+# Expected unit to be found in each of the LCOE sheets.
+EXPECTED_LCOE_UNIT = f"{EXPECTED_DOLLAR_YEAR} USD/kWh"
+# Expected unit to be found in the solar PV module prices sheet.
+EXPECTED_SOLAR_PV_MODULE_COST_UNIT = f"{EXPECTED_DOLLAR_YEAR} USD/W"
+# Photovoltaic technologies to consider for average monthly PV module costs.
+PV_TECHNOLOGIES = [
+    # "Crystalline Europe (Germany)",
+    # "Crystalline China",
+    # "Crystalline Japan",
+    # "Thin film a-Si",
+    # "Thin film CdS/CdTe",
+    "Thin film a-Si/u-Si or Global Price Index (from Q4 2013)",
+    # "Bifacial",
+    # "High Efficiency",
+    # "All black",
+    # "Mainstream",
+    # "Low Cost",
+]
+
+
+def prepare_solar_pv_module_prices(data: pr.ExcelFile) -> Table:
+    """Prepare yearly data on average solar photovoltaic module prices.
+
+    Monthly data will be averaged, and only complete years (with 12 informed months) will be considered.
+
+    Parameters
+    ----------
+    data : pr.ExcelFile
+        Raw data.
+
+    Returns
+    -------
+    pv_prices : Table
+        PV prices.
+
+    """
+    # NOTE: The currency is not explicitly given in sheet 3.2. But it is in sheet B3.1a (we assume it's the same).
+    error = "Cost unit for solar PV module prices has changed."
+    assert (
+        data.parse(sheet_name="Fig B3.1a", skiprows=6).dropna(axis=1).columns[0] == EXPECTED_SOLAR_PV_MODULE_COST_UNIT
+    ), error
+
+    # Load upper table in sheet from Figure 3.2, which is:
+    # Average monthly solar PV module prices by technology and manufacturing country sold in Europe, 2010 to 2021.
+    pv_prices = data.parse(sheet_name="Fig 3.2", skiprows=7).dropna(axis=1, how="all")
+    error = "The file format for solar PV module prices has changed."
+    assert pv_prices.columns[0] == "Technology", error
+
+    # Transpose table so that each row corresponds to a month.
+    pv_prices = pv_prices.rename(columns={"Technology": "technology"}, errors="raise").melt(
+        id_vars="technology", var_name="month", value_name="cost"
+    )
+
+    # Select PV technologies.
+    error = "Names of solar PV module technologies have changed."
+    assert set(PV_TECHNOLOGIES) <= set(pv_prices["technology"]), error
+    pv_prices = pv_prices[pv_prices["technology"].isin(PV_TECHNOLOGIES)].reset_index(drop=True)
+
+    # Get month and year from dates.
+    pv_prices["year"] = pd.to_datetime(pv_prices["month"], format="%b %y").dt.year
+    pv_prices["n_month"] = pd.to_datetime(pv_prices["month"], format="%b %y").dt.month
+
+    # For each year get the average cost over all months.
+    pv_prices = (
+        pv_prices.groupby(["year"])
+        .agg({"cost": "mean", "n_month": "nunique"})
+        .rename(columns={"n_month": "n_months"})
+        .reset_index()
+    )
+
+    # Add column for region.
+    pv_prices = pv_prices.assign(**{"country": "World"})
+
+    # Sanity check.
+    error = "Incomplete years (with less than 12 months of data) were expected to be either the first or the last."
+    assert pv_prices[pv_prices["n_months"] != 12].index.isin([0, len(pv_prices) - 1]).all(), error
+
+    # Ignore years for which we don't have 12 months.
+    pv_prices = pv_prices[pv_prices["n_months"] == 12].drop(columns=["n_months"], errors="raise").reset_index(drop=True)
+
+    # Improve table formatting.
+    pv_prices = pv_prices.format(sort_columns=True, short_name="solar_photovoltaic_module_prices")
+
+    # Add units.
+    pv_prices["cost"].metadata.unit = f"constant {EXPECTED_DOLLAR_YEAR} US$ per watt"
+    pv_prices["cost"].metadata.short_unit = "$/W"
+    pv_prices[
+        "cost"
+    ].metadata.description_short = "This data is expressed in US dollars per watt, adjusted for inflation."
+    pv_technologies = ", ".join([f"'{tech}'" for tech in PV_TECHNOLOGIES])
+    pv_prices["cost"].metadata.description_key = [
+        f"IRENA presents solar photovoltaic module prices for a number of different technologies. Here we use the average yearly price for technologies {pv_technologies}."
+    ]
+
+    return pv_prices
+
+
+def extract_global_cost_for_all_sources_from_excel_file(data: pr.ExcelFile) -> Table:
+    """Extract global weighted-average LCOE of all energy sources from the excel file.
+
+    Each energy source is given in a separate sheet, in a different way, so each needs a different treatment.
+
+    Parameters
+    ----------
+    data : pr.ExcelFile
+        Raw data.
+
+    Returns
+    -------
+    tb : Table
+        LCOE for different energy sources.
+    """
+    # Extract weighted average LCOE for different sources (each one requires a slightly different processing):
+
+    # Solar photovoltaic.
+    error = "The file format for solar PV LCOE has changed."
+    assert data.parse("Fig 3.1", skiprows=21).columns[1] == f"LCOE ({EXPECTED_LCOE_UNIT})", error
+    solar_pv = (
+        data.parse("Fig 3.1", skiprows=22)
+        .dropna(how="all", axis=1)
+        .rename(columns={"Unnamed: 1": "temp"}, errors="raise")
+    )
+    solar_pv = solar_pv[solar_pv["temp"] == "Weighted average"].melt(
+        id_vars="temp", var_name="year", value_name="cost"
+    )[["year", "cost"]]
+    solar_pv["technology"] = "Solar photovoltaic"
+
+    # Onshore wind.
+    error = "The file format for onshore wind LCOE has changed."
+    # NOTE: Sheet 2.1 contains LCOE only from 2010, whereas 2.11 contains LCOE from 1984.
+    assert data.parse("Fig 2.11", skiprows=2).columns[1] == f"LCOE ({EXPECTED_LCOE_UNIT})", error
+    onshore_wind = (
+        data.parse("Fig 2.11", skiprows=3)
+        .drop(columns="Unnamed: 0", errors="raise")
+        .rename(  # type: ignore
+            columns={"Year": "year", "Weighted average": "cost"}, errors="raise"
+        )
+    )
+    onshore_wind["technology"] = "Onshore wind"
+
+    # Concentrated solar power.
+    error = "The file format for CSP LCOE has changed."
+ assert data.parse("Fig 5.1", skiprows=19).columns[1] == f"LCOE ({EXPECTED_LCOE_UNIT})", error + csp = ( + data.parse("Fig 5.1", skiprows=20) + .dropna(how="all", axis=1) + .rename(columns={"Unnamed: 1": "temp"}, errors="raise") + ) + csp = csp[csp["temp"] == "Weighted average"].melt(id_vars="temp", var_name="year", value_name="cost")[ + ["year", "cost"] + ] + csp["technology"] = "Concentrated solar power" + + # Offshore wind. + error = "The file format for offshore wind LCOE has changed." + assert data.parse("Fig 4.11", skiprows=1).columns[1] == EXPECTED_LCOE_UNIT, error + offshore_wind = data.parse("Fig 4.11", skiprows=3).rename( # type: ignore + columns={"Year": "year", "Weighted average": "cost"}, errors="raise" + )[["year", "cost"]] + offshore_wind["technology"] = "Offshore wind" + + # Geothermal. + # NOTE: Sheet 8.1 contains LCOE only from 2010, whereas 8.4 contains LCOE from 2007. + error = "The file format for geothermal LCOE has changed." + assert data.parse("Fig 8.4", skiprows=3).columns[1] == f"LCOE ({EXPECTED_LCOE_UNIT})", error + geothermal = data.parse("Fig 8.4", skiprows=5).rename( + columns={"Year": "year", "Weighted average": "cost"}, errors="raise" + )[["year", "cost"]] # type: ignore + geothermal["technology"] = "Geothermal" + + # Bioenergy. + error = "The file format for bioenergy LCOE has changed." + assert data.parse("Fig 9.1", skiprows=19).columns[1] == f"LCOE ({EXPECTED_LCOE_UNIT})", error + bioenergy = ( + data.parse("Fig 9.1", skiprows=20) + .dropna(axis=1, how="all") + .rename(columns={"Unnamed: 1": "temp"}, errors="raise") # type: ignore + ) + bioenergy = bioenergy[bioenergy["temp"] == "Weighted average"].melt( + id_vars="temp", var_name="year", value_name="cost" + )[["year", "cost"]] + bioenergy["technology"] = "Bioenergy" + + # Hydropower. + error = "The file format for hydropower LCOE has changed." + assert data.parse("Fig 7.1", skiprows=19).columns[1] == f"LCOE ({EXPECTED_LCOE_UNIT})", error + hydropower = ( + data.parse("Fig 7.1", skiprows=20) + .dropna(how="all", axis=1) + .rename(columns={"Unnamed: 1": "temp"}, errors="raise") # type: ignore + ) + hydropower = hydropower[hydropower["temp"] == "Weighted average"].melt( + id_vars="temp", var_name="year", value_name="cost" + )[["year", "cost"]] + hydropower["technology"] = "Hydropower" + + # Concatenate all sources into one table. + tb = pr.concat([solar_pv, onshore_wind, csp, offshore_wind, geothermal, bioenergy, hydropower], ignore_index=True) + + # Add country column. + tb["country"] = "World" + + return tb + + +def extract_country_cost_from_excel_file(data: pr.ExcelFile) -> Table: + """Extract weighted-average LCOE of certain countries and certain energy sources from the excel file. + + Only onshore wind and solar photovoltaic seem to have this data, and only for specific countries. + + Parameters + ---------- + data : pr.ExcelFile + Raw data. + + Returns + ------- + tb : Table + LCOE for different energy sources. + """ + # Extract LCOE for specific countries and technologies (those that are available in original data). + + # Solar photovoltaic. + # NOTE: For some reason, sheet 3.11 contains LCOE from 2010 to 2023 for 15 countries, and 3.12 contains LCOE from 2018 to 2023 for 19 countries. + # So, let's take both, check that they are consistent, and concatenate them. + solar_pv = data.parse("Fig. 
3.11", skiprows=5).dropna(how="all", axis=1) + solar_pv = solar_pv.rename(columns={solar_pv.columns[0]: "country"}, errors="raise").melt( + id_vars="country", var_name="year", value_name="cost" + ) + # Keep only rows of LCOE, and drop year changes and empty rows. + solar_pv = solar_pv[~solar_pv["year"].astype(str).str.startswith("%")].dropna().reset_index(drop=True) + + # Load additional data. + solar_pv_extra = data.parse("Fig. 3.12", skiprows=8) + # Drop empty columns and unnecessary regions column. + solar_pv_extra = solar_pv_extra.drop( + columns=[column for column in solar_pv_extra.columns if "Unnamed" in str(column)], errors="raise" + ).drop(columns="Region", errors="raise") + solar_pv_extra = solar_pv_extra.rename(columns={"Country": "country"}, errors="raise").melt( + id_vars="country", var_name="year", value_name="cost" + ) + + # Check that, where both tables overlap, they are consistent. + error = "Expected coincident country-years to have the same LCOE in sheets 3.11 and 3.12." + check = solar_pv.merge(solar_pv_extra, on=["country", "year"], how="inner") + # NOTE: Consider relaxing this to coincide within a certain tolerance, if this fails. + assert (check["cost_x"] == check["cost_y"]).all(), error + # Concatenate both tables and drop duplicates and empty rows. + solar_pv = ( + pr.concat([solar_pv, solar_pv_extra], ignore_index=True) + .drop_duplicates(subset=["country", "year"]) + .dropna() + .reset_index(drop=True) + ) + + # Onshore wind. + # NOTE: There is country-level LCOE data in sheets 2.12 and 2.13 (for smaller markets). + # Fetch both and concatenate them. + error = "The file format for onshore wind LCOE has changed." + assert data.parse("Fig 2.12", skiprows=3).columns[1] == f"LCOE ({EXPECTED_LCOE_UNIT})", error + # NOTE: Column "Country" appears twice, so drop one of them. + onshore_wind = ( + data.parse("Fig 2.12", skiprows=6) + .dropna(how="all", axis=1) + .drop(columns=["Country.1"]) + .rename(columns={"Country": "country"}, errors="raise") + .melt(id_vars="country", var_name="year", value_name="cost") + ) + # Keep only rows of LCOE, and drop year changes and empty rows. + onshore_wind = onshore_wind[~onshore_wind["year"].astype(str).str.startswith("%")].dropna().reset_index(drop=True) + + error = "The file format for country-level onshore wind LCOE for smaller markets has changed." + assert data.parse("Fig 2.13", skiprows=3).columns[1] == f"LCOE ({EXPECTED_LCOE_UNIT})", error + onshore_wind_extra = ( + data.parse("Fig 2.13", skiprows=6) + .dropna(how="all", axis=1) + .rename(columns={"Country": "country"}, errors="raise") + .melt(id_vars="country", var_name="year", value_name="cost") + .dropna() + .reset_index(drop=True) + ) + + # Check that there is no overlap between countries in the two tables. + # NOTE: If there is, then change this to check that the values on coincident country-years are consistent. + error = "Expected no overlap between countries in sheets 2.12 and 2.13." + assert set(onshore_wind["country"]).isdisjoint(set(onshore_wind_extra["country"])), error + + # Combine onshore wind data. + onshore_wind_combined = pr.concat([onshore_wind, onshore_wind_extra], ignore_index=True) + + # Add a technology column and concatenate different technologies. 
+ solar_pv["technology"] = "Solar photovoltaic" + onshore_wind_combined["technology"] = "Onshore wind" + combined = pr.concat([solar_pv, onshore_wind_combined], ignore_index=True) + + return combined + + +def combine_global_and_national_data(tb_costs_global: Table, tb_costs_national: Table) -> Table: + # Combine global and national data. + tb_combined = pr.concat([tb_costs_global, tb_costs_national], ignore_index=True).astype({"year": int}) + + # Convert from long to wide format. + tb_combined = tb_combined.pivot( + index=["country", "year"], columns="technology", values="cost", join_column_levels_with="_" + ) + + # Improve table format. + tb_combined = tb_combined.format(sort_columns=True) + + # Add units. + for column in tb_combined.columns: + tb_combined[column].metadata.unit = f"constant {EXPECTED_DOLLAR_YEAR} US$ per kilowatt-hour" + tb_combined[column].metadata.short_unit = "$/kWh" + tb_combined[ + column + ].metadata.description_short = "This data is expressed in US dollars per kilowatt-hour. It is adjusted for inflation but does not account for differences in living costs between countries." + + return tb_combined + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("renewable_power_generation_costs.xlsx") + data = snap.ExcelFile() + + # Extract global, weighted-average LCOE cost for all energy sources. + tb_costs_global = extract_global_cost_for_all_sources_from_excel_file(data=data) + + # Extract national LCOE for specific countries and technologies. + tb_costs_national = extract_country_cost_from_excel_file(data=data) + + # Combine global and national data. + # NOTE: For convenience, we will also add units and a short description here (instead of in the garden step). + tb_combined = combine_global_and_national_data(tb_costs_global=tb_costs_global, tb_costs_national=tb_costs_national) + + # Extract global data on solar photovoltaic module prices. + # NOTE: For convenience, we will also add units and a short description here (instead of in the garden step). + tb_solar_pv_prices = prepare_solar_pv_module_prices(data=data) + + # + # Save outputs. + # + # Create a new Meadow dataset. + ds = create_dataset( + dest_dir=dest_dir, + tables=[tb_combined, tb_solar_pv_prices], + default_metadata=snap.metadata, + check_variables_metadata=True, + ) + ds.save() diff --git a/etl/steps/data/meadow/ivs/2023-11-27/integrated_values_survey.py b/etl/steps/data/meadow/ivs/2023-11-27/integrated_values_survey.py index ef17ba76de2..8e9e1326687 100644 --- a/etl/steps/data/meadow/ivs/2023-11-27/integrated_values_survey.py +++ b/etl/steps/data/meadow/ivs/2023-11-27/integrated_values_survey.py @@ -149,7 +149,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("integrated_values_survey.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/lgbt_rights/2023-04-27/lgbti_policy_index.py b/etl/steps/data/meadow/lgbt_rights/2023-04-27/lgbti_policy_index.py index 0555256b621..944e26b46b3 100644 --- a/etl/steps/data/meadow/lgbt_rights/2023-04-27/lgbti_policy_index.py +++ b/etl/steps/data/meadow/lgbt_rights/2023-04-27/lgbti_policy_index.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("lgbti_policy_index.xlsx") # Load data from snapshot. - tb = snap.read(sheet_name="Sheet1") + tb = snap.read(safe_types=False, sheet_name="Sheet1") # # Process data. 
diff --git a/etl/steps/data/meadow/lgbt_rights/2024-06-03/equaldex.py b/etl/steps/data/meadow/lgbt_rights/2024-06-03/equaldex.py index 6ef0b46cea2..78097d67cf9 100644 --- a/etl/steps/data/meadow/lgbt_rights/2024-06-03/equaldex.py +++ b/etl/steps/data/meadow/lgbt_rights/2024-06-03/equaldex.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap_indices = paths.load_snapshot("equaldex_indices.csv") # Load data from snapshots. - tb = snap.read() + tb = snap.read(safe_types=False) tb_current = snap_current.read() tb_indices = snap_indices.read() diff --git a/etl/steps/data/meadow/lgbt_rights/2024-06-11/criminalization_mignot.py b/etl/steps/data/meadow/lgbt_rights/2024-06-11/criminalization_mignot.py index 508556cddcf..f91a482c534 100644 --- a/etl/steps/data/meadow/lgbt_rights/2024-06-11/criminalization_mignot.py +++ b/etl/steps/data/meadow/lgbt_rights/2024-06-11/criminalization_mignot.py @@ -23,7 +23,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("criminalization_mignot.csv") # Load data from snapshot. - tb = snap.read(header=None, names=COLUMN_NAMES) + tb = snap.read(safe_types=False, header=None, names=COLUMN_NAMES) # # Process data. diff --git a/etl/steps/data/meadow/lis/2024-06-13/luxembourg_income_study.py b/etl/steps/data/meadow/lis/2024-06-13/luxembourg_income_study.py index 5abb7944a23..7e10a215d88 100644 --- a/etl/steps/data/meadow/lis/2024-06-13/luxembourg_income_study.py +++ b/etl/steps/data/meadow/lis/2024-06-13/luxembourg_income_study.py @@ -76,7 +76,7 @@ def edit_snapshots_and_add_to_dataset( for tb_name, tb_ids in snapshots_dict.items(): # Retrieve snapshot. snap = paths.load_snapshot(f"{tb_name}{age_suffix}.csv") - tb = snap.read() + tb = snap.read(safe_types=False) tb[[col for col in tb.columns if col not in tb_ids]] = tb[ [col for col in tb.columns if col not in tb_ids] diff --git a/etl/steps/data/meadow/met_office_hadley_centre/2024-11-18/near_surface_temperature.meta.yml b/etl/steps/data/meadow/met_office_hadley_centre/2024-11-18/near_surface_temperature.meta.yml new file mode 100644 index 00000000000..da35b666b92 --- /dev/null +++ b/etl/steps/data/meadow/met_office_hadley_centre/2024-11-18/near_surface_temperature.meta.yml @@ -0,0 +1,3 @@ +tables: + near_surface_temperature: + title: Near-surface temperature anomaly diff --git a/etl/steps/data/meadow/met_office_hadley_centre/2024-11-18/near_surface_temperature.py b/etl/steps/data/meadow/met_office_hadley_centre/2024-11-18/near_surface_temperature.py new file mode 100644 index 00000000000..a979b16218f --- /dev/null +++ b/etl/steps/data/meadow/met_office_hadley_centre/2024-11-18/near_surface_temperature.py @@ -0,0 +1,57 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Columns to select and how to name them. +COLUMNS = { + # Additional column. + "region": "region", + # Original column names and new names. + "Time": "year", + "Anomaly (deg C)": "temperature_anomaly", + "Lower confidence limit (2.5%)": "lower_limit", + "Upper confidence limit (97.5%)": "upper_limit", +} + +# Names of snapshot files. +REGION_FILE_NAMES = { + "Global": "near_surface_temperature_global.csv", + "Northern hemisphere": "near_surface_temperature_northern_hemisphere.csv", + "Southern hemisphere": "near_surface_temperature_southern_hemisphere.csv", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshots. 
+ snapshots = {region: paths.load_snapshot(file_name) for region, file_name in REGION_FILE_NAMES.items()} + + # Load data from snapshots. + tb = pr.concat( + [snapshot.read().assign(**{"region": region}) for region, snapshot in snapshots.items()], + ignore_index=True, + short_name=paths.short_name, + ) + + # + # Process data. + # + # Select and rename required columns. + tb = tb[list(COLUMNS)].rename(columns=COLUMNS, errors="raise") + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["region", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir=dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/moatsos/2023-10-09/moatsos_historical_poverty.py b/etl/steps/data/meadow/moatsos/2023-10-09/moatsos_historical_poverty.py index 421a1218b8d..94d0e738920 100644 --- a/etl/steps/data/meadow/moatsos/2023-10-09/moatsos_historical_poverty.py +++ b/etl/steps/data/meadow/moatsos/2023-10-09/moatsos_historical_poverty.py @@ -18,25 +18,25 @@ def run(dest_dir: str) -> None: # Retrieve snapshot and load data # OECD (Cost of Basic Needs and $1.90 poverty line (2011 PPP)) snap = paths.load_snapshot("moatsos_historical_poverty_oecd.csv") - tb_oecd = snap.read() + tb_oecd = snap.read(safe_types=False) # $5, $10, $30 poverty lines (2011 PPP) snap = paths.load_snapshot("moatsos_historical_poverty_5.csv") - tb_5 = snap.read() + tb_5 = snap.read(safe_types=False) snap = paths.load_snapshot("moatsos_historical_poverty_10.csv") - tb_10 = snap.read() + tb_10 = snap.read(safe_types=False) snap = paths.load_snapshot("moatsos_historical_poverty_30.csv") - tb_30 = snap.read() + tb_30 = snap.read(safe_types=False) # CBN share for countries snap = paths.load_snapshot("moatsos_historical_poverty_oecd_countries_share.xlsx") - tb_cbn_share_countries = snap.read(sheet_name="Sheet1", header=2) + tb_cbn_share_countries = snap.read(safe_types=False, sheet_name="Sheet1", header=2) # CBN number for regions snap = paths.load_snapshot("moatsos_historical_poverty_oecd_regions_number.xlsx") - tb_cbn_number = snap.read(sheet_name="g9-4", header=17) + tb_cbn_number = snap.read(safe_types=False, sheet_name="g9-4", header=17) # Merge and format tables tables = [tb_oecd, tb_5, tb_10, tb_30] diff --git a/etl/steps/data/meadow/neglected_tropical_diseases/2024-05-02/lymphatic_filariasis.py b/etl/steps/data/meadow/neglected_tropical_diseases/2024-05-02/lymphatic_filariasis.py index 1118bfa9fe2..44895451244 100644 --- a/etl/steps/data/meadow/neglected_tropical_diseases/2024-05-02/lymphatic_filariasis.py +++ b/etl/steps/data/meadow/neglected_tropical_diseases/2024-05-02/lymphatic_filariasis.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("lymphatic_filariasis.xlsx") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/neglected_tropical_diseases/2024-05-02/schistosomiasis.py b/etl/steps/data/meadow/neglected_tropical_diseases/2024-05-02/schistosomiasis.py index e1e438a4b73..5861d392be0 100644 --- a/etl/steps/data/meadow/neglected_tropical_diseases/2024-05-02/schistosomiasis.py +++ b/etl/steps/data/meadow/neglected_tropical_diseases/2024-05-02/schistosomiasis.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("schistosomiasis.xlsx") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
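Most hunks in this part of the diff make the same one-line change, snap.read(safe_types=False). The reader's exact behaviour is not shown here, but judging from companion changes such as the lancet_2022 cast to "float[pyarrow]" further down, the new default appears to convert columns to nullable, pyarrow-backed dtypes, and these steps opt out to keep the legacy numpy behaviour. A pandas-only sketch of the distinction (the convert_dtypes call is a stand-in for whatever snap.read does internally, which is an assumption):

    import pandas as pd

    df = pd.DataFrame({"country": ["France", None], "value": [1.5, None]})

    # Legacy numpy-backed dtypes: object / float64, with NaN as the missing sentinel.
    print(df.dtypes)

    # Nullable pyarrow-backed dtypes (e.g. string[pyarrow] / double[pyarrow]),
    # with pd.NA as the missing value. Code that assumes numpy semantics, such
    # as .astype(float) on object columns, can break here, hence the opt-out.
    print(df.convert_dtypes(dtype_backend="pyarrow").dtypes)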
diff --git a/etl/steps/data/meadow/neglected_tropical_diseases/2024-05-02/soil_transmitted_helminthiases.py b/etl/steps/data/meadow/neglected_tropical_diseases/2024-05-02/soil_transmitted_helminthiases.py index 4dcb42976e8..1906d22ab59 100644 --- a/etl/steps/data/meadow/neglected_tropical_diseases/2024-05-02/soil_transmitted_helminthiases.py +++ b/etl/steps/data/meadow/neglected_tropical_diseases/2024-05-02/soil_transmitted_helminthiases.py @@ -17,7 +17,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("soil_transmitted_helminthiases.xlsx") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) tb = tb.drop(columns="country_code") # diff --git a/etl/steps/data/meadow/news/2024-05-08/guardian_mentions.py b/etl/steps/data/meadow/news/2024-05-08/guardian_mentions.py index a819fb0ede8..c3db0d6c685 100644 --- a/etl/steps/data/meadow/news/2024-05-08/guardian_mentions.py +++ b/etl/steps/data/meadow/news/2024-05-08/guardian_mentions.py @@ -14,11 +14,11 @@ def run(dest_dir: str) -> None: ## Attention (via tags) snap = paths.load_snapshot("guardian_mentions.csv") ## Load data from snapshot. - tb_tags = snap.read() + tb_tags = snap.read(safe_types=False) ## Attention (via mentions) snap = paths.load_snapshot("guardian_mentions_raw.csv") ## Load data from snapshot. - tb_mentions = snap.read() + tb_mentions = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/news/2024-05-23/gdelt_v2.py b/etl/steps/data/meadow/news/2024-05-23/gdelt_v2.py index 71a8a99258f..f6ecbf21443 100644 --- a/etl/steps/data/meadow/news/2024-05-23/gdelt_v2.py +++ b/etl/steps/data/meadow/news/2024-05-23/gdelt_v2.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gdelt_v2.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/oecd/2023-09-21/plastic_aquatic_leakage.py b/etl/steps/data/meadow/oecd/2023-09-21/plastic_aquatic_leakage.py index 6da3c23852e..80622c960ec 100644 --- a/etl/steps/data/meadow/oecd/2023-09-21/plastic_aquatic_leakage.py +++ b/etl/steps/data/meadow/oecd/2023-09-21/plastic_aquatic_leakage.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("plastic_aquatic_leakage.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/meadow/oecd/2023-09-21/plastic_aquatic_leakage_projections.py b/etl/steps/data/meadow/oecd/2023-09-21/plastic_aquatic_leakage_projections.py index 1c44bd25329..234e8f937e5 100644 --- a/etl/steps/data/meadow/oecd/2023-09-21/plastic_aquatic_leakage_projections.py +++ b/etl/steps/data/meadow/oecd/2023-09-21/plastic_aquatic_leakage_projections.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("plastic_aquatic_leakage_projections.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/meadow/oecd/2023-09-21/plastic_emissions.py b/etl/steps/data/meadow/oecd/2023-09-21/plastic_emissions.py index bd9184c51f5..8b608e81dcf 100644 --- a/etl/steps/data/meadow/oecd/2023-09-21/plastic_emissions.py +++ b/etl/steps/data/meadow/oecd/2023-09-21/plastic_emissions.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("plastic_emissions.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
# diff --git a/etl/steps/data/meadow/oecd/2023-09-21/plastic_environment_leakage_projections.py b/etl/steps/data/meadow/oecd/2023-09-21/plastic_environment_leakage_projections.py index aaff5c74f74..12bf1fae7a5 100644 --- a/etl/steps/data/meadow/oecd/2023-09-21/plastic_environment_leakage_projections.py +++ b/etl/steps/data/meadow/oecd/2023-09-21/plastic_environment_leakage_projections.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("plastic_environment_leakage_projections.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/meadow/oecd/2023-09-21/plastic_fate_regions.py b/etl/steps/data/meadow/oecd/2023-09-21/plastic_fate_regions.py index fa3be3886dc..01af34a185b 100644 --- a/etl/steps/data/meadow/oecd/2023-09-21/plastic_fate_regions.py +++ b/etl/steps/data/meadow/oecd/2023-09-21/plastic_fate_regions.py @@ -15,7 +15,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("plastic_fate_regions.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # Process data. # columns_to_use = ["Location", "Plastic end of life fates", "Time", "Value"] diff --git a/etl/steps/data/meadow/oecd/2023-09-21/plastic_fate_regions_projections.py b/etl/steps/data/meadow/oecd/2023-09-21/plastic_fate_regions_projections.py index a8433c2bfab..ab01591db35 100644 --- a/etl/steps/data/meadow/oecd/2023-09-21/plastic_fate_regions_projections.py +++ b/etl/steps/data/meadow/oecd/2023-09-21/plastic_fate_regions_projections.py @@ -15,7 +15,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("plastic_fate_regions_projections.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/meadow/oecd/2023-09-21/plastic_use_application.py b/etl/steps/data/meadow/oecd/2023-09-21/plastic_use_application.py index 33260d86629..f7c7e28a260 100644 --- a/etl/steps/data/meadow/oecd/2023-09-21/plastic_use_application.py +++ b/etl/steps/data/meadow/oecd/2023-09-21/plastic_use_application.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("plastic_use_application.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/meadow/oecd/2023-09-21/plastic_use_polymer.py b/etl/steps/data/meadow/oecd/2023-09-21/plastic_use_polymer.py index 8d164f5197e..83ffef162a1 100644 --- a/etl/steps/data/meadow/oecd/2023-09-21/plastic_use_polymer.py +++ b/etl/steps/data/meadow/oecd/2023-09-21/plastic_use_polymer.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("plastic_use_polymer.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/meadow/oecd/2023-09-21/plastic_use_projections.py b/etl/steps/data/meadow/oecd/2023-09-21/plastic_use_projections.py index ee80a18c914..bfbf0edd112 100644 --- a/etl/steps/data/meadow/oecd/2023-09-21/plastic_use_projections.py +++ b/etl/steps/data/meadow/oecd/2023-09-21/plastic_use_projections.py @@ -15,7 +15,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("plastic_use_projections.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
diff --git a/etl/steps/data/meadow/oecd/2023-09-21/plastic_waste_2019.py b/etl/steps/data/meadow/oecd/2023-09-21/plastic_waste_2019.py index 75d754ae739..80467f93782 100644 --- a/etl/steps/data/meadow/oecd/2023-09-21/plastic_waste_2019.py +++ b/etl/steps/data/meadow/oecd/2023-09-21/plastic_waste_2019.py @@ -15,7 +15,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("plastic_waste_2019.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/oecd/2023-10-11/life_expectancy_birth.py b/etl/steps/data/meadow/oecd/2023-10-11/life_expectancy_birth.py index bc51c111443..a7f0f2ce1eb 100644 --- a/etl/steps/data/meadow/oecd/2023-10-11/life_expectancy_birth.py +++ b/etl/steps/data/meadow/oecd/2023-10-11/life_expectancy_birth.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("life_expectancy_birth.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/oecd/2024-02-23/health_expenditure.py b/etl/steps/data/meadow/oecd/2024-02-23/health_expenditure.py index 346b4c5a82c..492476f1194 100644 --- a/etl/steps/data/meadow/oecd/2024-02-23/health_expenditure.py +++ b/etl/steps/data/meadow/oecd/2024-02-23/health_expenditure.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("health_expenditure.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/oecd/2024-04-10/income_distribution_database.py b/etl/steps/data/meadow/oecd/2024-04-10/income_distribution_database.py index 78c76fa0ba9..e88daab3866 100644 --- a/etl/steps/data/meadow/oecd/2024-04-10/income_distribution_database.py +++ b/etl/steps/data/meadow/oecd/2024-04-10/income_distribution_database.py @@ -20,7 +20,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("income_distribution_database.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/oecd/2024-04-30/affordable_housing_database.py b/etl/steps/data/meadow/oecd/2024-04-30/affordable_housing_database.py index 928d5d974b1..8aebf129177 100644 --- a/etl/steps/data/meadow/oecd/2024-04-30/affordable_housing_database.py +++ b/etl/steps/data/meadow/oecd/2024-04-30/affordable_housing_database.py @@ -205,22 +205,22 @@ def run(dest_dir: str) -> None: # Load data from snapshot. 
# Point-in-time data - tb_point_in_time = snap.read(sheet_name="HC3.1.1", usecols="L:P", skiprows=2, na_values=[".."]) + tb_point_in_time = snap.read(safe_types=False, sheet_name="HC3.1.1", usecols="L:P", skiprows=2, na_values=[".."]) # Flow data - tb_flow = snap.read(sheet_name="HC3.1.1", usecols="S:W", skiprows=2, na_values=[".."]) + tb_flow = snap.read(safe_types=False, sheet_name="HC3.1.1", usecols="S:W", skiprows=2, na_values=[".."]) # Share of women - tb_women = snap.read(sheet_name="HC3.1.2", usecols="R:S", skiprows=1, na_values=[".."]) + tb_women = snap.read(safe_types=False, sheet_name="HC3.1.2", usecols="R:S", skiprows=1, na_values=[".."]) # Index of people experiencing homelessness - tb_index = snap.read(sheet_name="HC3.1.3", usecols="L:S", skiprows=6, na_values=[".."]) + tb_index = snap.read(safe_types=False, sheet_name="HC3.1.3", usecols="L:S", skiprows=6, na_values=[".."]) # Share trends - tb_share = snap.read(sheet_name="HC3.1.4", usecols="Q:AE", skiprows=6, na_values=[".."]) + tb_share = snap.read(safe_types=False, sheet_name="HC3.1.4", usecols="Q:AE", skiprows=6, na_values=[".."]) # Number of people experiencing homelessness - tb_number = snap.read(sheet_name="Table_HC3.1.A2", skiprows=3, na_values=[".."]) + tb_number = snap.read(safe_types=False, sheet_name="Table_HC3.1.A2", skiprows=3, na_values=[".."]) # # Process data. diff --git a/etl/steps/data/meadow/oecd/2024-07-01/passenger_travel.py b/etl/steps/data/meadow/oecd/2024-07-01/passenger_travel.py index c44826c62d0..d4ee072f304 100644 --- a/etl/steps/data/meadow/oecd/2024-07-01/passenger_travel.py +++ b/etl/steps/data/meadow/oecd/2024-07-01/passenger_travel.py @@ -21,7 +21,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("passenger_travel.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # drop columns not needed tb = tb[COLS_TO_KEEP.keys()] diff --git a/etl/steps/data/meadow/oecd/2024-07-01/road_accidents.py b/etl/steps/data/meadow/oecd/2024-07-01/road_accidents.py index 067544dca4b..a8b9b775dce 100644 --- a/etl/steps/data/meadow/oecd/2024-07-01/road_accidents.py +++ b/etl/steps/data/meadow/oecd/2024-07-01/road_accidents.py @@ -22,7 +22,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("road_accidents.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # drop unneeded columns tb = tb[COLS_TO_KEEP.keys()] diff --git a/etl/steps/data/meadow/oecd/2024-08-08/tackling_inequalities_brazil_2010.py b/etl/steps/data/meadow/oecd/2024-08-08/tackling_inequalities_brazil_2010.py index ec6c0c90011..796a3be700b 100644 --- a/etl/steps/data/meadow/oecd/2024-08-08/tackling_inequalities_brazil_2010.py +++ b/etl/steps/data/meadow/oecd/2024-08-08/tackling_inequalities_brazil_2010.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("tackling_inequalities_brazil_2010.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/oecd/2024-08-15/decile_ratios.py b/etl/steps/data/meadow/oecd/2024-08-15/decile_ratios.py index b0d76088202..cfc16f61951 100644 --- a/etl/steps/data/meadow/oecd/2024-08-15/decile_ratios.py +++ b/etl/steps/data/meadow/oecd/2024-08-15/decile_ratios.py @@ -23,7 +23,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("decile_ratios.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
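The affordable-housing hunk above reads several rectangular blocks out of one workbook by Excel column letters, with ".." as the missing-value marker. A self-contained sketch of the same pandas pattern; the file name is inferred from the step name (an assumption), while the sheet name, column range, and sentinels are copied from the diff:

    import pandas as pd

    # usecols accepts Excel-style letter ranges, skiprows drops the decorative
    # header rows, and na_values turns the ".." placeholders into NaN on read.
    tb_point_in_time = pd.read_excel(
        "affordable_housing_database.xlsx",
        sheet_name="HC3.1.1",
        usecols="L:P",
        skiprows=2,
        na_values=[".."],
    )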
diff --git a/etl/steps/data/meadow/oecd/2024-08-19/ppp_exchange_rates.py b/etl/steps/data/meadow/oecd/2024-08-19/ppp_exchange_rates.py index 5dfc65b12e9..3acf9ed3b75 100644 --- a/etl/steps/data/meadow/oecd/2024-08-19/ppp_exchange_rates.py +++ b/etl/steps/data/meadow/oecd/2024-08-19/ppp_exchange_rates.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("ppp_exchange_rates.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) columns_to_use = ["Reference area", "Transaction", "TIME_PERIOD", "OBS_VALUE"] tb = tb[columns_to_use] diff --git a/etl/steps/data/meadow/oecd/2024-08-21/co2_air_transport.py b/etl/steps/data/meadow/oecd/2024-08-21/co2_air_transport.py index 1c5b798cd62..394313ebf76 100644 --- a/etl/steps/data/meadow/oecd/2024-08-21/co2_air_transport.py +++ b/etl/steps/data/meadow/oecd/2024-08-21/co2_air_transport.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("co2_air_transport.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) columns_to_use = [ "Reference area", "Frequency of observation", diff --git a/etl/steps/data/meadow/ons/2024-03-20/hours_and_earnings_uk.py b/etl/steps/data/meadow/ons/2024-03-20/hours_and_earnings_uk.py index 6a2d49c0f70..4eefa74c9fb 100644 --- a/etl/steps/data/meadow/ons/2024-03-20/hours_and_earnings_uk.py +++ b/etl/steps/data/meadow/ons/2024-03-20/hours_and_earnings_uk.py @@ -33,7 +33,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("hours_and_earnings_uk.xlsx") # Load data from snapshot. - tb = snap.read(sheet_name="Table 5", header=2) + tb = snap.read(safe_types=False, sheet_name="Table 5", header=2) # # Process data. diff --git a/etl/steps/data/meadow/ophi/2024-10-28/multidimensional_poverty_index.py b/etl/steps/data/meadow/ophi/2024-10-28/multidimensional_poverty_index.py new file mode 100644 index 00000000000..67f94d5db3f --- /dev/null +++ b/etl/steps/data/meadow/ophi/2024-10-28/multidimensional_poverty_index.py @@ -0,0 +1,84 @@ +"""Load a snapshot and create a meadow dataset.""" + +from typing import List + +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Define index columns for hot and cot tables. +INDEX_COLS = [ + "country", + "year", + "loa", + "measure", + "indicator", + "region_lab", + "area_lab", + "agec2_lab", + "agec4_lab", +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap_cme = paths.load_snapshot("multidimensional_poverty_index_cme.csv") + snap_hot = paths.load_snapshot("multidimensional_poverty_index_hot.csv") + + # Load data from snapshot. + tb_cme = snap_cme.read() + tb_hot = snap_hot.read() + + # + # Process data. + # + # Format columns and index. + tb_cme = format_columns_and_index(tb=tb_cme, short_name="cme", index_columns=INDEX_COLS) + tb_hot = format_columns_and_index(tb=tb_hot, short_name="hot", index_columns=INDEX_COLS) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset( + dest_dir, tables=[tb_cme, tb_hot], check_variables_metadata=True, default_metadata=snap_cme.metadata + ) + + # Save changes in the new meadow dataset. + ds_meadow.save() + + +def format_columns_and_index(tb: Table, short_name: str, index_columns: List[str]) -> Table: + """ + Rename columns, format year and select the categories I need. + """ + # Rename columns. 
+ tb = tb.rename(columns={"cty_lab": "country"}) + + # Make year string + tb["year"] = tb["year"].astype("string") + + # In the measure column, select all the categories, except for pctb + tb = tb[tb["measure"] != "pctb"].reset_index(drop=True) + + tb = tb[~tb["loa"].isin(["hship", "agec2", "agec4", "region"])].reset_index(drop=True) + + # NOTE: On years + # As the year data is encoded in a string variable between two years, we need to map the data to a single (integer) year. + # For now, arbitrarily, I take the first year in these cases and convert to integer. + + tb["year"] = tb["year"].str[:4].astype(int) + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format( + index_columns, + short_name=short_name, + ) + + return tb diff --git a/etl/steps/data/meadow/owid/latest/ig_countries.py b/etl/steps/data/meadow/owid/latest/ig_countries.py new file mode 100644 index 00000000000..48869ca2cda --- /dev/null +++ b/etl/steps/data/meadow/owid/latest/ig_countries.py @@ -0,0 +1,34 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("ig_countries.csv") + + # Load data from snapshot. + tb = snap.read() + + # + # Process data. + # + tb["count"] = 1 + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "date"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/papers/2023-10-20/anthromes.py b/etl/steps/data/meadow/papers/2023-10-20/anthromes.py index 677d0dd1f43..527b4e959af 100644 --- a/etl/steps/data/meadow/papers/2023-10-20/anthromes.py +++ b/etl/steps/data/meadow/papers/2023-10-20/anthromes.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("anthromes.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/papers/2023-12-12/farmer_lafond_2016.py b/etl/steps/data/meadow/papers/2023-12-12/farmer_lafond_2016.py index aee259d792c..70493c8fb68 100644 --- a/etl/steps/data/meadow/papers/2023-12-12/farmer_lafond_2016.py +++ b/etl/steps/data/meadow/papers/2023-12-12/farmer_lafond_2016.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: # # Load snapshot. snap = paths.load_snapshot("farmer_lafond_2016.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # # Prepare data. diff --git a/etl/steps/data/meadow/papers/2023-12-12/nemet_2009.py b/etl/steps/data/meadow/papers/2023-12-12/nemet_2009.py index 138b61b0e95..1c47526d1e1 100644 --- a/etl/steps/data/meadow/papers/2023-12-12/nemet_2009.py +++ b/etl/steps/data/meadow/papers/2023-12-12/nemet_2009.py @@ -22,7 +22,7 @@ def run(dest_dir: str) -> None: # # Load snapshot. snap = paths.load_snapshot("nemet_2009.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
diff --git a/etl/steps/data/meadow/papers/2024-03-26/bayliss_smith_wanmali_1984.py b/etl/steps/data/meadow/papers/2024-03-26/bayliss_smith_wanmali_1984.py index 174540c781d..7a82c9562b0 100644 --- a/etl/steps/data/meadow/papers/2024-03-26/bayliss_smith_wanmali_1984.py +++ b/etl/steps/data/meadow/papers/2024-03-26/bayliss_smith_wanmali_1984.py @@ -12,7 +12,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and read its data. snap = paths.load_snapshot("bayliss_smith_wanmali_1984.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/papers/2024-03-26/brassley_2000.py b/etl/steps/data/meadow/papers/2024-03-26/brassley_2000.py index 9f3db77f080..2212e039ed4 100644 --- a/etl/steps/data/meadow/papers/2024-03-26/brassley_2000.py +++ b/etl/steps/data/meadow/papers/2024-03-26/brassley_2000.py @@ -12,7 +12,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and read its data. snap = paths.load_snapshot("brassley_2000.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/papers/2024-03-26/broadberry_et_al_2015.py b/etl/steps/data/meadow/papers/2024-03-26/broadberry_et_al_2015.py index 7fe6a10b9d1..a933c72f53f 100644 --- a/etl/steps/data/meadow/papers/2024-03-26/broadberry_et_al_2015.py +++ b/etl/steps/data/meadow/papers/2024-03-26/broadberry_et_al_2015.py @@ -12,7 +12,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and read its data. snap = paths.load_snapshot("broadberry_et_al_2015.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/papers/2024-03-26/wuepper_et_al_2020.py b/etl/steps/data/meadow/papers/2024-03-26/wuepper_et_al_2020.py index b4f9dd388eb..c1de9ddccee 100644 --- a/etl/steps/data/meadow/papers/2024-03-26/wuepper_et_al_2020.py +++ b/etl/steps/data/meadow/papers/2024-03-26/wuepper_et_al_2020.py @@ -12,7 +12,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and read its data. snap = paths.load_snapshot("wuepper_et_al_2020.xlsx") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/papers/2024-10-25/ipsos.py b/etl/steps/data/meadow/papers/2024-10-25/ipsos.py index a95c9bec119..175a8ab7288 100644 --- a/etl/steps/data/meadow/papers/2024-10-25/ipsos.py +++ b/etl/steps/data/meadow/papers/2024-10-25/ipsos.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("ipsos.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) tb = tb.rename(columns={"Entity": "country", "Year": "year"}, errors="raise").drop(columns=["Code"]) # # Process data. diff --git a/etl/steps/data/meadow/pew/2024-06-03/same_sex_marriage.py b/etl/steps/data/meadow/pew/2024-06-03/same_sex_marriage.py index f721c87c877..6c039cbe1b4 100644 --- a/etl/steps/data/meadow/pew/2024-06-03/same_sex_marriage.py +++ b/etl/steps/data/meadow/pew/2024-06-03/same_sex_marriage.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("same_sex_marriage.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
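The ipsos hunk above, like the IRENA and Met Office steps earlier, renames columns with errors="raise". That flag is the cheap insurance these steps rely on: if an upstream file silently renames a column, the step fails loudly instead of carrying the stale name forward. A minimal sketch:

    import pandas as pd

    tb = pd.DataFrame({"Entity": ["France"], "Year": [2020]})

    # errors="raise" turns a missing source column into a KeyError instead of
    # a silent no-op, so upstream format changes surface immediately.
    tb = tb.rename(columns={"Entity": "country", "Year": "year"}, errors="raise")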
diff --git a/etl/steps/data/meadow/sipri/2024-07-08/military_expenditure.py b/etl/steps/data/meadow/sipri/2024-07-08/military_expenditure.py index 25f998f1b21..72b9c75ceab 100644 --- a/etl/steps/data/meadow/sipri/2024-07-08/military_expenditure.py +++ b/etl/steps/data/meadow/sipri/2024-07-08/military_expenditure.py @@ -27,11 +27,17 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("military_expenditure.xlsx") # Load data from snapshot. - tb_constant_usd = snap.read(sheet_name="Constant (2022) US$", skiprows=5, na_values=["...", "xxx"]) - tb_constant_usd_regions = snap.read(sheet_name="Regional totals", skiprows=13, na_values=["...", "xxx"]) - tb_share_gdp = snap.read(sheet_name="Share of GDP", skiprows=5, na_values=["...", "xxx"]) - tb_per_capita = snap.read(sheet_name="Per capita", skiprows=6, na_values=["...", "xxx"]) - tb_share_govt_spending = snap.read(sheet_name="Share of Govt. spending", skiprows=7, na_values=["...", "xxx"]) + tb_constant_usd = snap.read( + safe_types=False, sheet_name="Constant (2022) US$", skiprows=5, na_values=["...", "xxx"] + ) + tb_constant_usd_regions = snap.read( + safe_types=False, sheet_name="Regional totals", skiprows=13, na_values=["...", "xxx"] + ) + tb_share_gdp = snap.read(safe_types=False, sheet_name="Share of GDP", skiprows=5, na_values=["...", "xxx"]) + tb_per_capita = snap.read(safe_types=False, sheet_name="Per capita", skiprows=6, na_values=["...", "xxx"]) + tb_share_govt_spending = snap.read( + safe_types=False, sheet_name="Share of Govt. spending", skiprows=7, na_values=["...", "xxx"] + ) # # Process data. diff --git a/etl/steps/data/meadow/space/2024-01-03/near_earth_asteroids.py b/etl/steps/data/meadow/space/2024-01-03/near_earth_asteroids.py index b1d6f1dd44d..994915f604e 100644 --- a/etl/steps/data/meadow/space/2024-01-03/near_earth_asteroids.py +++ b/etl/steps/data/meadow/space/2024-01-03/near_earth_asteroids.py @@ -18,7 +18,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("near_earth_asteroids.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/space/2024-01-04/object_launches.py b/etl/steps/data/meadow/space/2024-01-04/object_launches.py index e060202036e..2a5a8d55c6f 100644 --- a/etl/steps/data/meadow/space/2024-01-04/object_launches.py +++ b/etl/steps/data/meadow/space/2024-01-04/object_launches.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("object_launches.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/state_capacity/2023-10-19/state_capacity_dataset.py b/etl/steps/data/meadow/state_capacity/2023-10-19/state_capacity_dataset.py index 3fc87d97311..1a549390321 100644 --- a/etl/steps/data/meadow/state_capacity/2023-10-19/state_capacity_dataset.py +++ b/etl/steps/data/meadow/state_capacity/2023-10-19/state_capacity_dataset.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("state_capacity_dataset.dta") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
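The SIPRI hunk above is a pure reformat, but the pattern it wraps recurs across these steps: one workbook, several sheets, each with its own header offset, all sharing the sentinels "..." and "xxx" for missing data. A compact sketch with the sheet names and offsets copied from the diff (the dict-comprehension layout is just one way to organise it):

    import pandas as pd

    SHEETS = {
        "Constant (2022) US$": 5,
        "Share of GDP": 5,
        "Per capita": 6,
        "Share of Govt. spending": 7,
    }

    # Read each sheet with its own header offset, mapping the source's
    # missing-data sentinels to NaN.
    tables = {
        name: pd.read_excel(
            "military_expenditure.xlsx", sheet_name=name, skiprows=rows, na_values=["...", "xxx"]
        )
        for name, rows in SHEETS.items()
    }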
diff --git a/etl/steps/data/meadow/state_capacity/2023-11-10/information_capacity_dataset.py b/etl/steps/data/meadow/state_capacity/2023-11-10/information_capacity_dataset.py index db6cbf07696..0461432f936 100644 --- a/etl/steps/data/meadow/state_capacity/2023-11-10/information_capacity_dataset.py +++ b/etl/steps/data/meadow/state_capacity/2023-11-10/information_capacity_dataset.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("information_capacity_dataset.dta") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/statins/2023-10-05/bmj_2022.py b/etl/steps/data/meadow/statins/2023-10-05/bmj_2022.py index d1656b99007..3e8d09c4774 100644 --- a/etl/steps/data/meadow/statins/2023-10-05/bmj_2022.py +++ b/etl/steps/data/meadow/statins/2023-10-05/bmj_2022.py @@ -18,7 +18,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("bmj_2022.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # Process data. # @@ -55,6 +55,7 @@ def run(dest_dir: str) -> None: essential_med_tb["year"] = 2017 essential_med_tb["indicator"] = "Essential medicine list" essential_med_tb = essential_med_tb.rename(columns={"statins_essential_medicine_2017": "value"}) + essential_med_tb["value"] = essential_med_tb["value"].astype(str) essential_med_tb["value"].replace("-", np.nan, inplace=True) essential_med_tb["value"].replace("No", 0, inplace=True) essential_med_tb["value"].replace("Yes", 1, inplace=True) diff --git a/etl/steps/data/meadow/statins/2023-10-05/lancet_2022.py b/etl/steps/data/meadow/statins/2023-10-05/lancet_2022.py index 48235db62cc..cc6ad09eb58 100644 --- a/etl/steps/data/meadow/statins/2023-10-05/lancet_2022.py +++ b/etl/steps/data/meadow/statins/2023-10-05/lancet_2022.py @@ -14,11 +14,11 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("lancet_2022.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # Remove all characters after "(" in statin use (confidence boundaries for statin use) - tb["statin_use_secondary"] = tb["statin_use_secondary"].str.split("(").str[0].str.strip().astype(float) - tb["statin_use_primary"] = tb["statin_use_primary"].str.split("(").str[0].str.strip().astype(float) + tb["statin_use_secondary"] = tb["statin_use_secondary"].str.split("(").str[0].str.strip().astype("float[pyarrow]") + tb["statin_use_primary"] = tb["statin_use_primary"].str.split("(").str[0].str.strip().astype("float[pyarrow]") # Drop rows that contain only NaN values and row with Grenadines (actually belongs to St. Vincent and the Grenadines, row above) tb = tb[tb["country"] != "Grenadines"] tb = tb.dropna(how="all") diff --git a/etl/steps/data/meadow/statistics_canada/2024-08-09/gini_coefficients.py b/etl/steps/data/meadow/statistics_canada/2024-08-09/gini_coefficients.py index 5b3cb589904..11421370f59 100644 --- a/etl/steps/data/meadow/statistics_canada/2024-08-09/gini_coefficients.py +++ b/etl/steps/data/meadow/statistics_canada/2024-08-09/gini_coefficients.py @@ -26,7 +26,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gini_coefficients.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
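The lancet_2022 change above trims values like "54.3 (48.1-60.2)" down to the point estimate and lands them in a pyarrow-backed float column. The example values here are an assumption inferred from the comment about confidence boundaries; a sketch (pyarrow must be installed):

    import pandas as pd

    s = pd.Series(["54.3 (48.1-60.2)", "61.0 (55.2-66.7)"])

    # Keep everything before the "(", strip whitespace, and cast to the same
    # nullable pyarrow float dtype used in the diff.
    print(s.str.split("(").str[0].str.strip().astype("float[pyarrow]"))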
diff --git a/etl/steps/data/meadow/statistics_canada/2024-08-09/relative_poverty.py b/etl/steps/data/meadow/statistics_canada/2024-08-09/relative_poverty.py index af6313864db..457af2ca086 100644 --- a/etl/steps/data/meadow/statistics_canada/2024-08-09/relative_poverty.py +++ b/etl/steps/data/meadow/statistics_canada/2024-08-09/relative_poverty.py @@ -17,7 +17,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("relative_poverty.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/survey/2023-08-04/latinobarometro_trust.py b/etl/steps/data/meadow/survey/2023-08-04/latinobarometro_trust.py index dcbe8aa3595..d2e05040b67 100644 --- a/etl/steps/data/meadow/survey/2023-08-04/latinobarometro_trust.py +++ b/etl/steps/data/meadow/survey/2023-08-04/latinobarometro_trust.py @@ -15,7 +15,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("latinobarometro_trust.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/survey/2023-08-07/afrobarometer_trust.py b/etl/steps/data/meadow/survey/2023-08-07/afrobarometer_trust.py index b3044fd971c..d09af925f1f 100644 --- a/etl/steps/data/meadow/survey/2023-08-07/afrobarometer_trust.py +++ b/etl/steps/data/meadow/survey/2023-08-07/afrobarometer_trust.py @@ -15,7 +15,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("afrobarometer_trust.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/technology/2023-11-28/dna_sequencing.py b/etl/steps/data/meadow/technology/2023-11-28/dna_sequencing.py index 18eed4f4187..7b55c353375 100644 --- a/etl/steps/data/meadow/technology/2023-11-28/dna_sequencing.py +++ b/etl/steps/data/meadow/technology/2023-11-28/dna_sequencing.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("dna_sequencing.xls") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/tourism/2024-08-17/unwto.py b/etl/steps/data/meadow/tourism/2024-08-17/unwto.py index 2dd1b675e8b..551c6edad10 100644 --- a/etl/steps/data/meadow/tourism/2024-08-17/unwto.py +++ b/etl/steps/data/meadow/tourism/2024-08-17/unwto.py @@ -40,7 +40,7 @@ def run(dest_dir: str) -> None: tbs = [] for sheet_name in sheet_names_to_load: - tb = snap.read(sheet_name=sheet_name, header=2) + tb = snap.read(safe_types=False, sheet_name=sheet_name, header=2) # Drop unnecessary columns columns_to_drop = ["C.", "S.", "C. & S.", "Units", "Notes", "Series", "Unnamed: 38", "Unnamed: 39"] diff --git a/etl/steps/data/meadow/tourism/2024-08-17/unwto_environment.py b/etl/steps/data/meadow/tourism/2024-08-17/unwto_environment.py index 6bbd7593b8e..0ebbf785338 100644 --- a/etl/steps/data/meadow/tourism/2024-08-17/unwto_environment.py +++ b/etl/steps/data/meadow/tourism/2024-08-17/unwto_environment.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("unwto_environment.xlsx") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
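The UNWTO hunk above drops "Unnamed: 38" and "Unnamed: 39" by exact name, while the IRENA step earlier in this diff removes the same kind of column by pattern. pandas invents those names for blank header cells, so the pattern-based version is the more robust of the two. A sketch with a stand-in frame (not the real workbook):

    import pandas as pd

    # Blank header cells come back as "Unnamed: <n>" columns after read_excel.
    tb = pd.DataFrame({"Country": ["France"], "Unnamed: 38": [None], "Unnamed: 39": [None]})

    # Drop every auto-generated "Unnamed: <n>" column in one pass.
    tb = tb.drop(columns=[c for c in tb.columns if str(c).startswith("Unnamed")])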
# diff --git a/etl/steps/data/meadow/tourism/2024-08-17/unwto_gdp.py b/etl/steps/data/meadow/tourism/2024-08-17/unwto_gdp.py index 764467db2cc..8136b056d7b 100644 --- a/etl/steps/data/meadow/tourism/2024-08-17/unwto_gdp.py +++ b/etl/steps/data/meadow/tourism/2024-08-17/unwto_gdp.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("unwto_gdp.xlsx") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/meadow/tuberculosis/2023-11-27/burden_estimates.py b/etl/steps/data/meadow/tuberculosis/2023-11-27/burden_estimates.py index 5e04765b85a..df99ec370dd 100644 --- a/etl/steps/data/meadow/tuberculosis/2023-11-27/burden_estimates.py +++ b/etl/steps/data/meadow/tuberculosis/2023-11-27/burden_estimates.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("burden_estimates.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/tuberculosis/2023-12-01/budget.py b/etl/steps/data/meadow/tuberculosis/2023-12-01/budget.py index f21ddab9389..053ce733e2c 100644 --- a/etl/steps/data/meadow/tuberculosis/2023-12-01/budget.py +++ b/etl/steps/data/meadow/tuberculosis/2023-12-01/budget.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("budget.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) tb = tb.drop(columns=["iso2", "iso3", "iso_numeric", "g_whoregion"]) # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. tb = tb.underscore().set_index(["country", "year"], verify_integrity=True).sort_index() diff --git a/etl/steps/data/meadow/tuberculosis/2023-12-04/burden_disaggregated.py b/etl/steps/data/meadow/tuberculosis/2023-12-04/burden_disaggregated.py index 2211851a18c..5e2ffb6f1e3 100644 --- a/etl/steps/data/meadow/tuberculosis/2023-12-04/burden_disaggregated.py +++ b/etl/steps/data/meadow/tuberculosis/2023-12-04/burden_disaggregated.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("burden_disaggregated.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/meadow/tuberculosis/2023-12-04/drug_resistance_surveillance.py b/etl/steps/data/meadow/tuberculosis/2023-12-04/drug_resistance_surveillance.py index 22169764dba..8c9b93c3494 100644 --- a/etl/steps/data/meadow/tuberculosis/2023-12-04/drug_resistance_surveillance.py +++ b/etl/steps/data/meadow/tuberculosis/2023-12-04/drug_resistance_surveillance.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("drug_resistance_surveillance.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) tb = tb.drop(columns=["iso2", "iso3", "iso_numeric", "g_whoregion"]) # # Process data. diff --git a/etl/steps/data/meadow/tuberculosis/2023-12-05/expenditure.py b/etl/steps/data/meadow/tuberculosis/2023-12-05/expenditure.py index e04e14d2282..0b1bf185288 100644 --- a/etl/steps/data/meadow/tuberculosis/2023-12-05/expenditure.py +++ b/etl/steps/data/meadow/tuberculosis/2023-12-05/expenditure.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("expenditure.csv") # Load data from snapshot. 
- tb = snap.read() + tb = snap.read(safe_types=False) tb = tb.drop(columns=["iso2", "iso3", "iso_numeric", "g_whoregion"]) # diff --git a/etl/steps/data/meadow/tuberculosis/2023-12-06/laboratories.py b/etl/steps/data/meadow/tuberculosis/2023-12-06/laboratories.py index c9d47eeaadb..4b7b8611d40 100644 --- a/etl/steps/data/meadow/tuberculosis/2023-12-06/laboratories.py +++ b/etl/steps/data/meadow/tuberculosis/2023-12-06/laboratories.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("laboratories.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/tuberculosis/2023-12-06/latent.py b/etl/steps/data/meadow/tuberculosis/2023-12-06/latent.py index 2ef463d902a..9f47c2d61ca 100644 --- a/etl/steps/data/meadow/tuberculosis/2023-12-06/latent.py +++ b/etl/steps/data/meadow/tuberculosis/2023-12-06/latent.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("latent.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/tuberculosis/2023-12-11/notifications.py b/etl/steps/data/meadow/tuberculosis/2023-12-11/notifications.py index 09deede241b..3508c2aa183 100644 --- a/etl/steps/data/meadow/tuberculosis/2023-12-11/notifications.py +++ b/etl/steps/data/meadow/tuberculosis/2023-12-11/notifications.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("notifications.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/tuberculosis/2023-12-11/outcomes.py b/etl/steps/data/meadow/tuberculosis/2023-12-11/outcomes.py index 07d1c75376f..62533abef7e 100644 --- a/etl/steps/data/meadow/tuberculosis/2023-12-11/outcomes.py +++ b/etl/steps/data/meadow/tuberculosis/2023-12-11/outcomes.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("outcomes.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/tuberculosis/2023-12-12/outcomes_disagg.py b/etl/steps/data/meadow/tuberculosis/2023-12-12/outcomes_disagg.py index 50577d4b903..bfb32b5354d 100644 --- a/etl/steps/data/meadow/tuberculosis/2023-12-12/outcomes_disagg.py +++ b/etl/steps/data/meadow/tuberculosis/2023-12-12/outcomes_disagg.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("outcomes_disagg.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/tuberculosis/2023-12-12/unhlm_commitments.py b/etl/steps/data/meadow/tuberculosis/2023-12-12/unhlm_commitments.py index 4e082991764..7dc2f1fd293 100644 --- a/etl/steps/data/meadow/tuberculosis/2023-12-12/unhlm_commitments.py +++ b/etl/steps/data/meadow/tuberculosis/2023-12-12/unhlm_commitments.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("unhlm_commitments.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/un/2018/igme.py b/etl/steps/data/meadow/un/2018/igme.py index d128594ddf1..896fe43d660 100644 --- a/etl/steps/data/meadow/un/2018/igme.py +++ b/etl/steps/data/meadow/un/2018/igme.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("igme.csv") # Load data from snapshot. 
- tb = snap.read() + tb = snap.read(safe_types=False) tb = tb.rename(columns={"REF_AREA_NAME": "country", "REF_DATE": "year"}) columns_to_keep = [ "country", diff --git a/etl/steps/data/meadow/un/2023-10-30/un_members.py b/etl/steps/data/meadow/un/2023-10-30/un_members.py index 335829219c8..92efafc9ec6 100644 --- a/etl/steps/data/meadow/un/2023-10-30/un_members.py +++ b/etl/steps/data/meadow/un/2023-10-30/un_members.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("un_members.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/un/2024-01-17/urban_agglomerations_300k.py b/etl/steps/data/meadow/un/2024-01-17/urban_agglomerations_300k.py index ee5dd89831c..a791b6417c6 100644 --- a/etl/steps/data/meadow/un/2024-01-17/urban_agglomerations_300k.py +++ b/etl/steps/data/meadow/un/2024-01-17/urban_agglomerations_300k.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("urban_agglomerations_300k.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/meadow/un/2024-01-17/urban_agglomerations_definition.py b/etl/steps/data/meadow/un/2024-01-17/urban_agglomerations_definition.py index 6b6af46f6df..ac15852d89f 100644 --- a/etl/steps/data/meadow/un/2024-01-17/urban_agglomerations_definition.py +++ b/etl/steps/data/meadow/un/2024-01-17/urban_agglomerations_definition.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("urban_agglomerations_definition.xls") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/meadow/un/2024-01-17/urban_agglomerations_largest_cities.py b/etl/steps/data/meadow/un/2024-01-17/urban_agglomerations_largest_cities.py index c366e527527..e0470050142 100644 --- a/etl/steps/data/meadow/un/2024-01-17/urban_agglomerations_largest_cities.py +++ b/etl/steps/data/meadow/un/2024-01-17/urban_agglomerations_largest_cities.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("urban_agglomerations_largest_cities.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/meadow/un/2024-01-17/urban_agglomerations_size_class.py b/etl/steps/data/meadow/un/2024-01-17/urban_agglomerations_size_class.py index ea2a6b26afc..fa2c1ad626d 100644 --- a/etl/steps/data/meadow/un/2024-01-17/urban_agglomerations_size_class.py +++ b/etl/steps/data/meadow/un/2024-01-17/urban_agglomerations_size_class.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("urban_agglomerations_size_class.xls") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. # diff --git a/etl/steps/data/meadow/un/2024-01-17/urbanization_urban_rural.py b/etl/steps/data/meadow/un/2024-01-17/urbanization_urban_rural.py index 9241d56030a..0fbc91bac13 100644 --- a/etl/steps/data/meadow/un/2024-01-17/urbanization_urban_rural.py +++ b/etl/steps/data/meadow/un/2024-01-17/urbanization_urban_rural.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("urbanization_urban_rural.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. 
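Two index idioms coexist in these steps: budget.py above keeps the explicit underscore/set_index/sort_index chain, while the newer files (ig_countries above, census_dates below) call Table.format. The comments that accompany format(...) in this diff describe it as snake-casing columns, setting a verified index, and sorting, so treating the two as equivalent is a reasonable, if unconfirmed, reading:

    from owid.catalog import Table

    tb = Table({"Country": ["France", "Italy"], "Year": [2020, 2020], "Value": [1.0, 2.0]})

    # Older, explicit chain (as in budget.py):
    # tb = tb.underscore().set_index(["country", "year"], verify_integrity=True).sort_index()

    # Newer shorthand used by the new meadow steps:
    tb = tb.format(["country", "year"])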
# diff --git a/etl/steps/data/meadow/un/2024-07-11/un_wpp.py b/etl/steps/data/meadow/un/2024-07-11/un_wpp.py index 805c73417cd..7a735606361 100644 --- a/etl/steps/data/meadow/un/2024-07-11/un_wpp.py +++ b/etl/steps/data/meadow/un/2024-07-11/un_wpp.py @@ -129,10 +129,10 @@ def read_estimates_and_projections_from_snap(short_name: str) -> Table: snap = paths.load_snapshot(short_name) # Read tables # TODO: Add support for Low, and High variants - tb_estimates = snap.read(sheet_name="Estimates") - tb_projections_medium = snap.read(sheet_name="Medium") - # tb_projections_low = snap.read(sheet_name="Low") - # tb_projections_high = snap.read(sheet_name="High") + tb_estimates = snap.read(safe_types=False, sheet_name="Estimates") + tb_projections_medium = snap.read(safe_types=False, sheet_name="Medium") + # tb_projections_low = snap.read(safe_types=False, sheet_name="Low") + # tb_projections_high = snap.read(safe_types=False, sheet_name="High") # Merge tables tb = concat( [ diff --git a/etl/steps/data/meadow/un/2024-07-12/un_wpp.py b/etl/steps/data/meadow/un/2024-07-12/un_wpp.py index e688a7846ac..1374b39e23a 100644 --- a/etl/steps/data/meadow/un/2024-07-12/un_wpp.py +++ b/etl/steps/data/meadow/un/2024-07-12/un_wpp.py @@ -350,10 +350,10 @@ def read_from_xlsx(short_name: str) -> Table: # Read snap snap = paths.load_snapshot(short_name) # Read tables - tb_estimates = snap.read(sheet_name="Estimates", skiprows=16) - tb_projections_medium = snap.read(sheet_name="Medium variant", skiprows=16) - tb_projections_low = snap.read(sheet_name="Low variant", skiprows=16) - tb_projections_high = snap.read(sheet_name="High variant", skiprows=16) + tb_estimates = snap.read(safe_types=False, sheet_name="Estimates", skiprows=16) + tb_projections_medium = snap.read(safe_types=False, sheet_name="Medium variant", skiprows=16) + tb_projections_low = snap.read(safe_types=False, sheet_name="Low variant", skiprows=16) + tb_projections_high = snap.read(safe_types=False, sheet_name="High variant", skiprows=16) # Merge tables tb = concat( [ diff --git a/etl/steps/data/meadow/un/2024-08-27/un_sdg.py b/etl/steps/data/meadow/un/2024-08-27/un_sdg.py index a25e74b27e3..fa79fcace31 100644 --- a/etl/steps/data/meadow/un/2024-08-27/un_sdg.py +++ b/etl/steps/data/meadow/un/2024-08-27/un_sdg.py @@ -100,7 +100,7 @@ def run(dest_dir: str) -> None: log.info("un_sdg.start") snap = paths.load_snapshot("un_sdg.feather") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/un/2024-09-11/igme.py b/etl/steps/data/meadow/un/2024-09-11/igme.py index b35cff24bac..1d1dfd4f53f 100644 --- a/etl/steps/data/meadow/un/2024-09-11/igme.py +++ b/etl/steps/data/meadow/un/2024-09-11/igme.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("igme.zip") # Load data from snapshot. - tb = snap.read_in_archive("UN IGME 2023.csv", low_memory=False) + tb = snap.read_in_archive("UN IGME 2023.csv", low_memory=False, safe_types=False) # # Process data. # diff --git a/etl/steps/data/meadow/un/2024-10-21/census_dates.py b/etl/steps/data/meadow/un/2024-10-21/census_dates.py new file mode 100644 index 00000000000..33916cc0574 --- /dev/null +++ b/etl/steps/data/meadow/un/2024-10-21/census_dates.py @@ -0,0 +1,34 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. 
+ snap = paths.load_snapshot("census_dates.csv") + + # Load data from snapshot. + tb = snap.read() + + # + # Process data. + tb = tb.drop_duplicates() + # dates with less than 4 characters are not valid dates + tb = tb[tb["Date"].apply(len) > 3] + + tb = tb.format(["country", "date"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/un/2024-12-02/un_wpp_lt.py b/etl/steps/data/meadow/un/2024-12-02/un_wpp_lt.py new file mode 100644 index 00000000000..9535957eedc --- /dev/null +++ b/etl/steps/data/meadow/un/2024-12-02/un_wpp_lt.py @@ -0,0 +1,67 @@ +"""Load a snapshot and create a meadow dataset.""" + +from structlog import get_logger + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) +# Logger +log = get_logger() +# Column rename +COLUMNS_RENAME = { + "mx": "central_death_rate", + "qx": "probability_of_death", + "px": "probability_of_survival", + "lx": "number_survivors", + "dx": "number_deaths", + "Lx": "number_person_years_lived", + "Sx": "survivorship_ratio", + "Tx": "number_person_years_remaining", + "ex": "life_expectancy", + "ax": "average_survival_length", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + snap_short_names = [ + "un_wpp_lt_all", # ALL + "un_wpp_lt_f", # FEMALE + "un_wpp_lt_m", # MALE + ] + + tables = [] + for snap_short_name in snap_short_names: + # Load data from snapshot. + log.info(f"un_wpp_lt: reading {snap_short_name}") + snap = paths.load_snapshot(f"{snap_short_name}.csv") + tb = snap.read_csv( + dtype={ + "Notes": str, + "ISO3_code": "category", + "ISO2_code": "category", + }, + compression="gzip", + ) + # Rename columns + tb = tb.rename(columns=COLUMNS_RENAME) + # Filter only relevant location types + tb = tb[ + tb["LocTypeName"].isin(["Geographic region", "Income group", "Country/Area", "World", "Development group"]) + ] + # Set index + tb = tb.format(["location", "time", "sex", "agegrp", "loctypename"]) + # Add to tables list + tables.append(tb) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=tables, check_variables_metadata=True, default_metadata=snap.metadata) # type: ignore + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/unesco/2024-11-21/enrolment_rates.py b/etl/steps/data/meadow/unesco/2024-11-21/enrolment_rates.py new file mode 100644 index 00000000000..4353ba8b46e --- /dev/null +++ b/etl/steps/data/meadow/unesco/2024-11-21/enrolment_rates.py @@ -0,0 +1,29 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("enrolment_rates.csv") + + tb = snap.read(low_memory=False) + + tb = tb[["Country", "Time", "Value", "Indicator", "Flags"]] + tb = tb.rename(columns={"Time": "year"}) + + tb = tb.format(["year", "country", "indicator"]) + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. 
+ ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/unicef/2024-07-30/child_migration.py b/etl/steps/data/meadow/unicef/2024-07-30/child_migration.py index 66b55d2763c..18cc204c639 100644 --- a/etl/steps/data/meadow/unicef/2024-07-30/child_migration.py +++ b/etl/steps/data/meadow/unicef/2024-07-30/child_migration.py @@ -38,7 +38,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("child_migration.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # Rename columns. tb = tb[COLUMNS_TO_KEEP] diff --git a/etl/steps/data/meadow/unu_wider/2023-11-01/government_revenue_dataset.py b/etl/steps/data/meadow/unu_wider/2023-11-01/government_revenue_dataset.py index ca80a82eea4..7772cbee02d 100644 --- a/etl/steps/data/meadow/unu_wider/2023-11-01/government_revenue_dataset.py +++ b/etl/steps/data/meadow/unu_wider/2023-11-01/government_revenue_dataset.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("government_revenue_dataset.dta") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/unu_wider/2024-04-22/world_income_inequality_database.py b/etl/steps/data/meadow/unu_wider/2024-04-22/world_income_inequality_database.py index b7c63ebe650..a6d56a7683f 100644 --- a/etl/steps/data/meadow/unu_wider/2024-04-22/world_income_inequality_database.py +++ b/etl/steps/data/meadow/unu_wider/2024-04-22/world_income_inequality_database.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("world_income_inequality_database.xlsx") # Load data from snapshot. - tb = snap.read(sheet_name="Sheet1") + tb = snap.read(safe_types=False, sheet_name="Sheet1") # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. tb = tb.format( diff --git a/etl/steps/data/meadow/urbanization/2024-10-14/ghsl_degree_of_urbanisation.py b/etl/steps/data/meadow/urbanization/2024-10-14/ghsl_degree_of_urbanisation.py index 8d4a8bac16c..568ae73296f 100644 --- a/etl/steps/data/meadow/urbanization/2024-10-14/ghsl_degree_of_urbanisation.py +++ b/etl/steps/data/meadow/urbanization/2024-10-14/ghsl_degree_of_urbanisation.py @@ -62,7 +62,7 @@ def run(dest_dir: str) -> None: def load_and_process_sheet(snap, sheet_name: str, columns_to_drop: list) -> Table: # Load data from snapshot. - tb = snap.read(sheet_name=sheet_name) + tb = snap.read(safe_types=False, sheet_name=sheet_name) # Remove rows where all values are NaNs. tb = tb.dropna(how="all") diff --git a/etl/steps/data/meadow/urbanization/2024-12-02/ghsl_urban_centers.py b/etl/steps/data/meadow/urbanization/2024-12-02/ghsl_urban_centers.py new file mode 100644 index 00000000000..ac3575f9683 --- /dev/null +++ b/etl/steps/data/meadow/urbanization/2024-12-02/ghsl_urban_centers.py @@ -0,0 +1,112 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("ghsl_urban_centers.xlsx") + + # Load data from snapshot. 
+ tb_urban_center_names = snap.read(safe_types=False, sheet_name="General_info") + tb_urban_center_density = snap.read(safe_types=False, sheet_name="Area_km2_time_series") + tb_urban_center_population = snap.read(safe_types=False, sheet_name="POP_time_series") + + # Process data. + # + + # Remove duplicates in the ID sheet - based on the name of the urban center and country + tb_urban_center_names = tb_urban_center_names.drop_duplicates(subset=["Main Name", "GADM_name"]) + + tb_urban_center_names = tb_urban_center_names[ + [ + "ID_MTUC_G0", + "Main Name", + "GADM_name", + "UNSDGRegion", + "CountryCapital", + ] + ] + tb_urban_center_density = tb_urban_center_density.melt( + id_vars=["ID_MTUC_G0"], var_name="year", value_name="urban_area" + ) + tb_urban_center_population = tb_urban_center_population.melt( + id_vars=["ID_MTUC_G0"], var_name="year", value_name="urban_pop" + ) + + # Replace zeros with NaNs in the urban_pop column (when the urban center did not meet the criteria) + tb_urban_center_population["urban_pop"] = tb_urban_center_population["urban_pop"].replace(0, pd.NA) + + # Convert the urban_pop column to a numeric dtype + tb_urban_center_population["urban_pop"] = pd.to_numeric(tb_urban_center_population["urban_pop"], errors="coerce") + + tb = pr.merge( + tb_urban_center_population, + tb_urban_center_density, + on=["ID_MTUC_G0", "year"], + how="outer", + ) + tb["urban_density"] = tb["urban_pop"] / tb["urban_area"] + + tb = pr.merge( + tb, + tb_urban_center_names, + on="ID_MTUC_G0", + how="right", + ) + + tb = tb.rename( + columns={ + "GADM_name": "country", + "Main Name": "urban_center_name", + "UNSDGRegion": "region", + "WBIncome2022": "income_group", + "CountryCapital": "capital", + } + ) + + # Drop rows where urban_center_name is NaN + tb = tb.dropna(subset=["urban_center_name"]) + + # Population and density of capital cities + tb_capitals = tb[tb["capital"] == 1] + + tb_capitals = tb_capitals.drop(columns=["ID_MTUC_G0", "region", "capital"]) + + # Select the top 100 most populous cities in 2020 + tb_2020 = tb[tb["year"] == 2020] + top_100_pop_2020 = tb_2020.nlargest(100, "urban_pop").drop_duplicates(subset=["ID_MTUC_G0"]) + + # Filter the original Table to select the top urban centers + tb_top = tb[tb["ID_MTUC_G0"].isin(top_100_pop_2020["ID_MTUC_G0"])] + + tb_top = tb_top.drop(columns=["urban_area", "ID_MTUC_G0", "region", "capital"]) + tb_top = tb_top.rename(columns={"urban_density": "urban_density_top_100", "urban_pop": "urban_pop_top_100"}) + + # Format the country column + tb_top["country"] = tb_top["urban_center_name"] + " (" + tb_top["country"] + ")" + tb_top = tb_top.drop(columns=["urban_center_name"]) + + tb = pr.merge(tb_capitals, tb_top, on=["country", "year"], how="outer") + + for col in ["urban_pop", "urban_density_top_100", "urban_pop_top_100"]: + tb[col].metadata.origins = tb["country"].metadata.origins + + tb = tb.format(["country", "year"]) + + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset.
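The reshape in `ghsl_urban_centers.py` above follows a standard melt-merge-divide pattern. A toy version with made-up values, to show how the per-year density falls out of the two long tables:

```python
import pandas as pd

pop = pd.DataFrame({"ID_MTUC_G0": [1, 1], "year": [2015, 2020], "urban_pop": [1_000_000, 1_200_000]})
area = pd.DataFrame({"ID_MTUC_G0": [1, 1], "year": [2015, 2020], "urban_area": [250.0, 260.0]})

tb = pop.merge(area, on=["ID_MTUC_G0", "year"], how="outer")
tb["urban_density"] = tb["urban_pop"] / tb["urban_area"]  # people per km²
```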
+ ds_meadow.save() diff --git a/etl/steps/data/meadow/usda_ers/2024-05-23/food_availability.py b/etl/steps/data/meadow/usda_ers/2024-05-23/food_availability.py index 1eeba5fcac0..4041fc5610d 100644 --- a/etl/steps/data/meadow/usda_ers/2024-05-23/food_availability.py +++ b/etl/steps/data/meadow/usda_ers/2024-05-23/food_availability.py @@ -18,7 +18,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshots and read their data. snap = paths.load_snapshot("food_availability.xls") - data = snap.read(sheet_name="Totals", skiprows=1) + data = snap.read(safe_types=False, sheet_name="Totals", skiprows=1) # # Process data. diff --git a/etl/steps/data/meadow/usgs/2024-07-15/historical_statistics_for_mineral_and_material_commodities.py b/etl/steps/data/meadow/usgs/2024-07-15/historical_statistics_for_mineral_and_material_commodities.py index ddfb88f2a2b..5febdd0b6f9 100644 --- a/etl/steps/data/meadow/usgs/2024-07-15/historical_statistics_for_mineral_and_material_commodities.py +++ b/etl/steps/data/meadow/usgs/2024-07-15/historical_statistics_for_mineral_and_material_commodities.py @@ -48,6 +48,8 @@ World production data for 1908 to the most recent year represent the quantity of feldspar that was produced annually throughout the world as reported in the MR and the MYB. World production data do not include production data for nepheline syenite.""", "silver": """World Production World production data for 1900 to the most recent year represent the recoverable silver content of precious-metal ores that were extracted from mines throughout the world. World production data were from the MR and the MYB.""", + "kyanite": """World Production +World production data for 1928-60 were from the “World Production” table in the 1960 MYB. World production data for 1961-70 were from the “World Mine Production” table in the CDS. World production data for 1971-2002 were from the MCS. Data for 2003 to 2006 are unpublished revisions made by the USGS kyanite commodity specialist. World production data for 2007 to the most recent year are from the 2010 to most recent MYB.""", } @@ -244,6 +246,23 @@ def combine_data_for_all_commodities( continue df = clean_sheet_data(data=data, commodity=commodity, sheet_name=sheet_name) + if sheet_name == "Gemstones": + # Gemstones does not contain a "Production" column, but a "Production value ($)" column. + # For now, for consistency with all other files, ignore this column. + df = df.drop(columns=["Production value ($)"], errors="raise") + + if sheet_name == "Ilmenite and Slag": + # For commodity "titanium_mineralconcentrates", the table contains a column "Unit values", with subcolumns for "Ilmenite", "Slag", and "Weighted average". We could attempt to extract the latter, but for now, simply skip. + continue + + if sheet_name == "Lithium statistics": + # Lithium has three different columns for global production, namely "World production (gross weight)", "World production (lithium content)", and "World production (lithium carbonate equivalent)". + # For consistency with USGS current data, we'll use the "World production (lithium content)" column. + # NOTE: The original "World production (gross weight)" column was already renamed to "World production" in clean_sheet_data(). + df = df.drop( + columns=["World production", "World production (lithium carbonate equivalent)"], errors="raise" + ).rename(columns={"World production (lithium content)": "World production"}, errors="raise") + # Add the dataframe for the current commodity to the combined dataframe. 
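A note on the `errors="raise"` arguments in the sheet-specific fixes above: they turn what would otherwise be silent no-ops into hard failures if USGS renames or drops a column in a future release. A small self-contained illustration using the lithium column names from the step:

```python
import pandas as pd

df = pd.DataFrame(columns=["World production", "World production (lithium content)"])

# With errors="raise", a missing source column raises a KeyError immediately
# instead of leaving the data silently unchanged.
df = df.drop(columns=["World production"], errors="raise").rename(
    columns={"World production (lithium content)": "World production"}, errors="raise"
)
```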
combined = pd.concat([combined, df]) diff --git a/etl/steps/data/meadow/war/2023-09-21/mars.py b/etl/steps/data/meadow/war/2023-09-21/mars.py index 1b9e9075723..9f12aac63cd 100644 --- a/etl/steps/data/meadow/war/2023-09-21/mars.py +++ b/etl/steps/data/meadow/war/2023-09-21/mars.py @@ -22,7 +22,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("war_mars.xls") # Read excel - tb = snap.read(sheet_name="ProjectMarsV1.1") + tb = snap.read(safe_types=False, sheet_name="ProjectMarsV1.1") # # Process data. diff --git a/etl/steps/data/meadow/war/2023-09-21/prio_v31.py b/etl/steps/data/meadow/war/2023-09-21/prio_v31.py index 11602ec0dcc..75821036e40 100644 --- a/etl/steps/data/meadow/war/2023-09-21/prio_v31.py +++ b/etl/steps/data/meadow/war/2023-09-21/prio_v31.py @@ -22,7 +22,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("prio_v31.xls") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/war/2023-11-29/chupilkin_koczan.py b/etl/steps/data/meadow/war/2023-11-29/chupilkin_koczan.py index fff1cf73d12..7c0b5ec6d5d 100644 --- a/etl/steps/data/meadow/war/2023-11-29/chupilkin_koczan.py +++ b/etl/steps/data/meadow/war/2023-11-29/chupilkin_koczan.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("chupilkin_koczan.dta") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/war/2024-01-08/strategic_nuclear_forces.py b/etl/steps/data/meadow/war/2024-01-08/strategic_nuclear_forces.py index d7264b62ea5..6398e522aa3 100644 --- a/etl/steps/data/meadow/war/2024-01-08/strategic_nuclear_forces.py +++ b/etl/steps/data/meadow/war/2024-01-08/strategic_nuclear_forces.py @@ -12,11 +12,11 @@ def run(dest_dir: str) -> None: # # Retrieve snapshots of dyadic data. snap = paths.load_snapshot("strategic_nuclear_forces.xlsx") - tb_dyadic = snap.read() + tb_dyadic = snap.read(safe_types=False) # Retrieve snapshots of monadic data. snap = paths.load_snapshot("strategic_nuclear_forces_monadic.xlsx") - tb_monadic = snap.read() + tb_monadic = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/war/2024-01-11/nuclear_threat_initiative_overview.py b/etl/steps/data/meadow/war/2024-01-11/nuclear_threat_initiative_overview.py index 619d4bdf3af..5b9ca4f7573 100644 --- a/etl/steps/data/meadow/war/2024-01-11/nuclear_threat_initiative_overview.py +++ b/etl/steps/data/meadow/war/2024-01-11/nuclear_threat_initiative_overview.py @@ -12,7 +12,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and read its main table. snap = paths.load_snapshot("nuclear_threat_initiative_overview.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/war/2024-01-23/nuclear_weapons_treaties.py b/etl/steps/data/meadow/war/2024-01-23/nuclear_weapons_treaties.py index 26d1805e481..bba68b4e45b 100644 --- a/etl/steps/data/meadow/war/2024-01-23/nuclear_weapons_treaties.py +++ b/etl/steps/data/meadow/war/2024-01-23/nuclear_weapons_treaties.py @@ -24,7 +24,7 @@ def run(dest_dir: str) -> None: for treaty_short_name, treaty_title in TREATIES.items(): # Retrieve snapshot and read its data. 
snap = paths.load_snapshot(f"{treaty_short_name}.csv") - tb = snap.read().assign(**{"treaty": treaty_title}) + tb = snap.read(safe_types=False).assign(**{"treaty": treaty_title}) data.append(tb) # diff --git a/etl/steps/data/meadow/war/2024-01-30/strategic_nuclear_forces.py b/etl/steps/data/meadow/war/2024-01-30/strategic_nuclear_forces.py index d7264b62ea5..6398e522aa3 100644 --- a/etl/steps/data/meadow/war/2024-01-30/strategic_nuclear_forces.py +++ b/etl/steps/data/meadow/war/2024-01-30/strategic_nuclear_forces.py @@ -12,11 +12,11 @@ def run(dest_dir: str) -> None: # # Retrieve snapshots of dyadic data. snap = paths.load_snapshot("strategic_nuclear_forces.xlsx") - tb_dyadic = snap.read() + tb_dyadic = snap.read(safe_types=False) # Retrieve snapshots of monadic data. snap = paths.load_snapshot("strategic_nuclear_forces_monadic.xlsx") - tb_monadic = snap.read() + tb_monadic = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/war/2024-11-22/ucdp_ced.py b/etl/steps/data/meadow/war/2024-11-22/ucdp_ced.py new file mode 100644 index 00000000000..7d7496f9cd9 --- /dev/null +++ b/etl/steps/data/meadow/war/2024-11-22/ucdp_ced.py @@ -0,0 +1,64 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("start") + + # + # Load inputs. + # + # Retrieve snapshot. + snap_10 = paths.load_snapshot(short_name="ucdp_ced_v24_0_10.csv") + snap_q3 = paths.load_snapshot(short_name="ucdp_ced_v24_01_24_09.csv") + + # Read as tables + tb_10 = snap_10.read_csv() + tb_q3 = snap_q3.read_csv() + + # Remove spurious columns, sanity checks + if "#" in tb_10.columns: + tb_10 = tb_10.drop(columns=["#"]) + + assert (tb_10.columns == tb_q3.columns).all(), "Columns do not match between monthly and quarterly snapshots!" + + # Combine tables + tb = pr.concat([tb_q3, tb_10], ignore_index=True) + tb = tb.drop_duplicates() + + # Monthly data may have events that were already reported in the quarterly release. + # Idea: Check that this is the case, and safely remove duplicates from the quarterly release, since the monthly release is more up-to-date. + + ## Ensure that all duplicate IDs are indeed because of duplicates between monthly-quarterly + value_counts = tb["id"].value_counts() + assert set(value_counts.unique()) == {1, 2}, "IDs should appear once or twice, not more!" + ids_duplicated = list(value_counts[value_counts > 1].index) + assert len(ids_duplicated) == tb_10[tb_10["id"].isin(ids_duplicated)].shape[0], "All duplicated ID" + tb = tb.drop_duplicates(subset="id", keep="last") + + # Format table + tb = tb.format( + "id", + short_name="ucdp_ced", + ) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset( + dest_dir, + tables=[tb], + check_variables_metadata=True, + ) # type: ignore + + # Save changes in the new garden dataset. + ds_meadow.save() + + paths.log.info("end") diff --git a/etl/steps/data/meadow/wash/2024-01-06/who.py b/etl/steps/data/meadow/wash/2024-01-06/who.py index d609f9e44a8..2de9833f5f3 100644 --- a/etl/steps/data/meadow/wash/2024-01-06/who.py +++ b/etl/steps/data/meadow/wash/2024-01-06/who.py @@ -20,7 +20,7 @@ def run(dest_dir: str) -> None: # Retrieve snapshot. 
snap = paths.load_snapshot("who.csv") snap_regions = paths.load_snapshot("who_regions.csv") - tb = snap.read() + tb = snap.read(safe_types=False) tb_reg = snap_regions.read() # Prepare data. tb = tb.drop(columns=["iso3"], axis=1) diff --git a/etl/steps/data/meadow/wb/2023-11-21/worldwide_bureaucracy_indicators.py b/etl/steps/data/meadow/wb/2023-11-21/worldwide_bureaucracy_indicators.py index 54d98d4d29e..a625e52918f 100644 --- a/etl/steps/data/meadow/wb/2023-11-21/worldwide_bureaucracy_indicators.py +++ b/etl/steps/data/meadow/wb/2023-11-21/worldwide_bureaucracy_indicators.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("worldwide_bureaucracy_indicators.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/wb/2023-12-29/gender_statistics.py b/etl/steps/data/meadow/wb/2023-12-29/gender_statistics.py index b442c159505..152c96b41b1 100644 --- a/etl/steps/data/meadow/wb/2023-12-29/gender_statistics.py +++ b/etl/steps/data/meadow/wb/2023-12-29/gender_statistics.py @@ -8,7 +8,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("gender_statistics.feather") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) tb = tb.format(["country", "year", "wb_seriescode"]) # Drop indicator_name column series column as it should be roughgly the same as indicator_name column (long definition of the indicator) diff --git a/etl/steps/data/meadow/wb/2024-01-17/world_bank_pip.py b/etl/steps/data/meadow/wb/2024-01-17/world_bank_pip.py index 90c84c0726d..de612a95039 100644 --- a/etl/steps/data/meadow/wb/2024-01-17/world_bank_pip.py +++ b/etl/steps/data/meadow/wb/2024-01-17/world_bank_pip.py @@ -13,7 +13,7 @@ def run(dest_dir: str) -> None: # Retrieve snapshots. # For key indicators snap = paths.load_snapshot("world_bank_pip.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # For percentiles snap_percentiles = paths.load_snapshot("world_bank_pip_percentiles.csv") diff --git a/etl/steps/data/meadow/wb/2024-01-22/thousand_bins_distribution.py b/etl/steps/data/meadow/wb/2024-01-22/thousand_bins_distribution.py index 004d287478e..b2640f95dd7 100644 --- a/etl/steps/data/meadow/wb/2024-01-22/thousand_bins_distribution.py +++ b/etl/steps/data/meadow/wb/2024-01-22/thousand_bins_distribution.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("thousand_bins_distribution.dta") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/wb/2024-03-11/income_groups.py b/etl/steps/data/meadow/wb/2024-03-11/income_groups.py index e9ff5ca6300..1dea322d1b6 100644 --- a/etl/steps/data/meadow/wb/2024-03-11/income_groups.py +++ b/etl/steps/data/meadow/wb/2024-03-11/income_groups.py @@ -18,7 +18,7 @@ def run(dest_dir: str) -> None: # # Load snapshot and read its data. snap = paths.load_snapshot("income_groups.xlsx") - tb = snap.read(sheet_name="Country Analytical History") + tb = snap.read(safe_types=False, sheet_name="Country Analytical History") # # Process data. diff --git a/etl/steps/data/meadow/wb/2024-03-27/world_bank_pip.py b/etl/steps/data/meadow/wb/2024-03-27/world_bank_pip.py index 90c84c0726d..de612a95039 100644 --- a/etl/steps/data/meadow/wb/2024-03-27/world_bank_pip.py +++ b/etl/steps/data/meadow/wb/2024-03-27/world_bank_pip.py @@ -13,7 +13,7 @@ def run(dest_dir: str) -> None: # Retrieve snapshots. 
# For key indicators snap = paths.load_snapshot("world_bank_pip.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # For percentiles snap_percentiles = paths.load_snapshot("world_bank_pip_percentiles.csv") diff --git a/etl/steps/data/meadow/wb/2024-07-29/income_groups.py b/etl/steps/data/meadow/wb/2024-07-29/income_groups.py index e9ff5ca6300..1dea322d1b6 100644 --- a/etl/steps/data/meadow/wb/2024-07-29/income_groups.py +++ b/etl/steps/data/meadow/wb/2024-07-29/income_groups.py @@ -18,7 +18,7 @@ def run(dest_dir: str) -> None: # # Load snapshot and read its data. snap = paths.load_snapshot("income_groups.xlsx") - tb = snap.read(sheet_name="Country Analytical History") + tb = snap.read(safe_types=False, sheet_name="Country Analytical History") # # Process data. diff --git a/etl/steps/data/meadow/wb/2024-09-09/food_prices_for_nutrition.py b/etl/steps/data/meadow/wb/2024-09-09/food_prices_for_nutrition.py index ea02b56e8e8..b07e05ca3a7 100644 --- a/etl/steps/data/meadow/wb/2024-09-09/food_prices_for_nutrition.py +++ b/etl/steps/data/meadow/wb/2024-09-09/food_prices_for_nutrition.py @@ -46,7 +46,7 @@ def run(dest_dir: str) -> None: # # Retrieve snapshot and read its data. snap = paths.load_snapshot("food_prices_for_nutrition.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/wb/2024-10-07/world_bank_pip.py b/etl/steps/data/meadow/wb/2024-10-07/world_bank_pip.py index d2775702981..74d4ec7c1c9 100644 --- a/etl/steps/data/meadow/wb/2024-10-07/world_bank_pip.py +++ b/etl/steps/data/meadow/wb/2024-10-07/world_bank_pip.py @@ -13,15 +13,15 @@ def run(dest_dir: str) -> None: # Retrieve snapshots. # For key indicators snap = paths.load_snapshot("world_bank_pip.csv") - tb = snap.read() + tb = snap.read(safe_types=False) # For percentiles snap_percentiles = paths.load_snapshot("world_bank_pip_percentiles.csv") - tb_percentiles = snap_percentiles.read() + tb_percentiles = snap_percentiles.read(safe_types=False) # For regional definitions snap_regions = paths.load_snapshot("world_bank_pip_regions.csv") - tb_regions = snap_regions.read() + tb_regions = snap_regions.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/wb/2024-11-04/edstats.py b/etl/steps/data/meadow/wb/2024-11-04/edstats.py new file mode 100644 index 00000000000..a8eeb008359 --- /dev/null +++ b/etl/steps/data/meadow/wb/2024-11-04/edstats.py @@ -0,0 +1,34 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("edstats.csv") + + # Load data from snapshot. + tb = snap.read(low_memory=False) + + # + # Process data. + # + # Rename indicator code and name columns + tb = tb.rename(columns={"Series": "indicator_name", "wb_seriescode": "indicator_code"}) + + tb = tb.format(["country", "year", "indicator_name"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. 
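On `low_memory=False` in the `edstats` read above: by default pandas infers dtypes chunk by chunk, so a wide file whose early rows look numeric but whose later rows contain strings ends up with mixed-dtype columns and a DtypeWarning. Disabling it makes pandas scan the whole file before deciding, at the cost of higher peak memory. A sketch (the file path is illustrative):

```python
import pandas as pd

# Parse the entire file before inferring dtypes, avoiding mixed-dtype columns.
tb = pd.read_csv("edstats.csv", low_memory=False)
```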
+ ds_meadow.save() diff --git a/etl/steps/data/meadow/wb/2024-12-03/poverty_projections.py b/etl/steps/data/meadow/wb/2024-12-03/poverty_projections.py new file mode 100644 index 00000000000..5855e159ba8 --- /dev/null +++ b/etl/steps/data/meadow/wb/2024-12-03/poverty_projections.py @@ -0,0 +1,60 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Define files directory +FILES_DIRECTORY = "FR_WLD_2024_198/Reproducibility package/Chapter 1/1-data/raw/forecasts" + +# Define index columns +INDEX_COLUMNS = ["country", "year", "povertyline", "scenario"] + +# Define table parameters +TABLE_PARAMETERS = { + "country": {"file": "FGTcountry_1990_2050_3pr24.dta"}, + "region": {"file": "FGTregion_1990_2050_3pr24.dta"}, + "global": {"file": "FGTglobal_1990_2050_3pr24.dta"}, +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("reproducibility_package_poverty_prosperity_planet.zip") + + # Define empty list to store tables. + tables = [] + for table, table_config in TABLE_PARAMETERS.items(): + # Load data from snapshot. + tb = snap.read_in_archive(f"{FILES_DIRECTORY}/{table_config['file']}") + + # + # Process data. + # + # Rename and add columns + if table == "region": + tb = tb.rename(columns={"region_pip": "country"}) + elif table == "global": + tb["country"] = "World" + + # Remove duplicates in the data + tb = tb.drop_duplicates(subset=INDEX_COLUMNS) + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(keys=INDEX_COLUMNS, short_name=table) + + # Append table to list. + tables.append(tb) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=tables, check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/who/2022-09-30/ghe.py b/etl/steps/data/meadow/who/2022-09-30/ghe.py index cb58ad977ee..d56141b3ca2 100644 --- a/etl/steps/data/meadow/who/2022-09-30/ghe.py +++ b/etl/steps/data/meadow/who/2022-09-30/ghe.py @@ -8,7 +8,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot() - tb = snap.read() + tb = snap.read(safe_types=False) # clean and transform data tb = clean_data(tb) diff --git a/etl/steps/data/meadow/who/2024-04-08/polio_afp.py b/etl/steps/data/meadow/who/2024-04-08/polio_afp.py index e7ba26edcf6..7986703e1b3 100644 --- a/etl/steps/data/meadow/who/2024-04-08/polio_afp.py +++ b/etl/steps/data/meadow/who/2024-04-08/polio_afp.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("polio_afp.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) tb = tb.rename(columns={"Country / Territory / Region": "country", "Year": "year"}) # # Process data. diff --git a/etl/steps/data/meadow/who/2024-04-09/polio_historical.py b/etl/steps/data/meadow/who/2024-04-09/polio_historical.py index 684655a90b7..34a2229bd00 100644 --- a/etl/steps/data/meadow/who/2024-04-09/polio_historical.py +++ b/etl/steps/data/meadow/who/2024-04-09/polio_historical.py @@ -16,7 +16,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("polio_historical.xls") # Load data from snapshot. - tb = snap.read(sheet_name="Polio") + tb = snap.read(safe_types=False, sheet_name="Polio") # # Process data. 
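The `snap.read_in_archive(...)` calls in `poverty_projections.py` read Stata files straight out of the downloaded zip. A rough stand-alone equivalent for one of the forecast files, using only the archive layout defined above (buffering through BytesIO keeps `read_stata` happy with non-seekable streams):

```python
import io
import zipfile

import pandas as pd

MEMBER = "FR_WLD_2024_198/Reproducibility package/Chapter 1/1-data/raw/forecasts/FGTcountry_1990_2050_3pr24.dta"

with zipfile.ZipFile("reproducibility_package_poverty_prosperity_planet.zip") as zf:
    tb = pd.read_stata(io.BytesIO(zf.read(MEMBER)))
```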
diff --git a/etl/steps/data/meadow/who/2024-04-22/polio_vaccine_schedule.py b/etl/steps/data/meadow/who/2024-04-22/polio_vaccine_schedule.py index e45801e8e59..26fbbf44b61 100644 --- a/etl/steps/data/meadow/who/2024-04-22/polio_vaccine_schedule.py +++ b/etl/steps/data/meadow/who/2024-04-22/polio_vaccine_schedule.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("polio_vaccine_schedule.xlsx") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/who/2024-04-26/avian_influenza_ah5n1.py b/etl/steps/data/meadow/who/2024-04-26/avian_influenza_ah5n1.py index a562ce43812..a4bb6343b25 100644 --- a/etl/steps/data/meadow/who/2024-04-26/avian_influenza_ah5n1.py +++ b/etl/steps/data/meadow/who/2024-04-26/avian_influenza_ah5n1.py @@ -15,7 +15,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("avian_influenza_ah5n1.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/who/2024-07-26/mortality_database.py b/etl/steps/data/meadow/who/2024-07-26/mortality_database.py index e28a3b96a0e..2b8a520879f 100644 --- a/etl/steps/data/meadow/who/2024-07-26/mortality_database.py +++ b/etl/steps/data/meadow/who/2024-07-26/mortality_database.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("mortality_database.feather") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/who/2024-07-30/ghe.py b/etl/steps/data/meadow/who/2024-07-30/ghe.py index cb58ad977ee..d56141b3ca2 100644 --- a/etl/steps/data/meadow/who/2024-07-30/ghe.py +++ b/etl/steps/data/meadow/who/2024-07-30/ghe.py @@ -8,7 +8,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot() - tb = snap.read() + tb = snap.read(safe_types=False) # clean and transform data tb = clean_data(tb) diff --git a/etl/steps/data/meadow/who/2024-08-06/mortality_database_cancer.py b/etl/steps/data/meadow/who/2024-08-06/mortality_database_cancer.py index 6d2a03eae65..f56241c5974 100644 --- a/etl/steps/data/meadow/who/2024-08-06/mortality_database_cancer.py +++ b/etl/steps/data/meadow/who/2024-08-06/mortality_database_cancer.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("mortality_database_cancer.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/who/latest/avian_influenza_ah5n1.py b/etl/steps/data/meadow/who/latest/avian_influenza_ah5n1.py index a562ce43812..a4bb6343b25 100644 --- a/etl/steps/data/meadow/who/latest/avian_influenza_ah5n1.py +++ b/etl/steps/data/meadow/who/latest/avian_influenza_ah5n1.py @@ -15,7 +15,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("avian_influenza_ah5n1.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/who/latest/flunet.py b/etl/steps/data/meadow/who/latest/flunet.py index c2c5506ae65..b8ad6a7a779 100644 --- a/etl/steps/data/meadow/who/latest/flunet.py +++ b/etl/steps/data/meadow/who/latest/flunet.py @@ -34,7 +34,7 @@ def run(dest_dir: str) -> None: # Convert messy columns to string. 
# for col in ("aother_subtype_details", "parainfluenza", "otherrespvirus"): - for col in ("aother_subtype_details",): + for col in ("aother_subtype_details", "other_respvirus_details"): ix = tb[col].notnull() tb.loc[ix, col] = tb.loc[ix, col].astype("str") diff --git a/etl/steps/data/meadow/who/latest/monkeypox.py b/etl/steps/data/meadow/who/latest/monkeypox.py index 051263e1f0d..76ff2217b08 100644 --- a/etl/steps/data/meadow/who/latest/monkeypox.py +++ b/etl/steps/data/meadow/who/latest/monkeypox.py @@ -14,7 +14,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("monkeypox.csv") # Load data from snapshot. - tb = snap.read() + tb = snap.read(safe_types=False) # # Process data. diff --git a/etl/steps/data/meadow/wpf/2024-10-03/famines.py b/etl/steps/data/meadow/wpf/2024-10-03/famines.py index c2aed7598e6..7eed3c1d884 100644 --- a/etl/steps/data/meadow/wpf/2024-10-03/famines.py +++ b/etl/steps/data/meadow/wpf/2024-10-03/famines.py @@ -76,7 +76,7 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("famines.xlsx") # Load data from snapshot. - tb = snap.read(sheet_name="0. Spreadsheet for disseminatio") + tb = snap.read(safe_types=False, sheet_name="0. Spreadsheet for disseminatio") # # Process data. diff --git a/etl/steps/export/explorers/covid/latest/covid.config.yml b/etl/steps/export/explorers/covid/latest/covid.config.yml index 78a765b42e8..34abcc9829e 100644 --- a/etl/steps/export/explorers/covid/latest/covid.config.yml +++ b/etl/steps/export/explorers/covid/latest/covid.config.yml @@ -492,7 +492,7 @@ views: title: COVID-19 vaccines administered, initial doses and boosters subtitle: Total number of doses administered, broken down by whether they are part of the initial protocol or booster doses. hasMapTab: False - type: StackedDiscreteBar + chartTypes: ["StackedDiscreteBar"] - indicator: - grapher/covid/latest/vaccinations_global/vaccinations_global#total_vaccinations_no_boosters_per_hundred_interpolated - grapher/covid/latest/vaccinations_global/vaccinations_global#total_boosters_per_hundred @@ -503,7 +503,7 @@ views: title: COVID-19 vaccines administered per 100 people, initial doses and boosters subtitle: Total number of doses administered, broken down by whether they are part of the initial protocol or booster doses, divided by the total population of the country. hasMapTab: False - type: StackedDiscreteBar + chartTypes: ["StackedDiscreteBar"] ####################### # People vaccinated @@ -622,7 +622,7 @@ views: sortBy: column sortColumnSlug: Omicron hideTotalValueLabel: True - type: StackedDiscreteBar + chartTypes: ["StackedDiscreteBar"] note: This share may not reflect the complete breakdown of cases, since only a fraction of all cases are sequenced. Recently-discovered or actively-monitored variants may be overrepresented, as suspected cases of these variants are likely to be sequenced preferentially or faster than other cases. 
####################### # Omicron variant (share) diff --git a/etl/steps/export/explorers/covid/latest/covid.py b/etl/steps/export/explorers/covid/latest/covid.py index 4da19b84510..f9f3cef1bf6 100644 --- a/etl/steps/export/explorers/covid/latest/covid.py +++ b/etl/steps/export/explorers/covid/latest/covid.py @@ -69,7 +69,7 @@ def run(dest_dir: str) -> None: # Load necessary tables # ds = paths.load_dataset("cases_deaths") - # tb = ds.read_table("cases_deaths") + # tb = ds.read("cases_deaths") # Read all tables # tables = {} diff --git a/etl/steps/export/explorers/migration/2024-08-05/migration.py b/etl/steps/export/explorers/migration/2024-08-05/migration.py index 072e4267a69..1a3e2c41557 100644 --- a/etl/steps/export/explorers/migration/2024-08-05/migration.py +++ b/etl/steps/export/explorers/migration/2024-08-05/migration.py @@ -27,12 +27,12 @@ def run(dest_dir: str) -> None: ds_wdi = paths.load_dataset("wdi") ds_idmc = paths.load_dataset("internal_displacement") - tb_child_mig = ds_unicef.read_table("child_migration") - tb_refugee_data = ds_unhcr.read_table("refugee_data") - tb_migrant_stock = ds_undesa.read_table("migrant_stock") - tb_un_wpp_full = ds_un_wpp.read_table("migration") - tb_wdi = ds_wdi.read_table("wdi") - tb_idmc = ds_idmc.read_table("internal_displacement") + tb_child_mig = ds_unicef.read("child_migration") + tb_refugee_data = ds_unhcr.read("refugee_data") + tb_migrant_stock = ds_undesa.read("migrant_stock") + tb_un_wpp_full = ds_un_wpp.read("migration") + tb_wdi = ds_wdi.read("wdi") + tb_idmc = ds_idmc.read("internal_displacement") tbs_and_ds = [ (tb_child_mig, ds_unicef), diff --git a/etl/steps/export/explorers/minerals/latest/minerals.py b/etl/steps/export/explorers/minerals/latest/minerals.py index b07032f06b7..436eeed3885 100644 --- a/etl/steps/export/explorers/minerals/latest/minerals.py +++ b/etl/steps/export/explorers/minerals/latest/minerals.py @@ -3,7 +3,7 @@ import pandas as pd from structlog import get_logger -from etl.helpers import PathFinder, create_explorer_old +from etl.helpers import PathFinder, create_explorer # Initialize log. log = get_logger() @@ -22,7 +22,7 @@ def run(dest_dir: str) -> None: # # Load minerals grapher dataset and read its main table. ds = paths.load_dataset("minerals") - tb = ds.read_table("minerals") + tb = ds.read("minerals") # # Process data. @@ -190,5 +190,5 @@ def run(dest_dir: str) -> None: # Save outputs. # # Create a new explorers dataset and tsv file. - ds_explorer = create_explorer_old(dest_dir=dest_dir, config=config, df_graphers=df_graphers) + ds_explorer = create_explorer(dest_dir=dest_dir, config=config, df_graphers=df_graphers) ds_explorer.save() diff --git a/etl/steps/export/explorers/minerals/latest/minerals_supply_and_demand_prospects.py b/etl/steps/export/explorers/minerals/latest/minerals_supply_and_demand_prospects.py index e9e895ca386..700126ba8a1 100644 --- a/etl/steps/export/explorers/minerals/latest/minerals_supply_and_demand_prospects.py +++ b/etl/steps/export/explorers/minerals/latest/minerals_supply_and_demand_prospects.py @@ -18,11 +18,11 @@ def run(dest_dir: str) -> None: # # Load minerals grapher dataset on demand by technology. ds_demand = paths.load_dataset("critical_minerals_demand_by_technology") - tb_demand = ds_demand.read_table("demand_by_technology") + tb_demand = ds_demand.read("demand_by_technology") # Load minerals grapher dataset on supply by country.
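The `read_table` to `read` and `create_explorer_old` to `create_explorer` renames in these export steps are mechanical API migrations; the loading pattern after the change is simply the following fragment, with `paths` being the step's PathFinder as in the files above:

```python
ds = paths.load_dataset("minerals")
tb = ds.read("minerals")  # previously: ds.read_table("minerals")
```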
ds_supply = paths.load_dataset("critical_minerals_supply_by_country") - tb_supply = ds_supply.read_table("supply_by_country") + tb_supply = ds_supply.read("supply_by_country") # # Process data. @@ -86,7 +86,7 @@ def run(dest_dir: str) -> None: df_graphers["yAxisMin"] = 0 # Make all views stacked area charts. - df_graphers["type"] = "StackedArea" + df_graphers["chartTypes"] = ["StackedArea"] # Sanity check. error = "Duplicated rows in explorer." diff --git a/etl/steps/export/github/co2_data/latest/owid_co2.py b/etl/steps/export/github/co2_data/latest/owid_co2.py index bdc6bc1f75a..98927151f8e 100644 --- a/etl/steps/export/github/co2_data/latest/owid_co2.py +++ b/etl/steps/export/github/co2_data/latest/owid_co2.py @@ -1,374 +1,257 @@ """Garden step that combines various datasets related to greenhouse emissions and produces the OWID CO2 dataset. -Datasets combined: +The combined datasets are: * Global Carbon Budget - Global Carbon Project. * National contributions to climate change - Jones et al. -* Greenhouse gas emissions by sector - Climate Watch. * Primary energy consumption - EI & EIA. Additionally, OWID's regions dataset, population dataset and Maddison Project Database (Bolt and van Zanden, 2023) on GDP are included. -""" - +Outputs that will be committed to a branch in the co2-data repository: +* The main data file (as a .csv file). +* The codebook (as a .csv file). +* The README file. +""" import os +import tempfile +from pathlib import Path -import numpy as np -from owid.catalog import Dataset, Origin, Table +import git +import pandas as pd +from owid.catalog import Table +from structlog import get_logger from apps.owidbot import github_utils as gh from etl.helpers import PathFinder +from etl.paths import BASE_DIR + +# Initialize logger. +log = get_logger() # Get paths and naming conventions for current step. paths = PathFinder(__file__) -# Conversion factor from tonnes to million tonnes. -TONNES_TO_MILLION_TONNES = 1e-6 - -# Select columns to use from each dataset, and how to rename them. 
-GCP_COLUMNS = { - "country": "country", - "year": "year", - "emissions_total": "co2", - "emissions_total_per_capita": "co2_per_capita", - "traded_emissions": "trade_co2", - "emissions_from_cement": "cement_co2", - "emissions_from_cement_per_capita": "cement_co2_per_capita", - "emissions_from_coal": "coal_co2", - "emissions_from_coal_per_capita": "coal_co2_per_capita", - "emissions_from_flaring": "flaring_co2", - "emissions_from_flaring_per_capita": "flaring_co2_per_capita", - "emissions_from_gas": "gas_co2", - "emissions_from_gas_per_capita": "gas_co2_per_capita", - "emissions_from_oil": "oil_co2", - "emissions_from_oil_per_capita": "oil_co2_per_capita", - "emissions_from_other_industry": "other_industry_co2", - "emissions_from_other_industry_per_capita": "other_co2_per_capita", - "pct_growth_emissions_total": "co2_growth_prct", - "growth_emissions_total": "co2_growth_abs", - "emissions_total_per_gdp": "co2_per_gdp", - "emissions_total_per_unit_energy": "co2_per_unit_energy", - "consumption_emissions": "consumption_co2", - "consumption_emissions_per_capita": "consumption_co2_per_capita", - "consumption_emissions_per_gdp": "consumption_co2_per_gdp", - "cumulative_emissions_total": "cumulative_co2", - "cumulative_emissions_from_cement": "cumulative_cement_co2", - "cumulative_emissions_from_coal": "cumulative_coal_co2", - "cumulative_emissions_from_flaring": "cumulative_flaring_co2", - "cumulative_emissions_from_gas": "cumulative_gas_co2", - "cumulative_emissions_from_oil": "cumulative_oil_co2", - "cumulative_emissions_from_other_industry": "cumulative_other_co2", - "pct_traded_emissions": "trade_co2_share", - "emissions_total_as_share_of_global": "share_global_co2", - "emissions_from_cement_as_share_of_global": "share_global_cement_co2", - "emissions_from_coal_as_share_of_global": "share_global_coal_co2", - "emissions_from_flaring_as_share_of_global": "share_global_flaring_co2", - "emissions_from_gas_as_share_of_global": "share_global_gas_co2", - "emissions_from_oil_as_share_of_global": "share_global_oil_co2", - "emissions_from_other_industry_as_share_of_global": "share_global_other_co2", - "cumulative_emissions_total_as_share_of_global": "share_global_cumulative_co2", - "cumulative_emissions_from_cement_as_share_of_global": "share_global_cumulative_cement_co2", - "cumulative_emissions_from_coal_as_share_of_global": "share_global_cumulative_coal_co2", - "cumulative_emissions_from_flaring_as_share_of_global": "share_global_cumulative_flaring_co2", - "cumulative_emissions_from_gas_as_share_of_global": "share_global_cumulative_gas_co2", - "cumulative_emissions_from_oil_as_share_of_global": "share_global_cumulative_oil_co2", - "cumulative_emissions_from_other_industry_as_share_of_global": "share_global_cumulative_other_co2", - # New variables, related to land-use change emissions. 
- "cumulative_emissions_from_land_use_change": "cumulative_luc_co2", - "cumulative_emissions_from_land_use_change_as_share_of_global": "share_global_cumulative_luc_co2", - "cumulative_emissions_total_including_land_use_change": "cumulative_co2_including_luc", - "cumulative_emissions_total_including_land_use_change_as_share_of_global": "share_global_cumulative_co2_including_luc", - "emissions_from_land_use_change": "land_use_change_co2", - "emissions_from_land_use_change_as_share_of_global": "share_global_luc_co2", - "emissions_from_land_use_change_per_capita": "land_use_change_co2_per_capita", - "emissions_total_including_land_use_change": "co2_including_luc", - "emissions_total_including_land_use_change_as_share_of_global": "share_global_co2_including_luc", - "emissions_total_including_land_use_change_per_capita": "co2_including_luc_per_capita", - "emissions_total_including_land_use_change_per_gdp": "co2_including_luc_per_gdp", - "emissions_total_including_land_use_change_per_unit_energy": "co2_including_luc_per_unit_energy", - "growth_emissions_total_including_land_use_change": "co2_including_luc_growth_abs", - "pct_growth_emissions_total_including_land_use_change": "co2_including_luc_growth_prct", -} -JONES_COLUMNS = { - "country": "country", - "year": "year", - "temperature_response_co2_total": "temperature_change_from_co2", - "temperature_response_ghg_total": "temperature_change_from_ghg", - "temperature_response_ch4_total": "temperature_change_from_ch4", - "temperature_response_n2o_total": "temperature_change_from_n2o", - "share_of_temperature_response_ghg_total": "share_of_temperature_change_from_ghg", -} -CLIMATE_WATCH_GHG_COLUMNS = { - "country": "country", - "year": "year", - "total_ghg_emissions_excluding_lucf": "total_ghg_excluding_lucf", - "total_ghg_emissions_excluding_lucf_per_capita": "ghg_excluding_lucf_per_capita", - "total_ghg_emissions_including_lucf": "total_ghg", - "total_ghg_emissions_including_lucf_per_capita": "ghg_per_capita", -} -CLIMATE_WATCH_CH4_COLUMNS = { - "country": "country", - "year": "year", - "total_ch4_emissions_including_lucf": "methane", - "total_ch4_emissions_including_lucf_per_capita": "methane_per_capita", -} -CLIMATE_WATCH_N2O_COLUMNS = { - "country": "country", - "year": "year", - "total_n2o_emissions_including_lucf": "nitrous_oxide", - "total_n2o_emissions_including_lucf_per_capita": "nitrous_oxide_per_capita", -} -PRIMARY_ENERGY_COLUMNS = { - "country": "country", - "year": "year", - "primary_energy_consumption__twh": "primary_energy_consumption", - "primary_energy_consumption_per_capita__kwh": "energy_per_capita", - "primary_energy_consumption_per_gdp__kwh_per_dollar": "energy_per_gdp", -} -REGIONS_COLUMNS = { - "name": "country", - "iso_alpha3": "iso_code", -} -POPULATION_COLUMNS = { - "country": "country", - "year": "year", - "population": "population", -} -GDP_COLUMNS = { - "country": "country", - "year": "year", - "gdp": "gdp", -} - -UNITS = {"tonnes": {"conversion": TONNES_TO_MILLION_TONNES, "new_unit": "million tonnes", "new_short_unit": "Mt"}} - - -def convert_units(table: Table) -> Table: - """Convert units of table. - - Parameters - ---------- - table : Table - Data with its original units. - - Returns - ------- - Table - Data after converting units of specific columns. - - """ - table = table.copy() - # Check units and convert to more convenient ones. 
- for column in table.columns: - unit = table[column].metadata.unit - title = table[column].metadata.title - description_short = table[column].metadata.description or table[column].metadata.description_short - if unit in list(UNITS): - table[column] *= UNITS[unit]["conversion"] - table[column].metadata.unit = UNITS[unit]["new_unit"] - table[column].metadata.short_unit = UNITS[unit]["new_short_unit"] - table[column].metadata.title = title.replace(unit, UNITS[unit]["new_unit"]) - table[column].metadata.description_short = description_short.replace(unit, UNITS[unit]["new_unit"]) - - return table - - -def combine_tables( - tb_gcp: Table, - tb_jones: Table, - tb_climate_watch_ghg: Table, - tb_climate_watch_ch4: Table, - tb_climate_watch_n2o: Table, - tb_energy: Table, - tb_gdp: Table, - tb_population: Table, - tb_regions: Table, -) -> Table: - """Combine tables. - - Parameters - ---------- - tb_gcp : Table - Global Carbon Budget table (from Global Carbon Project). - tb_jones : Table - National contributions to climate change (from Jones et al. (2023)). - tb_climate_watch_ghg : Table - Greenhouse gas emissions table (from Climate Watch). - tb_climate_watch_ch4 : Table - CH4 emissions table (from Climate Watch). - tb_climate_watch_n2o : Table - N2O emissions table (from Climate Watch). - tb_energy : Table - Primary energy consumption table (from BP & EIA). - tb_gdp : Table - Maddison GDP table (from GGDC). - tb_population : Table - OWID population table (from various sources). - tb_regions : Table - OWID regions table. - - Returns - ------- - combined : Table - Combined table with metadata and variables metadata. - - """ - # Combine main tables (with an outer join, to gather all entities from all tables). - combined = tb_gcp.copy() - for table in [tb_jones, tb_climate_watch_ghg, tb_climate_watch_ch4, tb_climate_watch_n2o]: - combined = combined.merge(table, on=["country", "year"], how="outer", short_name=paths.short_name) - - # Add secondary tables (with a left join, to keep only entities for which we have emissions data). - for table in [tb_energy, tb_gdp, tb_population]: - combined = combined.merge(table, on=["country", "year"], how="left") - - # Countries-regions dataset does not have a year column, so it has to be merged on country. - combined = combined.merge(tb_regions, on="country", how="left") - - # Check that there were no repetition in column names. - error = "Repeated columns in combined data." - assert len([column for column in set(combined.columns) if "_x" in column]) == 0, error - - # Adjust units. - combined = convert_units(combined) - - return combined - - -def prepare_outputs(combined: Table, ds_regions: Dataset) -> Table: - """Clean and prepare output table. - - Parameters - ---------- - combined : Table - Combined table. - ds_regions : Dataset - Regions dataset, only used to get its version. - - Returns - ------- - combined: Table - Cleaned combined table. - - """ - # Remove rows that only have nan (ignoring if country, year, iso_code, population and gdp do have data). - columns_that_must_have_data = [ - column for column in combined.columns if column not in ["country", "year", "iso_code", "population", "gdp"] - ] - combined = combined.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) - - # Add metadata to the ISO column (loaded from the regions dataset). 
- combined["iso_code"].m.origins = [ - Origin( - producer="International Organization for Standardization", - title="Regions", - date_published=ds_regions.version, - ) - ] - combined["iso_code"].metadata.title = "ISO code" - combined["iso_code"].metadata.description_short = "ISO 3166-1 alpha-3 three-letter country codes." - combined["iso_code"].metadata.unit = "" - # Sanity check. - columns_with_inf = [column for column in combined.columns if len(combined[combined[column] == np.inf]) > 0] - assert len(columns_with_inf) == 0, f"Infinity values detected in columns: {columns_with_inf}" +def prepare_readme(tb: Table) -> str: + # NOTE: In a future update, we could figure out a way to generate the main content of the README from the table's metadata (possibly with the help of VersionTracker). + # origins = {origin.title_snapshot or origin.title: origin for origin in set(sum([tb[column].metadata.origins for column in tb.columns], []))} + readme = """\ +# Data on CO2 and Greenhouse Gas Emissions by *Our World in Data* + +Our complete CO2 and Greenhouse Gas Emissions dataset is a collection of key metrics maintained by [*Our World in Data*](https://ourworldindata.org/co2-and-other-greenhouse-gas-emissions). It is updated regularly and includes data on CO2 emissions (annual, per capita, cumulative and consumption-based), other greenhouse gases, energy mix, and other relevant metrics. + +## The complete *Our World in Data* CO2 and Greenhouse Gas Emissions dataset + +### 🗂️ Download our complete CO2 and Greenhouse Gas Emissions dataset : [CSV](https://nyc3.digitaloceanspaces.com/owid-public/data/co2/owid-co2-data.csv) | [XLSX](https://nyc3.digitaloceanspaces.com/owid-public/data/co2/owid-co2-data.xlsx) | [JSON](https://nyc3.digitaloceanspaces.com/owid-public/data/co2/owid-co2-data.json) + +The CSV and XLSX files follow a format of 1 row per location and year. The JSON version is split by country, with an array of yearly records. + +The indicators represent all of our main data related to CO2 emissions, other greenhouse gas emissions, energy mix, as well as other indicators of potential interest. + +We will continue to publish updated data on CO2 and Greenhouse Gas Emissions as it becomes available. Most metrics are published on an annual basis. + +A [full codebook](https://github.com/owid/co2-data/blob/master/owid-co2-codebook.csv) is made available, with a description and source for each indicator in the dataset. This codebook is also included as an additional sheet in the XLSX file. + +## Our source data and code + +The dataset is built upon a number of datasets and processing steps: + +- Statistical review of world energy (Energy Institute, EI): + - [Source data](https://www.energyinst.org/statistical-review) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/energy_institute/2024-06-20/statistical_review_of_world_energy.py) + - [Basic processing code](https://github.com/owid/etl/blob/master/etl/steps/data/meadow/energy_institute/2024-06-20/statistical_review_of_world_energy.py) + - [Further processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/energy_institute/2024-06-20/statistical_review_of_world_energy.py) +- International energy data (U.S. 
Energy Information Administration, EIA): + - [Source data](https://www.eia.gov/opendata/bulkfiles.php) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/eia/2023-12-12/international_energy_data.py) + - [Basic processing code](https://github.com/owid/etl/blob/master/etl/steps/data/meadow/eia/2023-12-12/energy_consumption.py) + - [Further processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/eia/2023-12-12/energy_consumption.py) +- Primary energy consumption (Our World in Data based on EI's Statistical review of world energy & EIA's International energy data): + - [Processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/energy/2024-06-20/primary_energy_consumption.py) +- Global carbon budget - Fossil CO2 emissions (Global Carbon Project): + - [Source data](https://zenodo.org/records/13981696/files/GCB2024v17_MtCO2_flat.csv) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/gcp/2024-11-13/global_carbon_budget.py) +- Global carbon budget - Global carbon emissions (Global Carbon Project): + - [Source data](https://globalcarbonbudgetdata.org/downloads/jGJH0-data/Global_Carbon_Budget_2024_v1.0.xlsx) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/gcp/2024-11-13/global_carbon_budget.py) +- Global carbon budget - National fossil carbon emissions (Global Carbon Project): + - [Source data](https://globalcarbonbudgetdata.org/downloads/jGJH0-data/National_Fossil_Carbon_Emissions_2024v1.0.xlsx) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/gcp/2024-11-13/global_carbon_budget.py) +- Global carbon budget - National land-use change carbon emissions (Global Carbon Project): + - [Source data](https://globalcarbonbudgetdata.org/downloads/jGJH0-data/National_LandUseChange_Carbon_Emissions_2024v1.0.xlsx) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/gcp/2024-11-13/global_carbon_budget.py) +- Global carbon budget (Our World in Data based on the Global Carbon Project's Fossil CO2 emissions, Global carbon emissions, National fossil carbon emissions, and National land-use change emissions): + - [Basic processing code](https://github.com/owid/etl/blob/master/etl/steps/data/meadow/gcp/2024-11-13/global_carbon_budget.py) + - [Further processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/gcp/2024-11-13/global_carbon_budget.py) +- National contributions to climate change (Jones et al. (2024)): + - [Source data](https://zenodo.org/records/7636699/latest) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/emissions/2024-11-21/national_contributions.py) + - [Basic processing code](https://github.com/owid/etl/blob/master/etl/steps/data/meadow/emissions/2024-11-21/national_contributions.py) + - [Further processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/emissions/2024-11-21/national_contributions.py) +- CO2 dataset (Our World in Data based on all sources above): + - [Processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/emissions/2024-11-21/owid_co2.py) + - [Exporting code](https://github.com/owid/etl/blob/master/etl/steps/export/github/co2_data/latest/owid_co2.py) + - [Uploading code](https://github.com/owid/etl/blob/master/etl/steps/export/s3/co2_data/latest/owid_co2.py) + +Additionally, to construct indicators per capita and per GDP, we use the following datasets and processing steps: +- Regions (Our World in Data). 
+ - [Processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/regions/2023-01-01/regions.py) +- Population (Our World in Data based on [a number of different sources](https://ourworldindata.org/population-sources)). + - [Processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/demography/2024-07-15/population/__init__.py) +- Income groups (World Bank). + - [Processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/wb/2024-07-29/income_groups.py) +- GDP (University of Groningen GGDC's Maddison Project Database, Bolt and van Zanden, 2024). + - [Source data](https://www.rug.nl/ggdc/historicaldevelopment/maddison/releases/maddison-project-database-2023) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/ggdc/2024-04-26/maddison_project_database.py) + - [Basic processing code](https://github.com/owid/etl/blob/master/etl/steps/data/meadow/ggdc/2024-04-26/maddison_project_database.py) + - [Processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/ggdc/2024-04-26/maddison_project_database.py) + +## Changelog + +- 2024-11-21: + - Updated dataset (and codebook) to use the latest version of the Global Carbon Budget (2024), and Jones et al. (2024) (version 2024.2). + - Now methane, nitrous oxide, and total greenhouse gas emissions data come from Jones et al. (2024), instead of Climate Watch, to provide a wider data coverage. +- 2024-06-20: + - Update data from the Statistical Review of World Energy. + - Update data from the Maddison Project Database. +- 2024-04-10: + - Updated dataset and codebook to use the latest version of the data on National contributions to climate change (Jones et al. (2024)). +- 2023-12-28: + - Enhanced codebook (improved descriptions, added units, updated sources). + - Updated primary energy consumption (to update metadata, nothing has changed in the data). +- 2023-12-05: + - Updated dataset (and codebook) to use the latest version of the Global Carbon Budget (2023). + - In this version, "International transport" has been replaced by "International aviation" and "International shipping". Also, some overseas territories have no data in this version. More details on the changes can be found in the pdf file hosted [here](https://zenodo.org/records/10177738). +- 2023-11-08: + - Updated CO2 emissions data to use the latest emissions by sector from Climate Watch (2023). + - Update codebook accordingly. +- 2023-10-16: + - Improved codebook. + - Fixed issue related to consumption-based emissions in Africa, and Palau emissions. +- 2023-07-10: + - Updated primary energy consumption and other indicators relying on energy data, to use the latest Statistical Review of World Energy by the Energy Institute. + - Renamed countries 'East Timor' and 'Faroe Islands'. +- 2023-05-04: + - Added indicators `share_of_temperature_change_from_ghg`, `temperature_change_from_ch4`, `temperature_change_from_co2`, `temperature_change_from_ghg`, and `temperature_change_from_n2o` using data from Jones et al. (2023). +- 2022-11-11: + - Updated CO2 emissions data with the newly released Global Carbon Budget (2022) by the Global Carbon Project. + - Added various new indicators related to national land-use change emissions. + - Added the emissions of the 1991 Kuwaiti oil fires in Kuwait's emissions (while also keeping 'Kuwaiti Oil Fires (GCP)' as a separate entity), to properly account for these emissions in the aggregate of Asia. + - Applied minor changes to entity names (e.g. "Asia (excl. 
China & India)" -> "Asia (excl. China and India)"). +- 2022-09-06: + - Updated data on primary energy consumption (from BP & EIA) and greenhouse gas emissions by sector (from CAIT). + - Refactored code, since now this repository simply loads the data, generates the output files, and uploads them to the cloud; the code to generate the dataset is now in our [etl repository](https://github.com/owid/etl). + - Minor changes in the codebook. +- 2022-04-15: + - Updated primary energy consumption data. + - Updated CO2 data to include aggregations for the different country income levels. +- 2022-02-24: + - Updated greenhouse gas emissions data from CAIT Climate Data Explorer. + - Included two new columns in dataset: total greenhouse gases excluding land-use change and forestry, and the same as per capita values. +- 2021-11-05: Updated CO2 emissions data with the newly released Global Carbon Budget (v2021). +- 2021-09-16: + - Fixed data quality issues in CO2 emissions indicators (emissions less than 0, missing data for Eswatini, ...). + - Replaced all input CSVs with data retrieved directly from ourworldindata.org. +- 2021-02-08: Updated this dataset with the latest annual release from the Global Carbon Project. +- 2020-08-07: The first version of this dataset was made available. + +## Data alterations + +- **We standardize names of countries and regions.** Since the names of countries and regions are different in different data sources, we standardize all names in order to minimize data loss during data merges. +- **We recalculate carbon emissions to CO2.** The primary data sources on CO2 emissions—the Global Carbon Project, for example—typically report emissions in tonnes of carbon. We have recalculated these figures as tonnes of CO2 using a conversion factor of 3.664. +- **We calculate per capita figures.** All of our per capita figures are calculated from our metric `Population`, which is included in the complete dataset. These population figures are sourced from [Gapminder](http://gapminder.org) and the [UN World Population Prospects (UNWPP)](https://population.un.org/wpp/). + +## License + +All visualizations, data, and code produced by _Our World in Data_ are completely open access under the [Creative Commons BY license](https://creativecommons.org/licenses/by/4.0/). You have the permission to use, distribute, and reproduce these in any medium, provided the source and authors are credited. + +The data produced by third parties and made available by _Our World in Data_ is subject to the license terms from the original third-party authors. We will always indicate the original source of the data in our database, and you should always check the license of any such third-party data before use. + +## Authors + +This data has been collected, aggregated, and documented by Hannah Ritchie, Max Roser, Edouard Mathieu, Bobbie Macdonald and Pablo Rosado. + +The mission of *Our World in Data* is to make data and research on the world's largest problems understandable and accessible. [Read more about our mission](https://ourworldindata.org/about). + + +## How to cite this data? + +If you are using this dataset, please cite both [Our World in Data](https://ourworldindata.org/co2-and-greenhouse-gas-emissions#article-citation) and the underlying data source(s). + +Please follow [the guidelines in our FAQ](https://ourworldindata.org/faqs#citing-work-produced-by-third-parties-and-made-available-by-our-world-in-data) on how to cite our work. - # Set index and sort conveniently. 
- combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index() +""" + return readme + + +def prepare_and_save_outputs(tb: Table, codebook: Table, temp_dir_path: Path) -> None: + # Create codebook and save it as a csv file. + log.info("Creating codebook csv file.") + pd.DataFrame(codebook).to_csv(temp_dir_path / "owid-co2-codebook.csv", index=False) - return combined + # Create a csv file. + log.info("Creating csv file.") + pd.DataFrame(tb).to_csv(temp_dir_path / "owid-co2-data.csv", index=False, float_format="%.3f") + + # Create a README file. + log.info("Creating README file.") + readme = prepare_readme(tb) + (temp_dir_path / "README.md").write_text(readme) def run(dest_dir: str) -> None: # # Load data. # - # Load the global carbon budget dataset from the Global Carbon Project (GCP). - ds_gcp = paths.load_dataset("global_carbon_budget") - - # Load the Jones et al. (2023) dataset on national contributions to climate change. - ds_jones = paths.load_dataset("national_contributions") - - # Load the greenhouse gas emissions by sector dataset by Climate Watch. - ds_climate_watch = paths.load_dataset("emissions_by_sector") - - # Load the GDP dataset by GGDC Maddison. - ds_gdp = paths.load_dataset("maddison_project_database") - - # Load primary energy consumption dataset (by different sources in our 'energy' namespace). - ds_energy = paths.load_dataset("primary_energy_consumption") - - # Load population dataset. - ds_population = paths.load_dataset("population") - - # Load countries-regions dataset (required to get ISO codes). - ds_regions = paths.load_dataset("regions") - - # Gather all required tables from all datasets. - tb_gcp = ds_gcp["global_carbon_budget"] - tb_jones = ds_jones["national_contributions"] - tb_climate_watch_ghg = ds_climate_watch["greenhouse_gas_emissions_by_sector"] - tb_climate_watch_ch4 = ds_climate_watch["methane_emissions_by_sector"] - tb_climate_watch_n2o = ds_climate_watch["nitrous_oxide_emissions_by_sector"] - tb_energy = ds_energy["primary_energy_consumption"] - tb_gdp = ds_gdp["maddison_project_database"] - tb_population = ds_population["population"] - tb_regions = ds_regions["regions"] + # Load the owid_co2 emissions dataset from garden, and read its main table and codebook. + ds_gcp = paths.load_dataset("owid_co2") + tb = ds_gcp.read("owid_co2") + codebook = ds_gcp.read("owid_co2_codebook") # - # Process data. + # Save outputs. # - # Choose required columns and rename them. 
- tb_gcp = tb_gcp.reset_index()[list(GCP_COLUMNS)].rename(columns=GCP_COLUMNS, errors="raise") - tb_jones = tb_jones.reset_index()[list(JONES_COLUMNS)].rename(columns=JONES_COLUMNS, errors="raise") - tb_climate_watch_ghg = tb_climate_watch_ghg.reset_index()[list(CLIMATE_WATCH_GHG_COLUMNS)].rename( - columns=CLIMATE_WATCH_GHG_COLUMNS, errors="raise" - ) - tb_climate_watch_ch4 = tb_climate_watch_ch4.reset_index()[list(CLIMATE_WATCH_CH4_COLUMNS)].rename( - columns=CLIMATE_WATCH_CH4_COLUMNS, errors="raise" - ) - tb_climate_watch_n2o = tb_climate_watch_n2o.reset_index()[list(CLIMATE_WATCH_N2O_COLUMNS)].rename( - columns=CLIMATE_WATCH_N2O_COLUMNS, errors="raise" - ) - tb_energy = tb_energy.reset_index()[list(PRIMARY_ENERGY_COLUMNS)].rename( - columns=PRIMARY_ENERGY_COLUMNS, errors="raise" - ) - tb_gdp = tb_gdp.reset_index()[list(GDP_COLUMNS)].rename(columns=GDP_COLUMNS, errors="raise") - tb_population = tb_population.reset_index()[list(POPULATION_COLUMNS)].rename( - columns=POPULATION_COLUMNS, errors="raise" - ) - tb_regions = tb_regions.reset_index()[list(REGIONS_COLUMNS)].rename(columns=REGIONS_COLUMNS, errors="raise") - - # Combine tables. - combined = combine_tables( - tb_gcp=tb_gcp, - tb_jones=tb_jones, - tb_climate_watch_ghg=tb_climate_watch_ghg, - tb_climate_watch_ch4=tb_climate_watch_ch4, - tb_climate_watch_n2o=tb_climate_watch_n2o, - tb_energy=tb_energy, - tb_gdp=tb_gdp, - tb_population=tb_population, - tb_regions=tb_regions, - ) - - # Prepare outputs. - combined = prepare_outputs(combined=combined, ds_regions=ds_regions) - - # If you want to really commit the data, use `CO2_BRANCH=my-branch etlr github/co2_data --export` - if os.environ.get("CO2_BRANCH"): - dry_run = False - branch = os.environ["CO2_BRANCH"] - else: + branch = git.Repo(BASE_DIR).active_branch.name + + if branch == "master": + log.warning("You are on master branch, using dry mode.") dry_run = True - branch = "master" - - gh.commit_file_to_github( - combined.to_csv(), - repo_name="co2-data", - file_path="owid-co2-data.csv", - commit_message=":bar_chart: Automated update", - branch=branch, - dry_run=dry_run, - ) + else: + log.info(f"Committing files to branch {branch}") + # Load DRY_RUN from env or use False as default. + dry_run = bool(int(os.environ.get("DRY_RUN", 0))) + + # Uncomment to inspect changes. + # from etl.data_helpers.misc import compare_tables + # branch = "update-ghg-emissions" + # old = pd.read_csv("https://raw.githubusercontent.com/owid/co2-data/refs/heads/master/owid-co2-data.csv") + # new = tb.copy() + # new = pd.read_csv(f"https://raw.githubusercontent.com/owid/co2-data/refs/heads/{branch}/owid-co2-data.csv") + # compare_tables(old, new, countries=["World"]) + + # Create a temporary directory for all files to be committed. + with tempfile.TemporaryDirectory() as temp_dir: + temp_dir_path = Path(temp_dir) + + prepare_and_save_outputs(tb, codebook=codebook, temp_dir_path=temp_dir_path) + + gh.create_branch_if_not_exists(repo_name="co2-data", branch=branch, dry_run=dry_run) + + # Commit csv files to the repos. + for file_name in ["owid-co2-data.csv", "owid-co2-codebook.csv", "README.md"]: + with (temp_dir_path / file_name).open("r") as file_content: + gh.commit_file_to_github( + file_content.read(), + repo_name="co2-data", + file_path=file_name, + commit_message=":bar_chart: Automated update", + branch=branch, + dry_run=dry_run, + ) + + if not dry_run: + log.info( + f"Files committed successfully to branch {branch}. Create a PR here https://github.com/owid/co2-data/compare/master...{branch}." 
+ ) diff --git a/etl/steps/export/github/who/latest/monkeypox.py b/etl/steps/export/github/who/latest/monkeypox.py index c68c94c9117..f26d8e8868f 100644 --- a/etl/steps/export/github/who/latest/monkeypox.py +++ b/etl/steps/export/github/who/latest/monkeypox.py @@ -12,7 +12,7 @@ def run(dest_dir: str) -> None: # Load inputs. # ds = paths.load_dataset("monkeypox") - tb = ds.read_table("monkeypox") + tb = ds.read("monkeypox") # Process it for backwards compatibility. tb = tb.rename(columns={"country": "location"}).drop(columns=["suspected_cases_cumulative", "annotation"]) diff --git a/etl/steps/export/multidim/covid/latest/covid.cases_tests.yml b/etl/steps/export/multidim/covid/latest/covid.cases_tests.yml index eb2c8d5b211..9ca21fa345e 100644 --- a/etl/steps/export/multidim/covid/latest/covid.cases_tests.yml +++ b/etl/steps/export/multidim/covid/latest/covid.cases_tests.yml @@ -23,7 +23,7 @@ views: y: grapher/covid/latest/testing/testing#new_tests_7day_smoothed x: grapher/covid/latest/cases_deaths/cases_deaths#new_cases_7_day_avg_right config: - type: ScatterPlot + chartTypes: ["ScatterPlot"] - dimensions: normalize: per_capita indicators: @@ -31,7 +31,7 @@ views: x: grapher/covid/latest/cases_deaths/cases_deaths#new_cases_per_million_7_day_avg_right color: 123 config: - type: ScatterPlot + chartTypes: ["ScatterPlot"] map: colorScale: binningStrategy: equalInterval diff --git a/etl/steps/export/multidim/covid/latest/covid.covax.yml b/etl/steps/export/multidim/covid/latest/covid.covax.yml index f637d785189..c6fcfaa58c0 100644 --- a/etl/steps/export/multidim/covid/latest/covid.covax.yml +++ b/etl/steps/export/multidim/covid/latest/covid.covax.yml @@ -67,7 +67,7 @@ views: subtitle: Doses donated to the COVAX initiative by each country. note: COVAX is a worldwide initiative aimed at equitable access to COVID-19 vaccines. It is directed by Gavi, CEPI, and the WHO. originUrl: ourworldindata.org/coronavirus - type: StackedDiscreteBar + chartTypes: ["StackedDiscreteBar"] sortBy: column sortColumnSlug: "{definitions.table}#delivered" dimensions: @@ -100,7 +100,7 @@ views: subtitle: Doses donated to the COVAX initiative by each country, per person living in the donating country. note: COVAX is a worldwide initiative aimed at equitable access to COVID-19 vaccines. It is directed by Gavi, CEPI, and the WHO. Gross domestic product is expressed in U.S. Dollars; it is sourced from the World Bank and OECD. originUrl: ourworldindata.org/coronavirus - type: StackedDiscreteBar + chartTypes: ["StackedDiscreteBar"] - dimensions: normalize: per_dose @@ -115,7 +115,7 @@ views: subtitle: Doses donated to the COVAX initiative by each country, per dose administered domestically. note: COVAX is a worldwide initiative aimed at equitable access to COVID-19 vaccines. It is directed by Gavi, CEPI, and the WHO. Gross domestic product is expressed in U.S. Dollars; it is sourced from the World Bank and OECD. originUrl: ourworldindata.org/coronavirus - type: StackedDiscreteBar + chartTypes: ["StackedDiscreteBar"] - dimensions: normalize: per_gdp @@ -130,5 +130,4 @@ views: subtitle: Doses donated to the COVAX initiative by each country, per million dollars of GDP of the donating country. note: COVAX is a worldwide initiative aimed at equitable access to COVID-19 vaccines. It is directed by Gavi, CEPI, and the WHO. Gross domestic product is expressed in U.S. Dollars; it is sourced from the World Bank and OECD. 
originUrl: ourworldindata.org/coronavirus - type: StackedDiscreteBar - + chartTypes: ["StackedDiscreteBar"] diff --git a/etl/steps/export/multidim/covid/latest/covid.vax_breakdowns.yml b/etl/steps/export/multidim/covid/latest/covid.vax_breakdowns.yml index 45dc0bebb47..6c7458f0ea2 100644 --- a/etl/steps/export/multidim/covid/latest/covid.vax_breakdowns.yml +++ b/etl/steps/export/multidim/covid/latest/covid.vax_breakdowns.yml @@ -55,7 +55,6 @@ views: - "{definitions.table}#total_vaccinations__vaccine_skycovione" - "{definitions.table}#total_vaccinations__vaccine_valneva" config: - type: "StackedArea" + chartTypes: ["StackedArea"] selectedEntityNames: - European Union (27) - diff --git a/etl/steps/export/multidim/energy/latest/energy.yml b/etl/steps/export/multidim/energy/latest/energy.yml index 09829d64df9..cd8342fa9ef 100644 --- a/etl/steps/export/multidim/energy/latest/energy.yml +++ b/etl/steps/export/multidim/energy/latest/energy.yml @@ -89,7 +89,7 @@ views: - "grapher/energy/2024-06-20/energy_mix/energy_mix#wind__twh__equivalent" config: $schema: https://files.ourworldindata.org/schemas/grapher-schema.005.json - type: StackedArea + chartTypes: ["StackedArea"] tab: chart title: Total consumed energy by source subtitle: "[Primary energy](#dod:primaryenergy) consumption is measured in [terawatt-hours](#dod:watt-hours), using the [substitution method](#dod:substitutionmethod)." @@ -107,7 +107,7 @@ views: - "grapher/energy/2024-06-20/energy_mix/energy_mix#wind_per_capita__kwh__equivalent" config: $schema: https://files.ourworldindata.org/schemas/grapher-schema.005.json - type: StackedArea + chartTypes: ["StackedArea"] - dimensions: source: all metric: total diff --git a/etl/steps/export/multidim/energy/latest/energy_prices.py b/etl/steps/export/multidim/energy/latest/energy_prices.py new file mode 100644 index 00000000000..5a9a6e59e08 --- /dev/null +++ b/etl/steps/export/multidim/energy/latest/energy_prices.py @@ -0,0 +1,47 @@ +from etl import multidim +from etl.db import get_engine +from etl.helpers import PathFinder + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load Eurostat data on gas and electricity prices. + ds_grapher = paths.load_dataset("energy_prices") + + # Read tables of annual and monthly prices. + tb_annual = ds_grapher.read("energy_prices_annual") + tb_monthly = ds_grapher.read("energy_prices_monthly") + + # + # Process data. + # + # Load configuration from adjacent yaml file. + config = paths.load_mdim_config() + + # Create views. + config["views"] = multidim.generate_views_for_dimensions( + dimensions=config["dimensions"], + tables=[tb_annual, tb_monthly], + dimensions_order_in_slug=("frequency", "source", "consumer", "price_component", "unit"), + warn_on_missing_combinations=False, + additional_config={ + "$schema": "https://files.ourworldindata.org/schemas/grapher-schema.005.json", + "chartTypes": ["LineChart"], + "hasMapTab": True, + "tab": "map", + "map": { + "projection": "Europe", + "colorScale": {"baseColorScheme": "YlOrBr"}, + }, + }, + ) + + # + # Save outputs.
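# (Editorial sketch, not part of the diff.) Judging by its name and the
# `warn_on_missing_combinations` parameter, generate_views_for_dimensions appears to
# enumerate combinations of the dimension choices declared in the adjacent yaml, which is
# why the yaml's `views` list can be left empty. A hypothetical generated view, with the
# indicator slug assembled following dimensions_order_in_slug, might look like:
#   {"dimensions": {"frequency": "annual", "source": "electricity", "consumer": "household",
#                   "price_component": "total_price_including_taxes", "unit": "euro"},
#    "indicators": {"y": "grapher/.../energy_prices_annual#annual_electricity_household_total_price_including_taxes_euro"},
#    "config": {...the additional_config above...}}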
+ # + multidim.upsert_multidim_data_page(slug="mdd-energy-prices", config=config, engine=get_engine()) diff --git a/etl/steps/export/multidim/energy/latest/energy_prices.yml b/etl/steps/export/multidim/energy/latest/energy_prices.yml new file mode 100644 index 00000000000..f2640249e48 --- /dev/null +++ b/etl/steps/export/multidim/energy/latest/energy_prices.yml @@ -0,0 +1,102 @@ +title: + title: "Energy prices" + titleVariant: "by energy source" +defaultSelection: + - "European Union (27)" +topicTags: + - "Energy" +dimensions: + - slug: "frequency" + name: "Frequency" + choices: + - slug: "annual" + name: "Annual" + description: "Annual data" + - slug: "monthly" + name: "Monthly" + description: "Monthly data" + - slug: "source" + name: "Energy source" + choices: + - slug: "electricity" + name: "Electricity" + - slug: "gas" + name: "Gas" + - slug: "price_component" + name: "Price component" + choices: + - slug: "total_price_including_taxes" + name: "Total consumer price" + description: "Total consumer price including all taxes and levies" + group: Overview + - slug: "wholesale" + name: "Wholesale price" + description: "Wholesale price" + group: Overview + - slug: "energy_and_supply" + name: "Energy and supply" + description: "Energy and supply" + group: Individual price components + - slug: "network_costs" + name: "Network costs" + description: "Network costs" + group: Individual price components + - slug: "capacity_taxes" + name: "Capacity taxes" + description: "Capacity taxes" + group: Individual price components + - slug: "value_added_tax_vat" + name: "Value added tax (VAT)" + description: "Value added tax (VAT)" + group: Individual price components + - slug: "environmental_taxes" + name: "Environmental taxes" + description: "Environmental taxes" + group: Individual price components + - slug: "nuclear_taxes" + name: "Nuclear taxes" + description: "Nuclear taxes" + group: Individual price components + - slug: "renewable_taxes" + name: "Renewable taxes" + description: "Renewable taxes" + group: Individual price components + - slug: "taxes_fees_levies_and_charges" + name: "All taxes, fees, levies and charges" + description: "All taxes, fees, levies and charges" + group: Individual price components + - slug: "other" + name: "Other costs" + description: "Other costs" + group: Individual price components + # Other available price components: + # 'capacity_taxes_allowances', + # 'environmental_taxes_allowance', + # 'nuclear_taxes_allowance', + # 'renewable_taxes_allowance', + # 'taxes_fees_levies_and_charges_allowance', + # 'other_allowance', + - slug: "consumer" + name: "Consumer type" + choices: + - slug: "household" + name: "Households" + description: "Household consumers" + - slug: "non_household" + name: "Non-households" + description: "Non-household consumers" + - slug: "all" + name: "All consumers" + description: "All consumers" + - slug: "unit" + name: "Unit" + choices: + - slug: "euro" + name: "Euro" + description: "Price in euros" + - slug: "pps" + name: "PPS" + description: "Price in Purchasing Power Standard" +views: + # Views will be filled out programmatically. + [] diff --git a/etl/steps/export/s3/co2_data/latest/owid_co2.py b/etl/steps/export/s3/co2_data/latest/owid_co2.py new file mode 100644 index 00000000000..65973768684 --- /dev/null +++ b/etl/steps/export/s3/co2_data/latest/owid_co2.py @@ -0,0 +1,132 @@ +"""Export step that uploads the OWID CO2 dataset (combined in a garden step from various datasets related to greenhouse gas emissions) to S3.
+ +The combined datasets are: +* Global Carbon Budget - Global Carbon Project. +* National contributions to climate change - Jones et al. +* Greenhouse gas emissions by sector - Climate Watch. +* Primary energy consumption - EI & EIA. + +Additionally, OWID's regions dataset, population dataset, and the Maddison Project Database (Bolt and van Zanden, 2023) on +GDP are included. + +Outputs: +* The data is uploaded to S3 in three different formats and made publicly available at: + * https://nyc3.digitaloceanspaces.com/owid-public/data/co2/owid-co2-data.csv + * https://nyc3.digitaloceanspaces.com/owid-public/data/co2/owid-co2-data.xlsx + * https://nyc3.digitaloceanspaces.com/owid-public/data/co2/owid-co2-data.json + +""" +import json +import tempfile +from pathlib import Path + +import pandas as pd +from owid.catalog import Table +from owid.datautils.s3 import S3 +from structlog import get_logger +from tqdm.auto import tqdm + +from etl.helpers import PathFinder + +# Initialize logger. +log = get_logger() + +# Define S3 base URL. +S3_URL = "https://nyc3.digitaloceanspaces.com" +# Profile name to use for S3 client (as defined in .aws/config). +S3_PROFILE_NAME = "default" +# S3 bucket name and folder where dataset files will be stored. +S3_BUCKET_NAME = "owid-public" +S3_DATA_DIR = Path("data/co2") + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def save_data_to_json(tb: Table, output_path: str) -> None: + tb = tb.copy() + + # Initialize output dictionary, which contains one item per country in the data. + output_dict = {} + + # Each country contains a dictionary, which contains: + # * "iso_code", which is the ISO code (as a string), if it exists. + # * "data", which is a list of dictionaries, one per year. + # Each dictionary contains "year" as the first item, followed by all other non-nan indicator values for that year. + for country in sorted(set(tb["country"])): + # Initialize output dictionary for current country. + output_dict[country] = {} + + # If there is an ISO code for this country, add it as a new item of the dictionary. + iso_code = tb[tb["country"] == country].iloc[0]["iso_code"] + if not pd.isna(iso_code): + output_dict[country]["iso_code"] = iso_code + + # Create the data dictionary for this country. + dict_country = tb[tb["country"] == country].drop(columns=["country", "iso_code"]).to_dict(orient="records") + # Remove all nans. + data_country = [ + {indicator: value for indicator, value in d_year.items() if not pd.isna(value)} for d_year in dict_country + ] + output_dict[country]["data"] = data_country + + # Write dictionary to file as a big json object. + with open(output_path, "w") as file: + file.write(json.dumps(output_dict, indent=4)) + + +def prepare_and_save_outputs(tb: Table, codebook: Table, temp_dir_path: Path) -> None: + # Create a csv file. + log.info("Creating csv file.") + pd.DataFrame(tb).to_csv(temp_dir_path / "owid-co2-data.csv", index=False, float_format="%.3f") + + # Create a json file. + log.info("Creating json file.") + save_data_to_json(tb, temp_dir_path / "owid-co2-data.json") + + # Create an excel file. + log.info("Creating excel file.") + with pd.ExcelWriter(temp_dir_path / "owid-co2-data.xlsx") as writer: + tb.to_excel(writer, sheet_name="Data", index=False, float_format="%.3f") + codebook.to_excel(writer, sheet_name="Metadata") + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load the owid_co2 emissions dataset from garden, and read its main table and codebook.
+ ds_gcp = paths.load_dataset("owid_co2") + tb = ds_gcp.read("owid_co2") + codebook = ds_gcp.read("owid_co2_codebook") + + # + # Save outputs. + # + # Create a temporary directory for all files to be committed. + with tempfile.TemporaryDirectory() as temp_dir: + ################################################################################################################ + # TODO: Create new public files and update the way we write to them. + log.warning( + "This implementation currently does not work. We should create an R2 public bucket and update the way we write to it. For now, manually update files in Digital Ocean using the web interface." + ) + ################################################################################################################ + + temp_dir_path = Path(temp_dir) + + prepare_and_save_outputs(tb, codebook=codebook, temp_dir_path=temp_dir_path) + + # Initialize S3 client. + s3 = S3(profile_name=S3_PROFILE_NAME) + for file_name in tqdm(["owid-co2-data.csv", "owid-co2-data.xlsx", "owid-co2-data.json"]): + # Path to local file. + local_file = temp_dir_path / file_name + # Path (within bucket) to S3 file. + s3_file = Path("data/co2") / file_name + tqdm.write(f"Uploading file {local_file} to S3 bucket {S3_BUCKET_NAME} as {s3_file}.") + # Upload each of the files and make it public. + s3.upload_to_s3( + local_path=str(local_file), + s3_path=f"s3://{S3_BUCKET_NAME}/{str(s3_file)}", + public=True, + ) diff --git a/etl/steps/open_numbers.py b/etl/steps/open_numbers.py index 607bda3a3de..f490f406044 100644 --- a/etl/steps/open_numbers.py +++ b/etl/steps/open_numbers.py @@ -13,7 +13,7 @@ import hashlib import tempfile import warnings -from multiprocessing import Pool +from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import Dict, List, Tuple, cast @@ -58,9 +58,9 @@ def run(dest_dir: str) -> None: resource_map = remap_names(package.resources) # copy tables one by one - with Pool() as pool: + with ThreadPoolExecutor() as executor: args = [(ds, repo, short_name, resources) for short_name, resources in resource_map.items()] - pool.starmap(add_resource, args) + executor.map(lambda p: add_resource(*p), args) def add_resource( diff --git a/etl/version_tracker.py b/etl/version_tracker.py index 57f73541748..100af0e0ac2 100644 --- a/etl/version_tracker.py +++ b/etl/version_tracker.py @@ -446,53 +446,41 @@ def get_path_to_script(self, step: str, omit_base_dir: bool = False) -> Optional """Get the path to the script of a given step.""" # Get step attributes. _, step_type, _, channel, namespace, version, name, _ = extract_step_attributes(step=step).values() - state = "active" if step in self.all_active_steps else "archive" # Create a dictionary that contains the path to a script for a given step. # This dictionary has two keys, namely "active" and "archive". # Active steps should have a script in the active directory. # But steps that are in the archive dag can be either in the active or the archive directory.
- path_to_script = {"active": None, "archive": None} + path_to_script = None if step_type == "export": - path_to_script["active"] = paths.STEP_DIR / "export" / channel / namespace / version / name # type: ignore + path_to_script = paths.STEP_DIR / "export" / channel / namespace / version / name # type: ignore elif channel == "snapshot": - path_to_script["active"] = paths.SNAPSHOTS_DIR / namespace / version / name # type: ignore - path_to_script["archive"] = paths.SNAPSHOTS_DIR_ARCHIVE / namespace / version / name # type: ignore + path_to_script = paths.SNAPSHOTS_DIR / namespace / version / name # type: ignore elif channel in ["meadow", "garden", "grapher", "explorers", "open_numbers", "examples", "external"]: - path_to_script["active"] = paths.STEP_DIR / "data" / channel / namespace / version / name # type: ignore - path_to_script["archive"] = paths.STEP_DIR_ARCHIVE / channel / namespace / version / name # type: ignore + path_to_script = paths.STEP_DIR / "data" / channel / namespace / version / name # type: ignore elif channel == "walden": - path_to_script["active"] = paths.BASE_DIR / "lib" / "walden" / "ingests" / namespace / version / name # type: ignore - path_to_script["archive"] = paths.BASE_DIR / "lib" / "walden" / "ingests" / namespace / version / name # type: ignore + path_to_script = paths.BASE_DIR / "lib" / "walden" / "ingests" / namespace / version / name # type: ignore elif channel in ["backport", "etag"]: # Ignore these channels, for which there is never a script. return None else: log.error(f"Unknown channel {channel} for step {step}.") - if state == "active": - # Steps in the active dag should only have a script in the active directory. - del path_to_script["archive"] - path_to_script_detected = None - for state in path_to_script: - # A step script can exist either as a .py file, as a .ipynb file, or a __init__.py file inside a folder. - # In the case of snapshots, there may or may not be a .py file, but there definitely needs to be a dvc file. - # In that case, the corresponding script is not trivial to find, but at least we can return the dvc file. - for path_to_script_candidate in [ - path_to_script[state].with_suffix(".py"), # type: ignore - path_to_script[state].with_suffix(".ipynb"), # type: ignore - path_to_script[state] / "__init__.py", # type: ignore - path_to_script[state].with_name(path_to_script[state].name + ".dvc"), # type: ignore - ]: - if path_to_script_candidate.exists(): - path_to_script_detected = path_to_script_candidate - break + # A step script can exist either as a .py file, as a .ipynb file, or a __init__.py file inside a folder. + # In the case of snapshots, there may or may not be a .py file, but there definitely needs to be a dvc file. + # In that case, the corresponding script is not trivial to find, but at least we can return the dvc file. 
+ for path_to_script_candidate in [ + path_to_script.with_suffix(".py"), # type: ignore + path_to_script.with_suffix(".ipynb"), # type: ignore + path_to_script / "__init__.py", # type: ignore + path_to_script.with_name(path_to_script.name + ".dvc"), # type: ignore + ]: + if path_to_script_candidate.exists(): + path_to_script_detected = path_to_script_candidate + break if path_to_script_detected is None: - if state == "active": - log.error(f"Script for step {step} not found.") - else: - log.warning(f"Script for archive step {step} not found.") + log.error(f"Script for step {step} not found.") if omit_base_dir and path_to_script_detected is not None: # Return the path relative to the base directory (omitting the local path to the ETL repos). diff --git a/lib/catalog/owid/catalog/datasets.py b/lib/catalog/owid/catalog/datasets.py index 370838de1ed..d24f55d2c5f 100644 --- a/lib/catalog/owid/catalog/datasets.py +++ b/lib/catalog/owid/catalog/datasets.py @@ -11,13 +11,15 @@ from os import environ from os.path import join from pathlib import Path -from typing import Any, Dict, Iterator, List, Literal, Optional, Union +from typing import Any, Dict, Iterator, List, Literal, Optional, Union, cast import numpy as np import pandas as pd import yaml from _hashlib import HASH +from owid.repack import to_safe_types + from . import tables, utils from .meta import SOURCE_EXISTS_OPTIONS, DatasetMeta, TableMeta from .processing_log import disable_processing_log @@ -30,7 +32,7 @@ SUPPORTED_FORMATS: List[FileFormat] = ["feather", "parquet", "csv"] # the formats we generate by default -DEFAULT_FORMATS: List[FileFormat] = ["feather"] +DEFAULT_FORMATS: List[FileFormat] = environ.get("DEFAULT_FORMATS", "feather").split(",") # type: ignore # the format we use by default if we only need one PREFERRED_FORMAT: FileFormat = "feather" @@ -117,7 +119,7 @@ def add( utils.validate_underscore(col, "Variable's name") if not table.primary_key: - if "OWID_STRICT" in environ: + if environ.get("OWID_STRICT"): raise PrimaryKeyMissing( f"Table `{table.metadata.short_name}` does not have a primary_key -- please use t.set_index([col, ...], verify_integrity=True) to indicate dimensions before saving" ) @@ -126,7 +128,7 @@ def add( f"Table `{table.metadata.short_name}` does not have a primary_key -- please use t.set_index([col, ...], verify_integrity=True) to indicate dimensions before saving" ) - if not table.index.is_unique and "OWID_STRICT" in environ: + if not table.index.is_unique and environ.get("OWID_STRICT"): [(k, dups)] = table.index.value_counts().head(1).to_dict().items() raise NonUniqueIndex( f"Table `{table.metadata.short_name}` has duplicate values in the index -- could you have made a mistake?\n\n" @@ -153,12 +155,14 @@ def add( table_filename = join(self.path, table.metadata.checked_name + f".{format}") table.to(table_filename, repack=repack) - def read_table(self, name: str, reset_index: bool = True) -> tables.Table: + def read(self, name: str, reset_index: bool = True, safe_types: bool = True) -> tables.Table: """Read dataset's table from disk. Alternative to ds[table_name], but with more options to optimize the reading. :param reset_index: If true, don't set primary keys of the table. This can make loading large datasets with multi-indexes much faster. + :param safe_types: If true, convert numeric columns to Float64 and Int64 and categorical + columns to string[pyarrow]. This can significantly increase memory usage. 
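(Editorial usage sketch, not part of the diff; the dataset path is illustrative. The renamed `read` replaces `read_table`, as in the monkeypox step above, and now defaults to nullable "safe" dtypes:
    from owid.catalog import Dataset
    ds = Dataset("data/garden/demography/2024-07-15/population")
    tb = ds.read("population")  # numeric columns as Float64/Int64, categoricals as string[pyarrow]
    tb_fast = ds.read("population", safe_types=False)  # keep dtypes as stored on disk)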
""" stem = self.path / Path(name) @@ -166,14 +170,15 @@ def read_table(self, name: str, reset_index: bool = True) -> tables.Table: path = stem.with_suffix(f".{format}") if path.exists(): t = tables.Table.read(path, primary_key=[] if reset_index else None) - # dataset metadata might have been updated, refresh it t.metadata.dataset = self.metadata + if safe_types: + t = cast(tables.Table, to_safe_types(t)) return t raise KeyError(f"Table `{name}` not found, available tables: {', '.join(self.table_names)}") def __getitem__(self, name: str) -> tables.Table: - return self.read_table(name, reset_index=False) + return self.read(name, reset_index=False, safe_types=False) def __contains__(self, name: str) -> bool: return any((Path(self.path) / name).with_suffix(f".{format}").exists() for format in SUPPORTED_FORMATS) diff --git a/lib/catalog/owid/catalog/meta.py b/lib/catalog/owid/catalog/meta.py index b19ec524788..3913ec1fda6 100644 --- a/lib/catalog/owid/catalog/meta.py +++ b/lib/catalog/owid/catalog/meta.py @@ -10,15 +10,15 @@ import re from dataclasses import dataclass, field, is_dataclass from pathlib import Path -from typing import Any, Dict, List, Literal, NewType, Optional, Type, TypeVar, Union +from typing import Any, Dict, List, Literal, NewType, Optional, TypeVar, Union import mistune import pandas as pd -from dataclasses_json import dataclass_json +from dataclasses_json import DataClassJsonMixin from typing_extensions import Self from .processing_log import ProcessingLog -from .utils import hash_any, pruned_json +from .utils import dataclass_from_dict, hash_any, pruned_json SOURCE_EXISTS_OPTIONS = Literal["fail", "append", "replace"] @@ -30,7 +30,7 @@ T = TypeVar("T") -class MetaBase: +class MetaBase(DataClassJsonMixin): def __hash__(self): """Hash that uniquely identifies an object (without needing frozen dataclass).""" return hash_any(self) @@ -40,12 +40,13 @@ def __eq__(self, other: Self) -> bool: # type: ignore return False return self.__hash__() == other.__hash__() - def to_dict(self) -> Dict[str, Any]: - ... + def to_dict(self, encode_json: bool = False) -> Dict[str, Any]: # type: ignore + return super().to_dict(encode_json=encode_json) @classmethod - def from_dict(cls: Type[T], d: Dict[str, Any]) -> T: - ... 
+ def from_dict(cls, d: Dict[str, Any]) -> T: # type: ignore + # NOTE: this is much faster than using dataclasses_json + return dataclass_from_dict(cls, d) # type: ignore def update(self, **kwargs: Dict[str, Any]) -> None: """Update object with new values.""" @@ -72,7 +73,6 @@ def load(cls, filename: str) -> Self: @pruned_json -@dataclass_json @dataclass(eq=False) class License(MetaBase): name: Optional[str] = None @@ -84,7 +84,6 @@ def __bool__(self): # DEPRECATED: use Origin instead @pruned_json -@dataclass_json @dataclass(eq=False) class Source(MetaBase): """Notes on importing sources to grapher: @@ -109,7 +108,6 @@ class Source(MetaBase): @pruned_json -@dataclass_json @dataclass(eq=False) class Origin(MetaBase): # Producer name @@ -167,7 +165,6 @@ def __post_init__(self): @pruned_json -@dataclass_json @dataclass(eq=False) class FaqLink(MetaBase): gdoc_id: str @@ -178,7 +175,6 @@ class FaqLink(MetaBase): @pruned_json -@dataclass_json @dataclass(eq=False) class VariablePresentationMeta(MetaBase): # Any fields of grapher config can be set here - title and subtitle *should* be set whenever possible @@ -201,7 +197,6 @@ class VariablePresentationMeta(MetaBase): @pruned_json -@dataclass_json @dataclass(eq=False) class VariableMeta(MetaBase): """Allowed fields for `display` attribute used for grapher: @@ -283,7 +278,6 @@ def _repr_html_(self): @pruned_json -@dataclass_json @dataclass(eq=False) class DatasetMeta(MetaBase): """ @@ -385,7 +379,6 @@ def uri(self) -> str: @pruned_json -@dataclass_json @dataclass(eq=False) class TableMeta(MetaBase): # data about this table diff --git a/lib/catalog/owid/catalog/processing_log.py b/lib/catalog/owid/catalog/processing_log.py index 3db0e73097c..389c4fe7e0e 100644 --- a/lib/catalog/owid/catalog/processing_log.py +++ b/lib/catalog/owid/catalog/processing_log.py @@ -6,7 +6,7 @@ from dataclasses import dataclass, field from functools import wraps from pathlib import Path -from typing import Any, Dict, List, Literal, Optional, Tuple +from typing import Any, Dict, List, Literal, Optional, Tuple, Union from dataclasses_json import dataclass_json @@ -68,6 +68,11 @@ class ProcessingLog(List[LogEntry]): # hack for dataclasses_json __args__ = (LogEntry,) + def __init__(self, entries: List[Union[LogEntry, Dict[str, Any]]] = []): + # Accept both LogEntry and dict when initializing, i.e. ProcessingLog([{"variable": "foo", ...}]) returns a list + # of LogEntry objects. 
+ super().__init__([entry if isinstance(entry, LogEntry) else LogEntry.from_dict(entry) for entry in entries]) + # NOTE: calling this method `as_dict` is intentional, otherwise it gets called # by dataclass_json def as_dict(self) -> List[Dict[str, Any]]: @@ -331,7 +336,7 @@ def _add_upstream_channels(data_dir: Path, pl: ProcessingLog) -> ProcessingLog: from owid.catalog import Dataset # reverse processing log to traverse backwards - pl = ProcessingLog(pl[::-1]) + pl = ProcessingLog(pl[::-1]) # type: ignore new_pl = [] seen_parents_variables = set() diff --git a/lib/catalog/owid/catalog/s3_utils.py b/lib/catalog/owid/catalog/s3_utils.py index 9ce690591d2..564306ed695 100644 --- a/lib/catalog/owid/catalog/s3_utils.py +++ b/lib/catalog/owid/catalog/s3_utils.py @@ -74,11 +74,15 @@ def download(s3_url: str, filename: str, quiet: bool = False, client: Optional[B def download_s3_folder( s3_folder: str, local_dir: Path, - ignore: Optional[str] = None, + exclude: List[str] = [], + include: List[str] = [], client: Optional[BaseClient] = None, max_workers: int = 20, + delete: bool = False, ) -> None: """Download all files in the given S3 folder to the local directory.""" + assert s3_folder.endswith("/"), "s3_folder must end with a slash" + client = client or connect_r2() bucket, _ = s3_bucket_key(s3_folder) @@ -88,8 +92,11 @@ def download_s3_folder( s3_keys = list_s3_objects(s3_folder, client=client) - if ignore: - s3_keys = [key for key in s3_keys if ignore not in key] + if exclude: + s3_keys = [key for key in s3_keys if not any(pattern in key for pattern in exclude)] + + if include: + s3_keys = [key for key in s3_keys if any(pattern in key for pattern in include)] with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: futures = [] @@ -103,6 +110,13 @@ def download_s3_folder( concurrent.futures.wait(futures) + if delete: + local_files = set(local_dir.glob("*")) + downloaded_files = set(local_dir / Path(s3_key).name for s3_key in s3_keys) + files_to_delete = local_files - downloaded_files + for file in files_to_delete: + file.unlink() + def upload(s3_url: str, filename: str, public: bool = False, quiet: bool = False) -> None: """Upload the file at the given local filename to the S3 URL.""" diff --git a/lib/catalog/owid/catalog/tables.py b/lib/catalog/owid/catalog/tables.py index 125db287028..88fa0e1f9e2 100644 --- a/lib/catalog/owid/catalog/tables.py +++ b/lib/catalog/owid/catalog/tables.py @@ -320,9 +320,9 @@ def read_csv(cls, path: Union[str, Path], **kwargs) -> "Table": raise ValueError(f'filename must end in ".csv": {path}') # load the data and add metadata - df = Table(pd.read_csv(path, index_col=False, na_values=[""], keep_default_na=False)) - cls._add_metadata(df, path, **kwargs) - return df + tb = Table(pd.read_csv(path, index_col=False, na_values=[""], keep_default_na=False)) + cls._add_metadata(tb, path, **kwargs) + return tb def update_metadata(self, **kwargs) -> "Table": """Set Table metadata.""" @@ -332,7 +332,7 @@ def update_metadata(self, **kwargs) -> "Table": return self @classmethod - def _add_metadata(cls, df: pd.DataFrame, path: str, primary_key: Optional[list[str]] = None) -> None: + def _add_metadata(cls, tb: "Table", path: str, primary_key: Optional[list[str]] = None) -> None: """Read metadata from JSON sidecar and add it to the dataframe.""" metadata = cls._read_metadata(path) @@ -340,12 +340,12 @@ def _add_metadata(cls, df: pd.DataFrame, path: str, primary_key: Optional[list[s primary_key = metadata.get("primary_key", []) fields = 
metadata.pop("fields") if "fields" in metadata else {} - df.metadata = TableMeta.from_dict(metadata) - df._set_fields_from_dict(fields) + tb.metadata = TableMeta.from_dict(metadata) + tb._set_fields_from_dict(fields) # NOTE: setting index is really slow for large datasets if primary_key: - df.set_index(primary_key, inplace=True) + tb.set_index(primary_key, inplace=True) @classmethod def read_feather(cls, path: Union[str, Path], **kwargs) -> "Table": diff --git a/lib/catalog/owid/catalog/utils.py b/lib/catalog/owid/catalog/utils.py index 23b093ee5a7..bd0fe80dc8a 100644 --- a/lib/catalog/owid/catalog/utils.py +++ b/lib/catalog/owid/catalog/utils.py @@ -1,17 +1,21 @@ +import dataclasses import datetime as dt import hashlib import re from dataclasses import fields, is_dataclass from pathlib import Path -from typing import Any, Optional, TypeVar, Union, overload +from typing import Any, Dict, Optional, Type, TypeVar, Union, get_args, get_origin, overload import dynamic_yaml import pytz +import structlog import yaml from unidecode import unidecode T = TypeVar("T") +log = structlog.get_logger() + def prune_dict(d: dict) -> dict: """Remove all keys starting with underscore and all empty values from a dictionary.""" @@ -268,3 +272,60 @@ def hash_any(x: Any) -> int: else: # Fallback for other types: use the built-in hash() function return hash(x) + + +def dataclass_from_dict(cls: Optional[Type[T]], d: Dict[str, Any]) -> T: + """Recursively create an instance of a dataclass from a dictionary. We've implemented custom + method because original dataclasses_json.from_dict was too slow (this gives us more than 2x + speedup). See https://github.com/owid/etl/pull/3517#issuecomment-2468084380 for more details. + """ + if d is None or not dataclasses.is_dataclass(cls) or not isinstance(d, dict): + return d # type: ignore + + field_types = {f.name: f.type for f in dataclasses.fields(cls)} + + init_args = {} + for field_name, v in d.items(): + # Skip values in a dictionary that are not in the dataclass + if field_name not in field_types: + continue + + # Handle None values right away + if v is None: + init_args[field_name] = None + continue + + field_type = field_types[field_name] + origin = get_origin(field_type) + args = get_args(field_type) + + # unwrap Optional (e.g. 
Optional[License] -> License) + if type(None) in args: + filtered_args = tuple(a for a in args if a is not type(None)) + if len(filtered_args) == 1: + field_type = filtered_args[0] + + if origin is list: + item_type = args[0] + init_args[field_name] = [dataclass_from_dict(item_type, item) for item in v] + elif origin is dict: + key_type, value_type = args + init_args[field_name] = {k: dataclass_from_dict(value_type, item) for k, item in v.items()} + elif dataclasses.is_dataclass(field_type): + init_args[field_name] = dataclass_from_dict(field_type, v) # type: ignore + elif isinstance(field_type, type) and field_type not in (Any,): + try: + init_args[field_name] = field_type(v) + except ValueError as e: + log.error( + "conversion.failed", + field_name=field_name, + field_type=field_type, + path=f"{d.get('channel')}/{d.get('namespace')}/{d.get('version')}/{d.get('short_name')}", + error=str(e), + ) + continue + else: + init_args[field_name] = v + + return cls(**init_args) diff --git a/lib/catalog/owid/catalog/variables.py b/lib/catalog/owid/catalog/variables.py index d32a7a3d318..f6e529b567d 100644 --- a/lib/catalog/owid/catalog/variables.py +++ b/lib/catalog/owid/catalog/variables.py @@ -195,6 +195,12 @@ def __imul__(self, other: Union[Scalar, Series, "Variable"]) -> "Variable": # t return self.__mul__(other) def __truediv__(self, other: Union[Scalar, Series, "Variable"]) -> "Variable": # type: ignore + if is_nullable_series(self) or is_nullable_series(other): + # 0/0 should return pd.NA, not np.nan + zero_div_zero = (other == 0) & (self == 0) + if zero_div_zero.any(): + other = other.replace({0: pd.NA}) # type: ignore + variable_name = self.name or UNNAMED_VARIABLE variable = Variable(super().__truediv__(other), name=variable_name) variable.metadata = combine_variables_metadata(variables=[self, other], operation="/", name=variable_name) @@ -585,3 +591,24 @@ def copy_metadata(from_variable: Variable, to_variable: Variable, inplace: bool new_variable = to_variable.copy() new_variable.metadata = from_variable.metadata.copy() return new_variable + + +def is_nullable_series(s: Any) -> bool: + """Check if a series has a nullable pandas dtype.""" + if not hasattr(s, "dtype"): + return False + + nullable_types = { + "Int8", + "Int16", + "Int32", + "Int64", + "UInt8", + "UInt16", + "UInt32", + "UInt64", + "Float32", + "Float64", + "boolean", + } + return str(s.dtype) in nullable_types diff --git a/lib/catalog/pyproject.toml b/lib/catalog/pyproject.toml index 5147db6a0f5..415ece029fe 100644 --- a/lib/catalog/pyproject.toml +++ b/lib/catalog/pyproject.toml @@ -17,12 +17,12 @@ dependencies = [ "Unidecode>=1.3.4", "PyYAML>=6.0.1", "structlog>=21.5.0", - "owid-repack>=0.1.1", "dynamic-yaml>=1.3.5", "mistune>=3.0.1", - "dataclasses-json==0.5.8", + "dataclasses-json>=0.6.7", "rdata==0.9", "owid-datautils", + "owid-repack", ] [tool.uv] @@ -38,6 +38,7 @@ dev-dependencies = [ [tool.uv.sources] owid-datautils = { path = "../datautils", editable = true } +owid-repack = { path = "../repack", editable = true } [tool.ruff] extend = "../../pyproject.toml" diff --git a/lib/catalog/tests/test_variables.py b/lib/catalog/tests/test_variables.py index 2b5b3dd30d6..919e1b2fd81 100644 --- a/lib/catalog/tests/test_variables.py +++ b/lib/catalog/tests/test_variables.py @@ -599,3 +599,12 @@ def test_variable_rolling(variable_1: Variable): # make sure we are not modifying the original table rolling.m.title = "new" assert v.m.title != "new" + + +def test_truediv_zero_division() -> None: + v1 = Variable([0, 1, 2], name="v1") + 
v2 = Variable([0, 1, 2], name="v2") + result = v1 / v2 + assert result.isnull()[0] # Check that 0/0 results in pandas NaN + assert not result.isnull()[1] # Check that 1/1 does not result in NaN + assert not result.isnull()[2] # Check that 2/2 does not result in NaN diff --git a/lib/catalog/uv.lock b/lib/catalog/uv.lock index 0f204bc09c9..2d2a84f22b5 100644 --- a/lib/catalog/uv.lock +++ b/lib/catalog/uv.lock @@ -367,16 +367,15 @@ wheels = [ [[package]] name = "dataclasses-json" -version = "0.5.8" +version = "0.6.7" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "marshmallow" }, - { name = "marshmallow-enum" }, { name = "typing-inspect" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ed/63/10cdc14908f3c9fdb9c21631275d2805853f6156b761a391e6f8918377e1/dataclasses-json-0.5.8.tar.gz", hash = "sha256:6572ac08ad9340abcb74fd8c4c8e9752db2a182a402c8e871d0a8aa119e3804e", size = 44113 } +sdist = { url = "https://files.pythonhosted.org/packages/64/a4/f71d9cf3a5ac257c993b5ca3f93df5f7fb395c725e7f1e6479d2514173c3/dataclasses_json-0.6.7.tar.gz", hash = "sha256:b6b3e528266ea45b9535223bc53ca645f5208833c29229e847b3f26a1cc55fc0", size = 32227 } wheels = [ - { url = "https://files.pythonhosted.org/packages/ae/bc/892e03650133583d5babbb7c7e5c1be6b68df86829c91fdc30cca996630d/dataclasses_json-0.5.8-py3-none-any.whl", hash = "sha256:65b167c15fdf9bde27569c09ac18dd39bf1cc5b7998525024cb4678d2653946c", size = 26211 }, + { url = "https://files.pythonhosted.org/packages/c3/be/d0d44e092656fe7a06b55e6103cbce807cdbdee17884a5367c68c9860853/dataclasses_json-0.6.7-py3-none-any.whl", hash = "sha256:0dbf33f26c8d5305befd61b39d2b3414e8a407bedc2834dea9b8d642666fb40a", size = 28686 }, ] [[package]] @@ -652,18 +651,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3c/78/c1de55eb3311f2c200a8b91724414b8d6f5ae78891c15d9d936ea43c3dba/marshmallow-3.22.0-py3-none-any.whl", hash = "sha256:71a2dce49ef901c3f97ed296ae5051135fd3febd2bf43afe0ae9a82143a494d9", size = 49334 }, ] -[[package]] -name = "marshmallow-enum" -version = "1.5.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "marshmallow" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/8e/8c/ceecdce57dfd37913143087fffd15f38562a94f0d22823e3c66eac0dca31/marshmallow-enum-1.5.1.tar.gz", hash = "sha256:38e697e11f45a8e64b4a1e664000897c659b60aa57bfa18d44e226a9920b6e58", size = 4013 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c6/59/ef3a3dc499be447098d4a89399beb869f813fee1b5a57d5d79dee2c1bf51/marshmallow_enum-1.5.1-py2.py3-none-any.whl", hash = "sha256:57161ab3dbfde4f57adeb12090f39592e992b9c86d206d02f6bd03ebec60f072", size = 4186 }, -] - [[package]] name = "matplotlib-inline" version = "0.1.7" @@ -798,14 +785,14 @@ dev = [ [package.metadata] requires-dist = [ { name = "boto3", specifier = ">=1.21.13" }, - { name = "dataclasses-json", specifier = "==0.5.8" }, + { name = "dataclasses-json", specifier = ">=0.6.7" }, { name = "dynamic-yaml", specifier = ">=1.3.5" }, { name = "ipdb", specifier = ">=0.13.9" }, { name = "jsonschema", specifier = ">=3.2.0" }, { name = "mistune", specifier = ">=3.0.1" }, { name = "owid-datautils", editable = "../datautils" }, - { name = "owid-repack", specifier = ">=0.1.1" }, - { name = "pandas", specifier = "==2.2.1" }, + { name = "owid-repack", editable = "../repack" }, + { name = "pandas", specifier = ">=2.2.1" }, { name = "pyarrow", specifier = ">=10.0.1" }, { name = "pyyaml", specifier = ">=6.0.1" }, { name = "rdata", specifier = "==0.9" }, @@ 
-849,7 +836,7 @@ requires-dist = [ { name = "colorama", specifier = ">=0.4.4" }, { name = "gdown", specifier = ">=4.5.2" }, { name = "gsheets", specifier = ">=0.6.1" }, - { name = "pandas", specifier = "==2.2.1" }, + { name = "pandas", specifier = ">=2.2.1" }, { name = "pyarrow", specifier = ">=10.0.1" }, { name = "pydrive2", specifier = ">=1.15.0" }, { name = "structlog", specifier = ">=21.5.0" }, @@ -875,15 +862,27 @@ dev = [ [[package]] name = "owid-repack" -version = "0.1.3" -source = { registry = "https://pypi.org/simple" } +version = "0.1.4" +source = { editable = "../repack" } dependencies = [ { name = "numpy" }, { name = "pandas" }, + { name = "pyarrow" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/50/5f/6b750fd47f0ac9074fb146c4a9dec6efc483eb73d6b26b9245f25059737e/owid_repack-0.1.3.tar.gz", hash = "sha256:b65075af87c63945795801a2d7fd744f3d9a47ce7faa20736f389051655bff4a", size = 4124 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/de/a9/8388e07e9a6e4da1188e2df3f85849808a44d8fd6765510c9d8a3a1d54a5/owid_repack-0.1.3-py3-none-any.whl", hash = "sha256:c1a5e58964e4d83db6377f286cfc1c766aa60ae58d9a37003bdcda0194957b26", size = 4532 }, + +[package.metadata] +requires-dist = [ + { name = "numpy", specifier = ">=1.24.0" }, + { name = "pandas", specifier = ">=2.2.3" }, + { name = "pyarrow", specifier = ">=10.0.1,<18.0.0" }, +] + +[package.metadata.requires-dev] +dev = [ + { name = "ipdb", specifier = ">=0.13.13" }, + { name = "pyright", specifier = "==1.1.373" }, + { name = "pytest", specifier = ">=7.2.0" }, + { name = "ruff", specifier = "==0.1.6" }, ] [[package]] @@ -897,7 +896,7 @@ wheels = [ [[package]] name = "pandas" -version = "2.2.1" +version = "2.2.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy" }, @@ -905,36 +904,49 @@ dependencies = [ { name = "pytz" }, { name = "tzdata" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/3d/59/2afa81b9fb300c90531803c0fd43ff4548074fa3e8d0f747ef63b3b5e77a/pandas-2.2.1.tar.gz", hash = "sha256:0ab90f87093c13f3e8fa45b48ba9f39181046e8f3317d3aadb2fffbb1b978572", size = 4395256 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/88/39/f4495f8ab5a58b1eeee06b5abd811e0a93f7b75acdc89380797f99bdf91a/pandas-2.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8df8612be9cd1c7797c93e1c5df861b2ddda0b48b08f2c3eaa0702cf88fb5f88", size = 12543124 }, - { url = "https://files.pythonhosted.org/packages/4f/19/0ae5f1557badfcae1052c1397041a2c5441e9f31e1c7b0cce7f8bc585f4e/pandas-2.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0f573ab277252ed9aaf38240f3b54cfc90fff8e5cab70411ee1d03f5d51f3944", size = 11285572 }, - { url = "https://files.pythonhosted.org/packages/5d/d2/df8047f8c3648eb6b3ee86ef7ee811ad01e55b47a14ea02fe36d601e12cd/pandas-2.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f02a3a6c83df4026e55b63c1f06476c9aa3ed6af3d89b4f04ea656ccdaaaa359", size = 15629656 }, - { url = "https://files.pythonhosted.org/packages/19/df/8d789d96a9e338cf28cb7978fa93ef5da53137624b7ef032f30748421c2b/pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c38ce92cb22a4bea4e3929429aa1067a454dcc9c335799af93ba9be21b6beb51", size = 13024911 }, - { url = "https://files.pythonhosted.org/packages/11/a1/9d5505c6c56740f7ed8bd78c8756fb76aeff1c706b30e6930ddf90693aee/pandas-2.2.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c2ce852e1cf2509a69e98358e8458775f89599566ac3775e70419b98615f4b06", size = 16275635 }, - { url = 
"https://files.pythonhosted.org/packages/d6/99/378e9108cf3562c7c6294249f1bfd3be08325af5e96af435fb221dd1c320/pandas-2.2.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:53680dc9b2519cbf609c62db3ed7c0b499077c7fefda564e330286e619ff0dd9", size = 13880082 }, - { url = "https://files.pythonhosted.org/packages/93/26/2a695303a4a3194014dca7cb5d5ce08f0d2c6baa344fb5f562c642e77b2b/pandas-2.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:94e714a1cca63e4f5939cdce5f29ba8d415d85166be3441165edd427dc9f6bc0", size = 11592785 }, - { url = "https://files.pythonhosted.org/packages/f1/8b/617792ad1feef330e87d7459584a1f91aa8aea373d8b168ac5d24fddd808/pandas-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f821213d48f4ab353d20ebc24e4faf94ba40d76680642fb7ce2ea31a3ad94f9b", size = 12564385 }, - { url = "https://files.pythonhosted.org/packages/a5/78/1d859bfb619c067e3353ed079248ae9532c105c4e018fa9a776d04b34572/pandas-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c70e00c2d894cb230e5c15e4b1e1e6b2b478e09cf27cc593a11ef955b9ecc81a", size = 11303028 }, - { url = "https://files.pythonhosted.org/packages/91/bf/8c57707e440f944ba2cf3d6f6ae6c29883fac20fbe5d2ad485229149f273/pandas-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e97fbb5387c69209f134893abc788a6486dbf2f9e511070ca05eed4b930b1b02", size = 15594865 }, - { url = "https://files.pythonhosted.org/packages/d4/47/1ccf9f62d2674d3ca3e95452c5f9dd114234d1535dec77c96528bf6a31fc/pandas-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:101d0eb9c5361aa0146f500773395a03839a5e6ecde4d4b6ced88b7e5a1a6403", size = 13034628 }, - { url = "https://files.pythonhosted.org/packages/e3/da/9522ba4b32b20a344c37a970d7835d261df1427d943e02d48820253833ee/pandas-2.2.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7d2ed41c319c9fb4fd454fe25372028dfa417aacb9790f68171b2e3f06eae8cd", size = 16243608 }, - { url = "https://files.pythonhosted.org/packages/e0/c3/da6ffa0d3d510c378f6e46496cf7f84f35e15836d0de4e9880f40247eb60/pandas-2.2.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:af5d3c00557d657c8773ef9ee702c61dd13b9d7426794c9dfeb1dc4a0bf0ebc7", size = 13884355 }, - { url = "https://files.pythonhosted.org/packages/61/11/1812ef6cbd7433ad240f72161ce5f84c4c450cede4db080365d371d29117/pandas-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:06cf591dbaefb6da9de8472535b185cba556d0ce2e6ed28e21d919704fef1a9e", size = 11602637 }, - { url = "https://files.pythonhosted.org/packages/ed/b9/660353ce2b1bd5b6e0f5c992836d91909c0da1ccb59c16565ad0a37e839d/pandas-2.2.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:88ecb5c01bb9ca927ebc4098136038519aa5d66b44671861ffab754cae75102c", size = 12493183 }, - { url = "https://files.pythonhosted.org/packages/19/4e/6a7f400d4b65f82e37eefa7dbbe3e6f0a4fa542ca7ebb68c787eeebdc497/pandas-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:04f6ec3baec203c13e3f8b139fb0f9f86cd8c0b94603ae3ae8ce9a422e9f5bee", size = 11335860 }, - { url = "https://files.pythonhosted.org/packages/d7/2b/3e00e92a6b430313da68b15e925c6dba05f672d716cf3b02bcd3d0381974/pandas-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a935a90a76c44fe170d01e90a3594beef9e9a6220021acfb26053d01426f7dc2", size = 15189183 }, - { url = "https://files.pythonhosted.org/packages/78/f4/19f1dda9ab1eaa38301e445925f92b303d415d4c4115e56c0d62774421f7/pandas-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c391f594aae2fd9f679d419e9a4d5ba4bce5bb13f6a989195656e7dc4b95c8f0", 
size = 12742656 }, - { url = "https://files.pythonhosted.org/packages/6f/cd/8b84912b5bfab19b1fcea2f732d2e3a2d134d558f141e9dffa5dbfd9d23b/pandas-2.2.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9d1265545f579edf3f8f0cb6f89f234f5e44ba725a34d86535b1a1d38decbccc", size = 15861331 }, - { url = "https://files.pythonhosted.org/packages/11/e7/65bf50aff86da6554cdffdcd87ced857c79a29dfaf1d85fdf97955d76d02/pandas-2.2.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11940e9e3056576ac3244baef2fedade891977bcc1cb7e5cc8f8cc7d603edc89", size = 13410754 }, - { url = "https://files.pythonhosted.org/packages/71/00/6beaeeba7f075d15ea167a5caa039b861e58ff2f58a5b659abb9b544c8f6/pandas-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:4acf681325ee1c7f950d058b05a820441075b0dd9a2adf5c4835b9bc056bf4fb", size = 11478767 }, - { url = "https://files.pythonhosted.org/packages/1a/f6/621a5a90727c839aafd4a2e40f8fab4645efb534f96454d31a257ce693ed/pandas-2.2.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9bd8a40f47080825af4317d0340c656744f2bfdb6819f818e6ba3cd24c0e1397", size = 12561981 }, - { url = "https://files.pythonhosted.org/packages/bc/57/8c61a6b2f9798349748701938dfed6d645bd329bfd96245ad98245238b6f/pandas-2.2.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:df0c37ebd19e11d089ceba66eba59a168242fc6b7155cba4ffffa6eccdfb8f16", size = 11301393 }, - { url = "https://files.pythonhosted.org/packages/3e/a6/6dbcb4b72687c8df8f3dca5f16b296b4ae5c9fa3084a32a165113d594b71/pandas-2.2.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:739cc70eaf17d57608639e74d63387b0d8594ce02f69e7a0b046f117974b3019", size = 15646609 }, - { url = "https://files.pythonhosted.org/packages/1a/5e/71bb0eef0dc543f7516d9ddeca9ee8dc98207043784e3f7e6c08b4a6b3d9/pandas-2.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9d3558d263073ed95e46f4650becff0c5e1ffe0fc3a015de3c79283dfbdb3df", size = 13040474 }, - { url = "https://files.pythonhosted.org/packages/60/f0/765326197f1759004d07a3e5e060cecfc90fd7af22eadd4cb02ef5e74555/pandas-2.2.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4aa1d8707812a658debf03824016bf5ea0d516afdea29b7dc14cf687bc4d4ec6", size = 16261844 }, - { url = "https://files.pythonhosted.org/packages/5f/96/0f208a3f7bb6f930060c1930fe4d2d24ce491d044a6ace1cb6cc52d3a319/pandas-2.2.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:76f27a809cda87e07f192f001d11adc2b930e93a2b0c4a236fde5429527423be", size = 13914313 }, - { url = "https://files.pythonhosted.org/packages/41/a3/349df1721beb447142b8b11e27875a3da00f85d713f1a4bed0afb3a62e14/pandas-2.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:1ba21b1d5c0e43416218db63037dbe1a01fc101dc6e6024bcad08123e48004ab", size = 11610656 }, +sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/70/c853aec59839bceed032d52010ff5f1b8d87dc3114b762e4ba2727661a3b/pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5", size = 12580827 }, + { url = "https://files.pythonhosted.org/packages/99/f2/c4527768739ffa4469b2b4fff05aa3768a478aed89a2f271a79a40eee984/pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348", size = 11303897 }, + { url = 
"https://files.pythonhosted.org/packages/ed/12/86c1747ea27989d7a4064f806ce2bae2c6d575b950be087837bdfcabacc9/pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed", size = 66480908 }, + { url = "https://files.pythonhosted.org/packages/44/50/7db2cd5e6373ae796f0ddad3675268c8d59fb6076e66f0c339d61cea886b/pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57", size = 13064210 }, + { url = "https://files.pythonhosted.org/packages/61/61/a89015a6d5536cb0d6c3ba02cebed51a95538cf83472975275e28ebf7d0c/pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42", size = 16754292 }, + { url = "https://files.pythonhosted.org/packages/ce/0d/4cc7b69ce37fac07645a94e1d4b0880b15999494372c1523508511b09e40/pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f", size = 14416379 }, + { url = "https://files.pythonhosted.org/packages/31/9e/6ebb433de864a6cd45716af52a4d7a8c3c9aaf3a98368e61db9e69e69a9c/pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645", size = 11598471 }, + { url = "https://files.pythonhosted.org/packages/a8/44/d9502bf0ed197ba9bf1103c9867d5904ddcaf869e52329787fc54ed70cc8/pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039", size = 12602222 }, + { url = "https://files.pythonhosted.org/packages/52/11/9eac327a38834f162b8250aab32a6781339c69afe7574368fffe46387edf/pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd", size = 11321274 }, + { url = "https://files.pythonhosted.org/packages/45/fb/c4beeb084718598ba19aa9f5abbc8aed8b42f90930da861fcb1acdb54c3a/pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698", size = 15579836 }, + { url = "https://files.pythonhosted.org/packages/cd/5f/4dba1d39bb9c38d574a9a22548c540177f78ea47b32f99c0ff2ec499fac5/pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc", size = 13058505 }, + { url = "https://files.pythonhosted.org/packages/b9/57/708135b90391995361636634df1f1130d03ba456e95bcf576fada459115a/pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3", size = 16744420 }, + { url = "https://files.pythonhosted.org/packages/86/4a/03ed6b7ee323cf30404265c284cee9c65c56a212e0a08d9ee06984ba2240/pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32", size = 14440457 }, + { url = "https://files.pythonhosted.org/packages/ed/8c/87ddf1fcb55d11f9f847e3c69bb1c6f8e46e2f40ab1a2d2abadb2401b007/pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5", size = 11617166 }, + { url = "https://files.pythonhosted.org/packages/17/a3/fb2734118db0af37ea7433f57f722c0a56687e14b14690edff0cdb4b7e58/pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9", size = 12529893 }, + { url = "https://files.pythonhosted.org/packages/e1/0c/ad295fd74bfac85358fd579e271cded3ac969de81f62dd0142c426b9da91/pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4", size = 11363475 }, + { url = "https://files.pythonhosted.org/packages/c6/2a/4bba3f03f7d07207481fed47f5b35f556c7441acddc368ec43d6643c5777/pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3", size = 15188645 }, + { url = "https://files.pythonhosted.org/packages/38/f8/d8fddee9ed0d0c0f4a2132c1dfcf0e3e53265055da8df952a53e7eaf178c/pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319", size = 12739445 }, + { url = "https://files.pythonhosted.org/packages/20/e8/45a05d9c39d2cea61ab175dbe6a2de1d05b679e8de2011da4ee190d7e748/pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8", size = 16359235 }, + { url = "https://files.pythonhosted.org/packages/1d/99/617d07a6a5e429ff90c90da64d428516605a1ec7d7bea494235e1c3882de/pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a", size = 14056756 }, + { url = "https://files.pythonhosted.org/packages/29/d4/1244ab8edf173a10fd601f7e13b9566c1b525c4f365d6bee918e68381889/pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13", size = 11504248 }, + { url = "https://files.pythonhosted.org/packages/64/22/3b8f4e0ed70644e85cfdcd57454686b9057c6c38d2f74fe4b8bc2527214a/pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015", size = 12477643 }, + { url = "https://files.pythonhosted.org/packages/e4/93/b3f5d1838500e22c8d793625da672f3eec046b1a99257666c94446969282/pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28", size = 11281573 }, + { url = "https://files.pythonhosted.org/packages/f5/94/6c79b07f0e5aab1dcfa35a75f4817f5c4f677931d4234afcd75f0e6a66ca/pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0", size = 15196085 }, + { url = "https://files.pythonhosted.org/packages/e8/31/aa8da88ca0eadbabd0a639788a6da13bb2ff6edbbb9f29aa786450a30a91/pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24", size = 12711809 }, + { url = "https://files.pythonhosted.org/packages/ee/7c/c6dbdb0cb2a4344cacfb8de1c5808ca885b2e4dcfde8008266608f9372af/pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659", size = 16356316 }, + { url = "https://files.pythonhosted.org/packages/57/b7/8b757e7d92023b832869fa8881a992696a0bfe2e26f72c9ae9f255988d42/pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb", size = 14022055 }, + { url = 
"https://files.pythonhosted.org/packages/3b/bc/4b18e2b8c002572c5a441a64826252ce5da2aa738855747247a971988043/pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d", size = 11481175 }, + { url = "https://files.pythonhosted.org/packages/76/a3/a5d88146815e972d40d19247b2c162e88213ef51c7c25993942c39dbf41d/pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468", size = 12615650 }, + { url = "https://files.pythonhosted.org/packages/9c/8c/f0fd18f6140ddafc0c24122c8a964e48294acc579d47def376fef12bcb4a/pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18", size = 11290177 }, + { url = "https://files.pythonhosted.org/packages/ed/f9/e995754eab9c0f14c6777401f7eece0943840b7a9fc932221c19d1abee9f/pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2", size = 14651526 }, + { url = "https://files.pythonhosted.org/packages/25/b0/98d6ae2e1abac4f35230aa756005e8654649d305df9a28b16b9ae4353bff/pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4", size = 11871013 }, + { url = "https://files.pythonhosted.org/packages/cc/57/0f72a10f9db6a4628744c8e8f0df4e6e21de01212c7c981d31e50ffc8328/pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d", size = 15711620 }, + { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436 }, + { url = "https://files.pythonhosted.org/packages/ca/8c/8848a4c9b8fdf5a534fe2077af948bf53cd713d77ffbcd7bd15710348fd7/pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39", size = 12595535 }, + { url = "https://files.pythonhosted.org/packages/9c/b9/5cead4f63b6d31bdefeb21a679bc5a7f4aaf262ca7e07e2bc1c341b68470/pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30", size = 11319822 }, + { url = "https://files.pythonhosted.org/packages/31/af/89e35619fb573366fa68dc26dad6ad2c08c17b8004aad6d98f1a31ce4bb3/pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c", size = 15625439 }, + { url = "https://files.pythonhosted.org/packages/3d/dd/bed19c2974296661493d7acc4407b1d2db4e2a482197df100f8f965b6225/pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c", size = 13068928 }, + { url = "https://files.pythonhosted.org/packages/31/a3/18508e10a31ea108d746c848b5a05c0711e0278fa0d6f1c52a8ec52b80a5/pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea", size = 16783266 }, + { url = "https://files.pythonhosted.org/packages/c4/a5/3429bd13d82bebc78f4d78c3945efedef63a7cd0c15c17b2eeb838d1121f/pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = 
"sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761", size = 14450871 }, + { url = "https://files.pythonhosted.org/packages/2f/49/5c30646e96c684570925b772eac4eb0a8cb0ca590fa978f56c5d3ae73ea1/pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e", size = 11618011 }, ] [[package]] diff --git a/lib/datautils/owid/datautils/dataframes.py b/lib/datautils/owid/datautils/dataframes.py index 8eac2d987a2..195f6712336 100644 --- a/lib/datautils/owid/datautils/dataframes.py +++ b/lib/datautils/owid/datautils/dataframes.py @@ -108,7 +108,7 @@ def compare( # Compare, column by column, the elements of the two dataframes. compared = pd.DataFrame() for col in columns: - if (df1[col].dtype in (object, "category")) or (df2[col].dtype in (object, "category")): + if (df1[col].dtype in (object, "category", "string")) or (df2[col].dtype in (object, "category", "string")): # Apply a direct comparison for strings or categories compared_row = df1[col].values == df2[col].values else: @@ -461,6 +461,13 @@ def map_series( # Replace those nans with their original values, except if they were actually meant to be mapped to nan. # For example, if {"bad_value": np.nan} was part of the mapping, do not replace those nans back to "bad_value". + # if we are setting values from the original series, ensure we have the same dtype + try: + series_mapped = series_mapped.astype(series.dtype, copy=False) + except ValueError: + # casting NaNs to integer will fail + pass + # Detect values in the mapping that were intended to be mapped to nan. values_mapped_to_nan = [ original_value for original_value, target_value in mapping.items() if pd.isnull(target_value) @@ -632,6 +639,14 @@ def combine_two_overlapping_dataframes( # Give priority to df1 on overlapping values. combined, df2 = df1.align(df2) + new_columns = df2.columns.difference(df1.columns) + for col in new_columns: + try: + combined[col] = combined[col].astype(df2[col].dtype, copy=False) + except ValueError: + # casting NaNs to integer will fail + pass + # Fill missing values in df1 with values from df2. combined = combined.fillna(df2) diff --git a/lib/datautils/owid/datautils/io/archive.py b/lib/datautils/owid/datautils/io/archive.py index e9c51495b6e..f0a4e1ec200 100644 --- a/lib/datautils/owid/datautils/io/archive.py +++ b/lib/datautils/owid/datautils/io/archive.py @@ -3,7 +3,9 @@ import tarfile import zipfile from pathlib import Path -from typing import Union +from typing import Union, cast + +from py7zr import SevenZipFile from owid.datautils.decorators import enable_file_download @@ -28,10 +30,17 @@ def decompress_file( Overwrite decompressed content if it already exists (otherwise raises an error if content already exists). 
""" + if isinstance(input_file, str): + input_file = Path(input_file) + input_file = cast(Path, input_file) + if zipfile.is_zipfile(input_file): _decompress_zip_file(input_file, output_folder, overwrite) elif tarfile.is_tarfile(input_file): _decompress_tar_file(input_file, output_folder, overwrite) + elif input_file.suffix.lower() == ".7z": + with SevenZipFile(input_file, mode="r") as z: + z.extractall(path=output_folder) else: raise ValueError("File is neither a zip nor a tar file.") diff --git a/lib/datautils/pyproject.toml b/lib/datautils/pyproject.toml index a173445e218..09895036ef0 100644 --- a/lib/datautils/pyproject.toml +++ b/lib/datautils/pyproject.toml @@ -23,7 +23,8 @@ dependencies = [ "pydrive2>=1.15.0", "pyarrow>=10.0.1", "urllib3<2", - "click>=8.1.7" + "click>=8.1.7", + "py7zr>=0.22.0", ] [tool.uv] diff --git a/lib/datautils/tests/test_dataframes.py b/lib/datautils/tests/test_dataframes.py index ebb87011d49..d6b81a51c7b 100644 --- a/lib/datautils/tests/test_dataframes.py +++ b/lib/datautils/tests/test_dataframes.py @@ -326,13 +326,14 @@ def test_default_aggregate_with_some_nans_ignored_different_types_and_more_nans( } ).set_index("year") df_out["value_03"] = df_out["value_03"].astype(object) - assert dataframes.groupby_agg( + result = dataframes.groupby_agg( df_in, ["year"], aggregations=None, num_allowed_nans=None, frac_allowed_nans=None, - ).equals(df_out) + ) + assert result.equals(df_out) def test_default_aggregate_with_num_allowed_nans_zero(self): df_in = pd.DataFrame( diff --git a/lib/datautils/uv.lock b/lib/datautils/uv.lock index 08812693d0d..92536172416 100644 --- a/lib/datautils/uv.lock +++ b/lib/datautils/uv.lock @@ -117,6 +117,119 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/29/1d/9776890c7c4f070edb8d35ee986d9cde951e1f4187439f73f47922a6c6e9/botocore_stubs-1.35.20-py3-none-any.whl", hash = "sha256:f34d8908fa6a29d3dec57a373160d2defca3e2250d689864b23c9fea95669f97", size = 60132 }, ] +[[package]] +name = "brotli" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2f/c2/f9e977608bdf958650638c3f1e28f85a1b075f075ebbe77db8555463787b/Brotli-1.1.0.tar.gz", hash = "sha256:81de08ac11bcb85841e440c13611c00b67d3bf82698314928d0b676362546724", size = 7372270 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6d/3a/dbf4fb970c1019a57b5e492e1e0eae745d32e59ba4d6161ab5422b08eefe/Brotli-1.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e1140c64812cb9b06c922e77f1c26a75ec5e3f0fb2bf92cc8c58720dec276752", size = 873045 }, + { url = "https://files.pythonhosted.org/packages/dd/11/afc14026ea7f44bd6eb9316d800d439d092c8d508752055ce8d03086079a/Brotli-1.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c8fd5270e906eef71d4a8d19b7c6a43760c6abcfcc10c9101d14eb2357418de9", size = 446218 }, + { url = "https://files.pythonhosted.org/packages/36/83/7545a6e7729db43cb36c4287ae388d6885c85a86dd251768a47015dfde32/Brotli-1.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ae56aca0402a0f9a3431cddda62ad71666ca9d4dc3a10a142b9dce2e3c0cda3", size = 2903872 }, + { url = "https://files.pythonhosted.org/packages/32/23/35331c4d9391fcc0f29fd9bec2c76e4b4eeab769afbc4b11dd2e1098fb13/Brotli-1.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:43ce1b9935bfa1ede40028054d7f48b5469cd02733a365eec8a329ffd342915d", size = 2941254 }, + { url = 
"https://files.pythonhosted.org/packages/3b/24/1671acb450c902edb64bd765d73603797c6c7280a9ada85a195f6b78c6e5/Brotli-1.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:7c4855522edb2e6ae7fdb58e07c3ba9111e7621a8956f481c68d5d979c93032e", size = 2857293 }, + { url = "https://files.pythonhosted.org/packages/d5/00/40f760cc27007912b327fe15bf6bfd8eaecbe451687f72a8abc587d503b3/Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:38025d9f30cf4634f8309c6874ef871b841eb3c347e90b0851f63d1ded5212da", size = 3002385 }, + { url = "https://files.pythonhosted.org/packages/b8/cb/8aaa83f7a4caa131757668c0fb0c4b6384b09ffa77f2fba9570d87ab587d/Brotli-1.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e6a904cb26bfefc2f0a6f240bdf5233be78cd2488900a2f846f3c3ac8489ab80", size = 2911104 }, + { url = "https://files.pythonhosted.org/packages/bc/c4/65456561d89d3c49f46b7fbeb8fe6e449f13bdc8ea7791832c5d476b2faf/Brotli-1.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a37b8f0391212d29b3a91a799c8e4a2855e0576911cdfb2515487e30e322253d", size = 2809981 }, + { url = "https://files.pythonhosted.org/packages/05/1b/cf49528437bae28abce5f6e059f0d0be6fecdcc1d3e33e7c54b3ca498425/Brotli-1.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e84799f09591700a4154154cab9787452925578841a94321d5ee8fb9a9a328f0", size = 2935297 }, + { url = "https://files.pythonhosted.org/packages/81/ff/190d4af610680bf0c5a09eb5d1eac6e99c7c8e216440f9c7cfd42b7adab5/Brotli-1.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f66b5337fa213f1da0d9000bc8dc0cb5b896b726eefd9c6046f699b169c41b9e", size = 2930735 }, + { url = "https://files.pythonhosted.org/packages/80/7d/f1abbc0c98f6e09abd3cad63ec34af17abc4c44f308a7a539010f79aae7a/Brotli-1.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5dab0844f2cf82be357a0eb11a9087f70c5430b2c241493fc122bb6f2bb0917c", size = 2933107 }, + { url = "https://files.pythonhosted.org/packages/34/ce/5a5020ba48f2b5a4ad1c0522d095ad5847a0be508e7d7569c8630ce25062/Brotli-1.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e4fe605b917c70283db7dfe5ada75e04561479075761a0b3866c081d035b01c1", size = 2845400 }, + { url = "https://files.pythonhosted.org/packages/44/89/fa2c4355ab1eecf3994e5a0a7f5492c6ff81dfcb5f9ba7859bd534bb5c1a/Brotli-1.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1e9a65b5736232e7a7f91ff3d02277f11d339bf34099a56cdab6a8b3410a02b2", size = 3031985 }, + { url = "https://files.pythonhosted.org/packages/af/a4/79196b4a1674143d19dca400866b1a4d1a089040df7b93b88ebae81f3447/Brotli-1.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:58d4b711689366d4a03ac7957ab8c28890415e267f9b6589969e74b6e42225ec", size = 2927099 }, + { url = "https://files.pythonhosted.org/packages/e9/54/1c0278556a097f9651e657b873ab08f01b9a9ae4cac128ceb66427d7cd20/Brotli-1.1.0-cp310-cp310-win32.whl", hash = "sha256:be36e3d172dc816333f33520154d708a2657ea63762ec16b62ece02ab5e4daf2", size = 333172 }, + { url = "https://files.pythonhosted.org/packages/f7/65/b785722e941193fd8b571afd9edbec2a9b838ddec4375d8af33a50b8dab9/Brotli-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:0c6244521dda65ea562d5a69b9a26120769b7a9fb3db2fe9545935ed6735b128", size = 357255 }, + { url = "https://files.pythonhosted.org/packages/96/12/ad41e7fadd5db55459c4c401842b47f7fee51068f86dd2894dd0dcfc2d2a/Brotli-1.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:a3daabb76a78f829cafc365531c972016e4aa8d5b4bf60660ad8ecee19df7ccc", size = 873068 }, + { url = "https://files.pythonhosted.org/packages/95/4e/5afab7b2b4b61a84e9c75b17814198ce515343a44e2ed4488fac314cd0a9/Brotli-1.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c8146669223164fc87a7e3de9f81e9423c67a79d6b3447994dfb9c95da16e2d6", size = 446244 }, + { url = "https://files.pythonhosted.org/packages/9d/e6/f305eb61fb9a8580c525478a4a34c5ae1a9bcb12c3aee619114940bc513d/Brotli-1.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30924eb4c57903d5a7526b08ef4a584acc22ab1ffa085faceb521521d2de32dd", size = 2906500 }, + { url = "https://files.pythonhosted.org/packages/3e/4f/af6846cfbc1550a3024e5d3775ede1e00474c40882c7bf5b37a43ca35e91/Brotli-1.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ceb64bbc6eac5a140ca649003756940f8d6a7c444a68af170b3187623b43bebf", size = 2943950 }, + { url = "https://files.pythonhosted.org/packages/b3/e7/ca2993c7682d8629b62630ebf0d1f3bb3d579e667ce8e7ca03a0a0576a2d/Brotli-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a469274ad18dc0e4d316eefa616d1d0c2ff9da369af19fa6f3daa4f09671fd61", size = 2918527 }, + { url = "https://files.pythonhosted.org/packages/b3/96/da98e7bedc4c51104d29cc61e5f449a502dd3dbc211944546a4cc65500d3/Brotli-1.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:524f35912131cc2cabb00edfd8d573b07f2d9f21fa824bd3fb19725a9cf06327", size = 2845489 }, + { url = "https://files.pythonhosted.org/packages/e8/ef/ccbc16947d6ce943a7f57e1a40596c75859eeb6d279c6994eddd69615265/Brotli-1.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:5b3cc074004d968722f51e550b41a27be656ec48f8afaeeb45ebf65b561481dd", size = 2914080 }, + { url = "https://files.pythonhosted.org/packages/80/d6/0bd38d758d1afa62a5524172f0b18626bb2392d717ff94806f741fcd5ee9/Brotli-1.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:19c116e796420b0cee3da1ccec3b764ed2952ccfcc298b55a10e5610ad7885f9", size = 2813051 }, + { url = "https://files.pythonhosted.org/packages/14/56/48859dd5d129d7519e001f06dcfbb6e2cf6db92b2702c0c2ce7d97e086c1/Brotli-1.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:510b5b1bfbe20e1a7b3baf5fed9e9451873559a976c1a78eebaa3b86c57b4265", size = 2938172 }, + { url = "https://files.pythonhosted.org/packages/3d/77/a236d5f8cd9e9f4348da5acc75ab032ab1ab2c03cc8f430d24eea2672888/Brotli-1.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a1fd8a29719ccce974d523580987b7f8229aeace506952fa9ce1d53a033873c8", size = 2933023 }, + { url = "https://files.pythonhosted.org/packages/f1/87/3b283efc0f5cb35f7f84c0c240b1e1a1003a5e47141a4881bf87c86d0ce2/Brotli-1.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c247dd99d39e0338a604f8c2b3bc7061d5c2e9e2ac7ba9cc1be5a69cb6cd832f", size = 2935871 }, + { url = "https://files.pythonhosted.org/packages/f3/eb/2be4cc3e2141dc1a43ad4ca1875a72088229de38c68e842746b342667b2a/Brotli-1.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1b2c248cd517c222d89e74669a4adfa5577e06ab68771a529060cf5a156e9757", size = 2847784 }, + { url = "https://files.pythonhosted.org/packages/66/13/b58ddebfd35edde572ccefe6890cf7c493f0c319aad2a5badee134b4d8ec/Brotli-1.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:2a24c50840d89ded6c9a8fdc7b6ed3692ed4e86f1c4a4a938e1e92def92933e0", size = 3034905 }, + { url = 
"https://files.pythonhosted.org/packages/84/9c/bc96b6c7db824998a49ed3b38e441a2cae9234da6fa11f6ed17e8cf4f147/Brotli-1.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f31859074d57b4639318523d6ffdca586ace54271a73ad23ad021acd807eb14b", size = 2929467 }, + { url = "https://files.pythonhosted.org/packages/e7/71/8f161dee223c7ff7fea9d44893fba953ce97cf2c3c33f78ba260a91bcff5/Brotli-1.1.0-cp311-cp311-win32.whl", hash = "sha256:39da8adedf6942d76dc3e46653e52df937a3c4d6d18fdc94a7c29d263b1f5b50", size = 333169 }, + { url = "https://files.pythonhosted.org/packages/02/8a/fece0ee1057643cb2a5bbf59682de13f1725f8482b2c057d4e799d7ade75/Brotli-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:aac0411d20e345dc0920bdec5548e438e999ff68d77564d5e9463a7ca9d3e7b1", size = 357253 }, + { url = "https://files.pythonhosted.org/packages/5c/d0/5373ae13b93fe00095a58efcbce837fd470ca39f703a235d2a999baadfbc/Brotli-1.1.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:32d95b80260d79926f5fab3c41701dbb818fde1c9da590e77e571eefd14abe28", size = 815693 }, + { url = "https://files.pythonhosted.org/packages/8e/48/f6e1cdf86751300c288c1459724bfa6917a80e30dbfc326f92cea5d3683a/Brotli-1.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b760c65308ff1e462f65d69c12e4ae085cff3b332d894637f6273a12a482d09f", size = 422489 }, + { url = "https://files.pythonhosted.org/packages/06/88/564958cedce636d0f1bed313381dfc4b4e3d3f6015a63dae6146e1b8c65c/Brotli-1.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:316cc9b17edf613ac76b1f1f305d2a748f1b976b033b049a6ecdfd5612c70409", size = 873081 }, + { url = "https://files.pythonhosted.org/packages/58/79/b7026a8bb65da9a6bb7d14329fd2bd48d2b7f86d7329d5cc8ddc6a90526f/Brotli-1.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:caf9ee9a5775f3111642d33b86237b05808dafcd6268faa492250e9b78046eb2", size = 446244 }, + { url = "https://files.pythonhosted.org/packages/e5/18/c18c32ecea41b6c0004e15606e274006366fe19436b6adccc1ae7b2e50c2/Brotli-1.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70051525001750221daa10907c77830bc889cb6d865cc0b813d9db7fefc21451", size = 2906505 }, + { url = "https://files.pythonhosted.org/packages/08/c8/69ec0496b1ada7569b62d85893d928e865df29b90736558d6c98c2031208/Brotli-1.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7f4bf76817c14aa98cc6697ac02f3972cb8c3da93e9ef16b9c66573a68014f91", size = 2944152 }, + { url = "https://files.pythonhosted.org/packages/ab/fb/0517cea182219d6768113a38167ef6d4eb157a033178cc938033a552ed6d/Brotli-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0c5516f0aed654134a2fc936325cc2e642f8a0e096d075209672eb321cff408", size = 2919252 }, + { url = "https://files.pythonhosted.org/packages/c7/53/73a3431662e33ae61a5c80b1b9d2d18f58dfa910ae8dd696e57d39f1a2f5/Brotli-1.1.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6c3020404e0b5eefd7c9485ccf8393cfb75ec38ce75586e046573c9dc29967a0", size = 2845955 }, + { url = "https://files.pythonhosted.org/packages/55/ac/bd280708d9c5ebdbf9de01459e625a3e3803cce0784f47d633562cf40e83/Brotli-1.1.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4ed11165dd45ce798d99a136808a794a748d5dc38511303239d4e2363c0695dc", size = 2914304 }, + { url = "https://files.pythonhosted.org/packages/76/58/5c391b41ecfc4527d2cc3350719b02e87cb424ef8ba2023fb662f9bf743c/Brotli-1.1.0-cp312-cp312-musllinux_1_1_i686.whl", hash = 
"sha256:4093c631e96fdd49e0377a9c167bfd75b6d0bad2ace734c6eb20b348bc3ea180", size = 2814452 }, + { url = "https://files.pythonhosted.org/packages/c7/4e/91b8256dfe99c407f174924b65a01f5305e303f486cc7a2e8a5d43c8bec3/Brotli-1.1.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7e4c4629ddad63006efa0ef968c8e4751c5868ff0b1c5c40f76524e894c50248", size = 2938751 }, + { url = "https://files.pythonhosted.org/packages/5a/a6/e2a39a5d3b412938362bbbeba5af904092bf3f95b867b4a3eb856104074e/Brotli-1.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:861bf317735688269936f755fa136a99d1ed526883859f86e41a5d43c61d8966", size = 2933757 }, + { url = "https://files.pythonhosted.org/packages/13/f0/358354786280a509482e0e77c1a5459e439766597d280f28cb097642fc26/Brotli-1.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:87a3044c3a35055527ac75e419dfa9f4f3667a1e887ee80360589eb8c90aabb9", size = 2936146 }, + { url = "https://files.pythonhosted.org/packages/80/f7/daf538c1060d3a88266b80ecc1d1c98b79553b3f117a485653f17070ea2a/Brotli-1.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c5529b34c1c9d937168297f2c1fde7ebe9ebdd5e121297ff9c043bdb2ae3d6fb", size = 2848055 }, + { url = "https://files.pythonhosted.org/packages/ad/cf/0eaa0585c4077d3c2d1edf322d8e97aabf317941d3a72d7b3ad8bce004b0/Brotli-1.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ca63e1890ede90b2e4454f9a65135a4d387a4585ff8282bb72964fab893f2111", size = 3035102 }, + { url = "https://files.pythonhosted.org/packages/d8/63/1c1585b2aa554fe6dbce30f0c18bdbc877fa9a1bf5ff17677d9cca0ac122/Brotli-1.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e79e6520141d792237c70bcd7a3b122d00f2613769ae0cb61c52e89fd3443839", size = 2930029 }, + { url = "https://files.pythonhosted.org/packages/5f/3b/4e3fd1893eb3bbfef8e5a80d4508bec17a57bb92d586c85c12d28666bb13/Brotli-1.1.0-cp312-cp312-win32.whl", hash = "sha256:5f4d5ea15c9382135076d2fb28dde923352fe02951e66935a9efaac8f10e81b0", size = 333276 }, + { url = "https://files.pythonhosted.org/packages/3d/d5/942051b45a9e883b5b6e98c041698b1eb2012d25e5948c58d6bf85b1bb43/Brotli-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:906bc3a79de8c4ae5b86d3d75a8b77e44404b0f4261714306e3ad248d8ab0951", size = 357255 }, + { url = "https://files.pythonhosted.org/packages/0a/9f/fb37bb8ffc52a8da37b1c03c459a8cd55df7a57bdccd8831d500e994a0ca/Brotli-1.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8bf32b98b75c13ec7cf774164172683d6e7891088f6316e54425fde1efc276d5", size = 815681 }, + { url = "https://files.pythonhosted.org/packages/06/b3/dbd332a988586fefb0aa49c779f59f47cae76855c2d00f450364bb574cac/Brotli-1.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7bc37c4d6b87fb1017ea28c9508b36bbcb0c3d18b4260fcdf08b200c74a6aee8", size = 422475 }, + { url = "https://files.pythonhosted.org/packages/bb/80/6aaddc2f63dbcf2d93c2d204e49c11a9ec93a8c7c63261e2b4bd35198283/Brotli-1.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c0ef38c7a7014ffac184db9e04debe495d317cc9c6fb10071f7fefd93100a4f", size = 2906173 }, + { url = "https://files.pythonhosted.org/packages/ea/1d/e6ca79c96ff5b641df6097d299347507d39a9604bde8915e76bf026d6c77/Brotli-1.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91d7cc2a76b5567591d12c01f019dd7afce6ba8cba6571187e21e2fc418ae648", size = 2943803 }, + { url = "https://files.pythonhosted.org/packages/ac/a3/d98d2472e0130b7dd3acdbb7f390d478123dbf62b7d32bda5c830a96116d/Brotli-1.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:a93dde851926f4f2678e704fadeb39e16c35d8baebd5252c9fd94ce8ce68c4a0", size = 2918946 }, + { url = "https://files.pythonhosted.org/packages/c4/a5/c69e6d272aee3e1423ed005d8915a7eaa0384c7de503da987f2d224d0721/Brotli-1.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0db75f47be8b8abc8d9e31bc7aad0547ca26f24a54e6fd10231d623f183d089", size = 2845707 }, + { url = "https://files.pythonhosted.org/packages/58/9f/4149d38b52725afa39067350696c09526de0125ebfbaab5acc5af28b42ea/Brotli-1.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6967ced6730aed543b8673008b5a391c3b1076d834ca438bbd70635c73775368", size = 2936231 }, + { url = "https://files.pythonhosted.org/packages/5a/5a/145de884285611838a16bebfdb060c231c52b8f84dfbe52b852a15780386/Brotli-1.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7eedaa5d036d9336c95915035fb57422054014ebdeb6f3b42eac809928e40d0c", size = 2848157 }, + { url = "https://files.pythonhosted.org/packages/50/ae/408b6bfb8525dadebd3b3dd5b19d631da4f7d46420321db44cd99dcf2f2c/Brotli-1.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:d487f5432bf35b60ed625d7e1b448e2dc855422e87469e3f450aa5552b0eb284", size = 3035122 }, + { url = "https://files.pythonhosted.org/packages/af/85/a94e5cfaa0ca449d8f91c3d6f78313ebf919a0dbd55a100c711c6e9655bc/Brotli-1.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:832436e59afb93e1836081a20f324cb185836c617659b07b129141a8426973c7", size = 2930206 }, + { url = "https://files.pythonhosted.org/packages/c2/f0/a61d9262cd01351df22e57ad7c34f66794709acab13f34be2675f45bf89d/Brotli-1.1.0-cp313-cp313-win32.whl", hash = "sha256:43395e90523f9c23a3d5bdf004733246fba087f2948f87ab28015f12359ca6a0", size = 333804 }, + { url = "https://files.pythonhosted.org/packages/7e/c1/ec214e9c94000d1c1974ec67ced1c970c148aa6b8d8373066123fc3dbf06/Brotli-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:9011560a466d2eb3f5a6e4929cf4a09be405c64154e12df0dd72713f6500e32b", size = 358517 }, + { url = "https://files.pythonhosted.org/packages/1b/aa/aa6e0c9848ee4375514af0b27abf470904992939b7363ae78fc8aca8a9a8/Brotli-1.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5fb2ce4b8045c78ebbc7b8f3c15062e435d47e7393cc57c25115cfd49883747a", size = 873048 }, + { url = "https://files.pythonhosted.org/packages/ae/32/38bba1a8bef9ecb1cda08439fd28d7e9c51aff13b4783a4f1610da90b6c2/Brotli-1.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7905193081db9bfa73b1219140b3d315831cbff0d8941f22da695832f0dd188f", size = 446207 }, + { url = "https://files.pythonhosted.org/packages/3c/6a/14cc20ddc53efc274601c8195791a27cfb7acc5e5134e0f8c493a8b8821a/Brotli-1.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a77def80806c421b4b0af06f45d65a136e7ac0bdca3c09d9e2ea4e515367c7e9", size = 2903803 }, + { url = "https://files.pythonhosted.org/packages/9a/26/62b2d894d4e82d7a7f4e0bb9007a42bbc765697a5679b43186acd68d7a79/Brotli-1.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8dadd1314583ec0bf2d1379f7008ad627cd6336625d6679cf2f8e67081b83acf", size = 2941149 }, + { url = "https://files.pythonhosted.org/packages/a9/ca/00d55bbdd8631236c61777742d8a8454cf6a87eb4125cad675912c68bec7/Brotli-1.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:901032ff242d479a0efa956d853d16875d42157f98951c0230f69e69f9c09bac", size = 2672253 }, + { url = 
"https://files.pythonhosted.org/packages/e2/e6/4a730f6e5b5d538e92d09bc51bf69119914f29a222f9e1d65ae4abb27a4e/Brotli-1.1.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:22fc2a8549ffe699bfba2256ab2ed0421a7b8fadff114a3d201794e45a9ff578", size = 2757005 }, + { url = "https://files.pythonhosted.org/packages/cb/6b/8cf297987fe3c1bf1c87f0c0b714af2ce47092b8d307b9f6ecbc65f98968/Brotli-1.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ae15b066e5ad21366600ebec29a7ccbc86812ed267e4b28e860b8ca16a2bc474", size = 2910658 }, + { url = "https://files.pythonhosted.org/packages/2c/1f/be9443995821c933aad7159803f84ef4923c6f5b72c2affd001192b310fc/Brotli-1.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:949f3b7c29912693cee0afcf09acd6ebc04c57af949d9bf77d6101ebb61e388c", size = 2809728 }, + { url = "https://files.pythonhosted.org/packages/76/2f/213bab6efa902658c80a1247142d42b138a27ccdd6bade49ca9cd74e714a/Brotli-1.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:89f4988c7203739d48c6f806f1e87a1d96e0806d44f0fba61dba81392c9e474d", size = 2935043 }, + { url = "https://files.pythonhosted.org/packages/27/89/bbb14fa98e895d1e601491fba54a5feec167d262f0d3d537a3b0d4cd0029/Brotli-1.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:de6551e370ef19f8de1807d0a9aa2cdfdce2e85ce88b122fe9f6b2b076837e59", size = 2930639 }, + { url = "https://files.pythonhosted.org/packages/14/87/03a6d6e1866eddf9f58cc57e35befbeb5514da87a416befe820150cae63f/Brotli-1.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0737ddb3068957cf1b054899b0883830bb1fec522ec76b1098f9b6e0f02d9419", size = 2932834 }, + { url = "https://files.pythonhosted.org/packages/a4/d5/e5f85e04f75144d1a89421ba432def6bdffc8f28b04f5b7d540bbd03362c/Brotli-1.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4f3607b129417e111e30637af1b56f24f7a49e64763253bbc275c75fa887d4b2", size = 2845213 }, + { url = "https://files.pythonhosted.org/packages/99/bf/25ef07add7afbb1aacd4460726a1a40370dfd60c0810b6f242a6d3871d7e/Brotli-1.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:6c6e0c425f22c1c719c42670d561ad682f7bfeeef918edea971a79ac5252437f", size = 3031573 }, + { url = "https://files.pythonhosted.org/packages/55/22/948a97bda5c9dc9968d56b9ed722d9727778db43739cf12ef26ff69be94d/Brotli-1.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:494994f807ba0b92092a163a0a283961369a65f6cbe01e8891132b7a320e61eb", size = 2926885 }, + { url = "https://files.pythonhosted.org/packages/31/ba/e53d107399b535ef89deb6977dd8eae468e2dde7b1b74c6cbe2c0e31fda2/Brotli-1.1.0-cp39-cp39-win32.whl", hash = "sha256:f0d8a7a6b5983c2496e364b969f0e526647a06b075d034f3297dc66f3b360c64", size = 333171 }, + { url = "https://files.pythonhosted.org/packages/99/b3/f7b3af539f74b82e1c64d28685a5200c631cc14ae751d37d6ed819655627/Brotli-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdad5b9014d83ca68c25d2e9444e28e967ef16e80f6b436918c700c117a85467", size = 357258 }, +] + +[[package]] +name = "brotlicffi" +version = "1.1.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/95/9d/70caa61192f570fcf0352766331b735afa931b4c6bc9a348a0925cc13288/brotlicffi-1.1.0.0.tar.gz", hash = "sha256:b77827a689905143f87915310b93b273ab17888fd43ef350d4832c4a71083c13", size = 465192 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/11/7b96009d3dcc2c931e828ce1e157f03824a69fb728d06bfd7b2fc6f93718/brotlicffi-1.1.0.0-cp37-abi3-macosx_10_9_x86_64.whl", hash = 
"sha256:9b7ae6bd1a3f0df532b6d67ff674099a96d22bc0948955cb338488c31bfb8851", size = 453786 }, + { url = "https://files.pythonhosted.org/packages/d6/e6/a8f46f4a4ee7856fbd6ac0c6fb0dc65ed181ba46cd77875b8d9bbe494d9e/brotlicffi-1.1.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19ffc919fa4fc6ace69286e0a23b3789b4219058313cf9b45625016bf7ff996b", size = 2911165 }, + { url = "https://files.pythonhosted.org/packages/be/20/201559dff14e83ba345a5ec03335607e47467b6633c210607e693aefac40/brotlicffi-1.1.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9feb210d932ffe7798ee62e6145d3a757eb6233aa9a4e7db78dd3690d7755814", size = 2927895 }, + { url = "https://files.pythonhosted.org/packages/cd/15/695b1409264143be3c933f708a3f81d53c4a1e1ebbc06f46331decbf6563/brotlicffi-1.1.0.0-cp37-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84763dbdef5dd5c24b75597a77e1b30c66604725707565188ba54bab4f114820", size = 2851834 }, + { url = "https://files.pythonhosted.org/packages/b4/40/b961a702463b6005baf952794c2e9e0099bde657d0d7e007f923883b907f/brotlicffi-1.1.0.0-cp37-abi3-win32.whl", hash = "sha256:1b12b50e07c3911e1efa3a8971543e7648100713d4e0971b13631cce22c587eb", size = 341731 }, + { url = "https://files.pythonhosted.org/packages/1c/fa/5408a03c041114ceab628ce21766a4ea882aa6f6f0a800e04ee3a30ec6b9/brotlicffi-1.1.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:994a4f0681bb6c6c3b0925530a1926b7a189d878e6e5e38fae8efa47c5d9c613", size = 366783 }, + { url = "https://files.pythonhosted.org/packages/e5/3b/bd4f3d2bcf2306ae66b0346f5b42af1962480b200096ffc7abc3bd130eca/brotlicffi-1.1.0.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:2e4aeb0bd2540cb91b069dbdd54d458da8c4334ceaf2d25df2f4af576d6766ca", size = 397397 }, + { url = "https://files.pythonhosted.org/packages/54/10/1fd57864449360852c535c2381ee7120ba8f390aa3869df967c44ca7eba1/brotlicffi-1.1.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b7b0033b0d37bb33009fb2fef73310e432e76f688af76c156b3594389d81391", size = 379698 }, + { url = "https://files.pythonhosted.org/packages/e5/95/15aa422aa6450e6556e54a5fd1650ff59f470aed77ac739aa90ab63dc611/brotlicffi-1.1.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54a07bb2374a1eba8ebb52b6fafffa2afd3c4df85ddd38fcc0511f2bb387c2a8", size = 378635 }, + { url = "https://files.pythonhosted.org/packages/6c/a7/f254e13b2cb43337d6d99a4ec10394c134e41bfda8a2eff15b75627f4a3d/brotlicffi-1.1.0.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7901a7dc4b88f1c1475de59ae9be59799db1007b7d059817948d8e4f12e24e35", size = 385719 }, + { url = "https://files.pythonhosted.org/packages/72/a9/0971251c4427c14b2a827dba3d910d4d3330dabf23d4278bf6d06a978847/brotlicffi-1.1.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce01c7316aebc7fce59da734286148b1d1b9455f89cf2c8a4dfce7d41db55c2d", size = 361760 }, + { url = "https://files.pythonhosted.org/packages/35/9b/e0b577351e1d9d5890e1a56900c4ceaaef783b807145cd229446a43cf437/brotlicffi-1.1.0.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1a807d760763e398bbf2c6394ae9da5815901aa93ee0a37bca5efe78d4ee3171", size = 397392 }, + { url = "https://files.pythonhosted.org/packages/4f/7f/a16534d28386f74781db8b4544a764cf955abae336379a76f50e745bb0ee/brotlicffi-1.1.0.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:fa8ca0623b26c94fccc3a1fdd895be1743b838f3917300506d04aa3346fd2a14", size = 379695 }, + { url = "https://files.pythonhosted.org/packages/50/2a/699388b5e489726991132441b55aff0691dd73c49105ef220408a5ab98d6/brotlicffi-1.1.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3de0cf28a53a3238b252aca9fed1593e9d36c1d116748013339f0949bfc84112", size = 378629 }, + { url = "https://files.pythonhosted.org/packages/4a/3f/58254e7fbe6011bf043e4dcade0e16995a9f82b731734fad97220d201f42/brotlicffi-1.1.0.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6be5ec0e88a4925c91f3dea2bb0013b3a2accda6f77238f76a34a1ea532a1cb0", size = 385712 }, + { url = "https://files.pythonhosted.org/packages/40/16/2a29a625a6f74d13726387f83484dfaaf6fcdaafaadfbe26a0412ae268cc/brotlicffi-1.1.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:d9eb71bb1085d996244439154387266fd23d6ad37161f6f52f1cd41dd95a3808", size = 361747 }, +] + [[package]] name = "bump2version" version = "1.0.1" @@ -666,13 +779,61 @@ name = "importlib-metadata" version = "8.5.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "zipp", marker = "python_full_version < '3.11'" }, + { name = "zipp", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/cd/12/33e59336dca5be0c398a7482335911a33aa0e20776128f038019f1a95f1b/importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7", size = 55304 } wheels = [ { url = "https://files.pythonhosted.org/packages/a0/d9/a1e041c5e7caa9a05c925f4bdbdfb7f006d1f74996af53467bc394c97be7/importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b", size = 26514 }, ] +[[package]] +name = "inflate64" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8c/99/18f9940d4a3f2cabc4396a587ddf1bd93236bdb372d9e78e2b0365e40990/inflate64-1.0.0.tar.gz", hash = "sha256:3278827b803cf006a1df251f3e13374c7d26db779e5a33329cc11789b804bc2d", size = 895853 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/71/cf/06af80e81dd4bbb7e883291cf1726035d526f066a37c4ed4d4cd88a7a49d/inflate64-1.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a90c0bdf4a7ecddd8a64cc977181810036e35807f56b0bcacee9abb0fcfd18dc", size = 59418 }, + { url = "https://files.pythonhosted.org/packages/c9/4b/6f18918220b1a8e935121cece1dc917e62fa593fc637a621470f9b9a601a/inflate64-1.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:57fe7c14aebf1c5a74fc3b70d355be1280a011521a76aa3895486e62454f4242", size = 36231 }, + { url = "https://files.pythonhosted.org/packages/aa/f4/f4b5dbd78dd5af66b6ca32778ebaa9c14d67b68ea84e96592ccf40786a41/inflate64-1.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d90730165f471d61a1a694a5e354f3ffa938227e8dcecb62d5d728e8069cee94", size = 35738 }, + { url = "https://files.pythonhosted.org/packages/10/23/26289a700550767cf5eb7550f78ad826529706287393f224bbaee3c1b1e2/inflate64-1.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:543f400201f5c101141af3c79c82059e1aa6ef4f1584a7f1fa035fb2e465097f", size = 92855 }, + { url = "https://files.pythonhosted.org/packages/b8/f4/e387a50f5027194eac4f9712d57b97e3e1a012402eaae98bcf1ebe8a97d1/inflate64-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:6ceca14f7ec19fb44b047f56c50efb7521b389d222bba2b0a10286a0caeb03fa", size = 93141 }, + { url = "https://files.pythonhosted.org/packages/33/c8/e516aecd9ed0dc75d8df041ed4ef80f2e2be39d0e516c7269b7f274e760a/inflate64-1.0.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b559937a42f0c175b4d2dfc7eb53b97bdc87efa9add15ed5549c6abc1e89d02f", size = 95262 }, + { url = "https://files.pythonhosted.org/packages/0b/aa/ed3ab5f8c13afc432fb382edf97cede7a6f9be73ecf98bfe64b686c8d223/inflate64-1.0.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5ff8bd2a562343fcbc4eea26fdc368904a3b5f6bb8262344274d3d74a1de15bb", size = 95912 }, + { url = "https://files.pythonhosted.org/packages/e0/64/5637c4f67ed15518c0765b85b528ed79536caaf8ba167a9f7173e334d4a8/inflate64-1.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:0fe481f31695d35a433c3044ac8fd5d9f5069aaad03a0c04b570eb258ce655aa", size = 35166 }, + { url = "https://files.pythonhosted.org/packages/af/92/701b3c76b1cf244026c3e78dff8487955cf6960c1d9f350e2820a0d1a5d9/inflate64-1.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:35a45f6979ad5874d4d4898c2fc770b136e61b96b850118fdaec5a5af1b9123a", size = 59450 }, + { url = "https://files.pythonhosted.org/packages/bb/1d/af0253fafc27cadd29e3b111ebb3011b8c913a3554b403c90c7595f5933e/inflate64-1.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:022ca1cc928e7365a05f7371ff06af143c6c667144965e2cf9a9236a2ae1c291", size = 36267 }, + { url = "https://files.pythonhosted.org/packages/b6/22/7949030be11f4754bd6ed7067e9bebdf614013b89ccd4638330a85821b51/inflate64-1.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:46792ecf3565d64fd2c519b0a780c03a57e195613c9954ef94e739a057b3fd06", size = 35740 }, + { url = "https://files.pythonhosted.org/packages/e4/87/c6ce0093a345c04811f6171a367665dec17dcc4617ca150dd37e9ae7bd33/inflate64-1.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a70ea2e456c15f7aa7c74b8ab8f20b4f8940ec657604c9f0a9de3342f280fff", size = 95896 }, + { url = "https://files.pythonhosted.org/packages/62/d6/fe113b12773cad2c093d381c2b1629f9cfa240c9ad86a7f9f9079e7a51b5/inflate64-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e243ea9bd36a035059f2365bd6d156ff59717fbafb0255cb0c75bf151bf6904", size = 96007 }, + { url = "https://files.pythonhosted.org/packages/f0/a6/9165bee4b7fc5af949fec12a2cea7ad73bf9ee97dfb96a0276274c48e709/inflate64-1.0.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4dc392dec1cd11cacda3d2637214ca45e38202e8a4f31d4a4e566d6e90625fc4", size = 98297 }, + { url = "https://files.pythonhosted.org/packages/ee/72/0aeb360101eeed32696fc6c623bc1780fac895a9fc2e93b582cb1e22ca54/inflate64-1.0.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8b402a50eda7ee75f342fc346d33a41bca58edc222a4b17f9be0db1daed459fa", size = 98858 }, + { url = "https://files.pythonhosted.org/packages/94/4a/8301ad59b57d9de504b0fdce22bf980dfb231753e6d7aed12af938f7f9fd/inflate64-1.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:f5924499dc8800928c0ee4580fa8eb4ffa880b2cce4431537d0390e503a9c9ee", size = 35167 }, + { url = "https://files.pythonhosted.org/packages/18/82/47021b8919c1dc276d0502296f15ffac1cd648b94b35cadb14cb812b6199/inflate64-1.0.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0c644bf7208e20825ca3bbb5fb1f7f495cfcb49eb01a5f67338796d44a42f2bf", size = 59509 }, + { url = "https://files.pythonhosted.org/packages/e0/c9/00701be8e48dc9c9b9488001d9c66d6cb6f6bb0c48af9abf33a69726d130/inflate64-1.0.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:9964a4eaf26a9d36f82a1d9b12c28e35800dd3d99eb340453ed12ac90c2976a8", size = 36305 }, + { url = "https://files.pythonhosted.org/packages/25/c0/11dea5e298b2e7d61f0fbd1005553e8796e35536751980b676547fcc57ef/inflate64-1.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2cccded63865640d03253897be7232b2bbac295fe43914c61f86a57aa23bb61d", size = 35756 }, + { url = "https://files.pythonhosted.org/packages/86/ba/4debdaaafdc21853621caf463a498a754ee4352893454c596dbd65294e9f/inflate64-1.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d491f104fb3701926ebd82b8c9250dfba0ddcab584504e26f1e4adb26730378d", size = 96127 }, + { url = "https://files.pythonhosted.org/packages/89/81/8f559c199ec13d0b70d0dc46811490b2976873c96c564941583777e9b343/inflate64-1.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ebad4a6cd2a2c1d81be0b09d4006479f3b258803c49a9224ef8ca0b649072fa", size = 96903 }, + { url = "https://files.pythonhosted.org/packages/46/41/39ac4c7e17d0690578b716a0ff34e00600616994795b0645fd61fc600c0f/inflate64-1.0.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:6823b2c0cff3a8159140f3b17ec64fb8ec0e663b45a6593618ecdde8aeecb5b2", size = 98855 }, + { url = "https://files.pythonhosted.org/packages/44/dd/be5d69492c180f94a6af8a15564ce365bdcb84bd1a6fb32949d6913959aa/inflate64-1.0.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:228d504239d27958e71fc77e3119a6ac4528127df38468a0c95a5bd3927204b8", size = 99884 }, + { url = "https://files.pythonhosted.org/packages/8c/0d/a5266bd4f2cdb7fad1eae3ffe4dcc16f9769323660a0a6cfbe9cc1d2cf03/inflate64-1.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:ae2572e06bcfe15e3bbf77d4e4a6d6c55e2a70d6abceaaf60c5c3653ddb96dfd", size = 35334 }, + { url = "https://files.pythonhosted.org/packages/04/1c/47f9c93df339c381f3f3e7b983d7abf0756f8bd227bf9fb5a2e8b09ea9e5/inflate64-1.0.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bf2981b95c1f26242bb084d9a07f3feb0cfe3d6d0a8d90f42389803bc1252c4a", size = 59410 }, + { url = "https://files.pythonhosted.org/packages/b9/28/bde1595cae0379c521ae74f698f7da1345d990d2df7558df59470ced639b/inflate64-1.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9373ccf0661cc72ac84a0ad622634144da5ce7d57c9572ed0723d67a149feed2", size = 36230 }, + { url = "https://files.pythonhosted.org/packages/13/64/4350547de7c8a3ac27c1a0ab5807ac2fcbcde121c44190f87da1039421eb/inflate64-1.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e4650c6f65011ec57cf5cd96b92d5b7c6f59e502930c86eb8227c93cf02dc270", size = 35739 }, + { url = "https://files.pythonhosted.org/packages/1a/d6/b55e43c3503d7780aabe3d1d5493c983a99947dff228d9b622d49c58af68/inflate64-1.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a475e8822f1a74c873e60b8f270773757ade024097ca39e43402d47c049c67d4", size = 92655 }, + { url = "https://files.pythonhosted.org/packages/c0/c0/417e5183543445818930b3fe181d718e519d26a227b5b77871d8f0c8502d/inflate64-1.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d4367480733ac8daf368f6fc704b7c9db85521ee745eb5bd443f4b97d2051acc", size = 92966 }, + { url = "https://files.pythonhosted.org/packages/6d/8e/ada659c83abb78222c666bb8d35b4791cca25a8a4a750f4bc457402d2430/inflate64-1.0.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:6c5775c91f94f5eced9160fb0af12a09f3e030194f91a6a46e706a79350bd056", size = 95075 }, + { url = 
"https://files.pythonhosted.org/packages/7f/15/59c0e45e091599a05a098374166ff74d3da5f9c0ffa72e8c78b7f0fcaa2b/inflate64-1.0.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d76d205b844d78ce04768060084ef20e64dcc63a3e9166674f857acaf4d140ed", size = 95751 }, + { url = "https://files.pythonhosted.org/packages/80/21/a90d085b9ea67729d5cef57566c69fd4dec8205c22415cde4be62b7adc64/inflate64-1.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:92f0dc6af0e8e97324981178dc442956cbff1247a56d1e201af8d865244653f8", size = 35164 }, + { url = "https://files.pythonhosted.org/packages/53/91/43238dd8a7e5bab71abae872c09931db4b31aebf672afccb305f79aacb3e/inflate64-1.0.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f79542478e49e471e8b23556700e6f688a40dc93e9a746f77a546c13251b59b1", size = 34648 }, + { url = "https://files.pythonhosted.org/packages/ef/6f/ce090934a80c1fd0b5b07c125ed6eb2845f11a78af344d69c0f051dcab97/inflate64-1.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a270be6b10cde01258c0097a663a307c62d12c78eb8f62f8e29f205335942c9", size = 36473 }, + { url = "https://files.pythonhosted.org/packages/b4/fe/2cd4bf78696213b807860002c182dd1751ba52c1559143b1b8daa7904733/inflate64-1.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1616a87ff04f583e9558cc247ec0b72a30d540ee0c17cc77823be175c0ec92f0", size = 36478 }, + { url = "https://files.pythonhosted.org/packages/43/dd/e62444c0ef7d1228b622e6d3dacf9ea237d8807a78619a83832a3b4a5adf/inflate64-1.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:137ca6b315f0157a786c3a755a09395ca69aed8bcf42ad3437cb349f5ebc86d2", size = 35630 }, + { url = "https://files.pythonhosted.org/packages/2b/1e/0e346cb5eced6f2f30bb110e6d7c7ee7ab70ad1ff44b743d4cf314f60f7d/inflate64-1.0.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:dd6d3e7d47df43210a995fd1f5989602b64de3f2a17cf4cbff553518b3577fd4", size = 34644 }, + { url = "https://files.pythonhosted.org/packages/d4/f7/7ac502391ea56af23661a707e9c94efc01376a27c1b06f7a0cfb553c0f17/inflate64-1.0.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f033b2879696b855200cde5ca4e293132c7499df790acb2c0dacb336d5e83b1", size = 36467 }, + { url = "https://files.pythonhosted.org/packages/09/ea/7c3e7c856f42bee178a9fea816e259b02bd4198873cf07b5a839b96d9696/inflate64-1.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f816d1c8a0593375c289e285c96deaee9c2d8742cb0edbd26ee05588a9ae657", size = 36475 }, + { url = "https://files.pythonhosted.org/packages/ae/b1/954207a9bd259a57c8c71ba0a26739a32f0d6227529abc9412751f3a87d2/inflate64-1.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:1facd35319b6a391ee4c3d709c7c650bcada8cd7141d86cd8c2257287f45e6e6", size = 35626 }, +] + [[package]] name = "iniconfig" version = "2.0.0" @@ -971,6 +1132,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899 }, ] +[[package]] +name = "multivolumefile" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/50/f0/a7786212b5a4cb9ba05ae84a2bbd11d1d0279523aea0424b6d981d652a14/multivolumefile-0.2.3.tar.gz", hash = "sha256:a0648d0aafbc96e59198d5c17e9acad7eb531abea51035d08ce8060dcad709d6", size = 77984 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/22/31/ec5f46fd4c83185b806aa9c736e228cb780f13990a9cf4da0beb70025fcc/multivolumefile-0.2.3-py3-none-any.whl", hash = "sha256:237f4353b60af1703087cf7725755a1f6fcaeeea48421e1896940cd1c920d678", size = 17037 }, +] + [[package]] name = "mypy-boto3-s3" version = "1.35.16" @@ -1083,6 +1253,7 @@ dependencies = [ { name = "gdown" }, { name = "gsheets" }, { name = "pandas" }, + { name = "py7zr" }, { name = "pyarrow" }, { name = "pydrive2" }, { name = "structlog" }, @@ -1113,7 +1284,8 @@ requires-dist = [ { name = "colorama", specifier = ">=0.4.4" }, { name = "gdown", specifier = ">=4.5.2" }, { name = "gsheets", specifier = ">=0.6.1" }, - { name = "pandas", specifier = "==2.2.1" }, + { name = "pandas", specifier = ">=2.2.1" }, + { name = "py7zr", specifier = ">=0.22.0" }, { name = "pyarrow", specifier = ">=10.0.1" }, { name = "pydrive2", specifier = ">=1.15.0" }, { name = "structlog", specifier = ">=21.5.0" }, @@ -1273,8 +1445,6 @@ version = "6.0.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/18/c7/8c6872f7372eb6a6b2e4708b88419fb46b857f7a2e1892966b851cc79fc9/psutil-6.0.0.tar.gz", hash = "sha256:8faae4f310b6d969fa26ca0545338b21f73c6b15db7c4a8d934a5482faa818f2", size = 508067 } wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/66/78c9c3020f573c58101dc43a44f6855d01bbbd747e24da2f0c4491200ea3/psutil-6.0.0-cp27-none-win32.whl", hash = "sha256:02b69001f44cc73c1c5279d02b30a817e339ceb258ad75997325e0e6169d8b35", size = 249766 }, - { url = "https://files.pythonhosted.org/packages/e1/3f/2403aa9558bea4d3854b0e5e567bc3dd8e9fbc1fc4453c0aa9aafeb75467/psutil-6.0.0-cp27-none-win_amd64.whl", hash = "sha256:21f1fb635deccd510f69f485b87433460a603919b45e2a324ad65b0cc74f8fb1", size = 253024 }, { url = "https://files.pythonhosted.org/packages/0b/37/f8da2fbd29690b3557cca414c1949f92162981920699cd62095a984983bf/psutil-6.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c588a7e9b1173b6e866756dde596fd4cad94f9399daf99ad8c3258b3cb2b47a0", size = 250961 }, { url = "https://files.pythonhosted.org/packages/35/56/72f86175e81c656a01c4401cd3b1c923f891b31fbcebe98985894176d7c9/psutil-6.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ed2440ada7ef7d0d608f20ad89a04ec47d2d3ab7190896cd62ca5fc4fe08bf0", size = 287478 }, { url = "https://files.pythonhosted.org/packages/19/74/f59e7e0d392bc1070e9a70e2f9190d652487ac115bb16e2eff6b22ad1d24/psutil-6.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd9a97c8e94059b0ef54a7d4baf13b405011176c3b6ff257c247cae0d560ecd", size = 290455 }, @@ -1302,6 +1472,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842 }, ] +[[package]] +name = "py7zr" +version = "0.22.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "brotli", marker = "platform_python_implementation == 'CPython'" }, + { name = "brotlicffi", marker = "platform_python_implementation == 'PyPy'" }, + { name = "inflate64" }, + { name = "multivolumefile" }, + { name = "psutil", marker = "sys_platform != 'cygwin'" }, + { name = "pybcj" }, + { name = "pycryptodomex" }, + { name = "pyppmd" }, + { name = "pyzstd" }, + { name = "texttable" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/26/c3/0e05c711c16af0b9c47f3f77323303b338b9a871ba020d95d2b8dd6605ae/py7zr-0.22.0.tar.gz", hash = "sha256:c6c7aea5913535184003b73938490f9a4d8418598e533f9ca991d3b8e45a139e", size = 4992926 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/59/dd1750002c0f46099281116f8165247bc62dc85edad41cdd26e7b26de19d/py7zr-0.22.0-py3-none-any.whl", hash = "sha256:993b951b313500697d71113da2681386589b7b74f12e48ba13cc12beca79d078", size = 67906 }, +] + [[package]] name = "pyarrow" version = "17.0.0" @@ -1362,6 +1553,46 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/77/89/bc88a6711935ba795a679ea6ebee07e128050d6382eaa35a0a47c8032bdc/pyasn1_modules-0.4.1-py3-none-any.whl", hash = "sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd", size = 181537 }, ] +[[package]] +name = "pybcj" +version = "1.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/37/d2/22e808b9d25ce3b43f5c8a9e22d873d403485ba55d84a4d6d5d044881762/pybcj-1.0.2.tar.gz", hash = "sha256:c7f5bef7f47723c53420e377bc64d2553843bee8bcac5f0ad076ab1524780018", size = 2111002 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/93/4735636b5905b7597068a2c7a10a8df0f668f28659207c274d64a4468b97/pybcj-1.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7bff28d97e47047d69a4ac6bf59adda738cf1d00adde8819117fdb65d966bdbc", size = 32556 }, + { url = "https://files.pythonhosted.org/packages/a6/37/443cd704397b6df54ff0822032e4815aca4e9badabc5ce1faac34235a40c/pybcj-1.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:198e0b4768b4025eb3309273d7e81dc53834b9a50092be6e0d9b3983cfd35c35", size = 23751 }, + { url = "https://files.pythonhosted.org/packages/9a/aa/5a19ed8661e979a4d3237a11706f9a16a474a2227fdd99ccb284be100a98/pybcj-1.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fa26415b4a118ea790de9d38f244312f2510a9bb5c65e560184d241a6f391a2d", size = 23980 }, + { url = "https://files.pythonhosted.org/packages/fe/5f/638ce03948905d267c8c0ccab81b8b4943a0324f63d8bdb0a0e2a85d4503/pybcj-1.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fabb2be57e4ca28ea36c13146cdf97d73abd27c51741923fc6ba1e8cd33e255c", size = 50155 }, + { url = "https://files.pythonhosted.org/packages/09/70/8b6a6cc2a5721f67f629bdc17875c0d603d57f360a19b099a7b4de19383d/pybcj-1.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75d6d613bae6f27678d5e44e89d61018779726aa6aa950c516d33a04b8af8c59", size = 49729 }, + { url = "https://files.pythonhosted.org/packages/89/06/2e41e34da0bb2adb3644cbf4366c344e5804a10f1153da7b3a23333f7db8/pybcj-1.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3ffae79ef8a1ea81ea2748ad7b7ad9b882aa88ddf65ce90f9e944df639eccc61", size = 54310 }, + { url = "https://files.pythonhosted.org/packages/b5/0f/de9e76c305d4dcd9d428a90ccac030f06c780bc30549fc449a944a6321bc/pybcj-1.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bdb4d8ff5cba3e0bd1adee7d20dbb2b4d80cb31ac04d6ea1cd06cfc02d2ecd0d", size = 53679 }, + { url = "https://files.pythonhosted.org/packages/1a/41/a807ff6b77ec8e49c749ed1d0db5649fbb1150c6fb5fb391115f4f1d743a/pybcj-1.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a29be917fbc99eca204b08407e0971e0205bfdad4b74ec915930675f352b669d", size = 24690 }, + { url = "https://files.pythonhosted.org/packages/27/0a/20bf70a7eb7c6b2668ff2af798254033c32a09d6c58ec9a87cd6aa843df5/pybcj-1.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:a2562ebe5a0abec4da0229f8abb5e90ee97b178f19762eb925c1159be36828b3", size = 32581 }, + { url = "https://files.pythonhosted.org/packages/a9/b6/43977fe4296d2778c6dc67b596bb6a851eaea80f3dd4ff454e5fca8142c2/pybcj-1.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:af19bc61ded933001cd68f004ae2042bf1a78eb498a3c685ebd655fa1be90dbe", size = 23767 }, + { url = "https://files.pythonhosted.org/packages/89/c7/a61010f59406b8a45bb4865faa4b61d6b177dcfac04247fb56c7538d997d/pybcj-1.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f3f4a447800850aba7724a2274ea0a4800724520c1caf38f7d0dabf2f89a5e15", size = 23976 }, + { url = "https://files.pythonhosted.org/packages/10/7a/78848edbb6f12d9b86e375fc46135d9a204ededbf96682b05cb4b4fbd942/pybcj-1.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce1c8af7a4761d2b1b531864d84113948daa0c4245775c63bd9874cb955f4662", size = 51246 }, + { url = "https://files.pythonhosted.org/packages/9e/13/af86c86cdfb293e82dd0b6c4bbdf08645cd8993456ee3fb911c3eeed1b22/pybcj-1.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8007371f6f2b462f5aa05d5c2135d0a1bcf5b7bdd9bd15d86c730f588d10b7d3", size = 50754 }, + { url = "https://files.pythonhosted.org/packages/39/52/88600aa374b100612a1d82fca4b03eb4315e0084a05ee314ba1b771f7190/pybcj-1.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1079ca63ff8da5c936b76863690e0bd2489e8d4e0a3a340e032095dae805dd91", size = 55334 }, + { url = "https://files.pythonhosted.org/packages/56/67/3cf9747ef5b53e16a844217c6c9840be6289d05ec785500da2cc55cc25f2/pybcj-1.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e9a785eb26884429d9b9f6326e68c3638828c83bf6d42d2463c97ad5385caff2", size = 54714 }, + { url = "https://files.pythonhosted.org/packages/78/81/a71197903b503f54b85f4d352f909e701e9d26953577bd34d3fbe0520d5d/pybcj-1.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:9ea46e2d45469d13b7f25b08efcdb140220bab1ac5a850db0954591715b8caaa", size = 24693 }, + { url = "https://files.pythonhosted.org/packages/83/60/a3b43836895654aa93b5a8422adc3717359db98da9147abfabffef79f1e7/pybcj-1.0.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:21b5f2460629167340403d359289a173e0729ce8e84e3ce99462009d5d5e01a4", size = 32677 }, + { url = "https://files.pythonhosted.org/packages/50/b9/96c8d9577b0f5a701e4497408e6a331a08eb902aca8dfd4c5bb1eaab4779/pybcj-1.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:2940fb85730b9869254559c491cd83cf777e56c76a8a60df60e4be4f2a4248d7", size = 23813 }, + { url = "https://files.pythonhosted.org/packages/b7/1a/c80132feb084ec4098c0315a132799bddda8878113b5f956e21c4377f5f1/pybcj-1.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f40f3243139d675f43793a4e35c410c370f7b91ccae74e70c8b2f4877869f90e", size = 24019 }, + { url = "https://files.pythonhosted.org/packages/b1/94/62c3bf8a60b4787b46e21f43277d9cb8b6037c8ee183450f035a19a2bc4b/pybcj-1.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c2b3e60b65c7ac73e44335934e1e122da8d56db87840984601b3c5dc0ae4c19", size = 51927 }, + { url = "https://files.pythonhosted.org/packages/8b/9e/4ebd092251ef8d15408388be508617d5949cbba4baa2a6cfbb7e0a9b62c0/pybcj-1.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:746550dc7b5af4d04bb5fa4d065f18d39c925bcb5dee30db75747cd9a58bb6e8", size = 51665 }, + { url = "https://files.pythonhosted.org/packages/24/ea/da4637563468854bd361a69cd883946015f54fa119a5d9c655d26f151954/pybcj-1.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = 
"sha256:8ce9b62b6aaa5b08773be8a919ecc4e865396c969f982b685eeca6e80c82abb7", size = 56041 }, + { url = "https://files.pythonhosted.org/packages/cf/b2/9b9e670818af925ed9a0168a5c021ccfcc089637d0e6651d16fd05896425/pybcj-1.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:493eab2b1f6f546730a6de0c5ceb75ce16f3767154e8ae30e2b70d41b928b7d2", size = 55606 }, + { url = "https://files.pythonhosted.org/packages/72/e9/d6b1bdf3a5aca8f3981145a5228ad51d72e2477a55927604a4768765e915/pybcj-1.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:ef55b96b7f2ed823e0b924de902065ec42ade856366c287dbb073fabd6b90ec1", size = 24719 }, + { url = "https://files.pythonhosted.org/packages/6e/18/ca43a186a570b3f6820a24f3ad726d0f0322f0b08f5550a92f99741a2e58/pybcj-1.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:fdb7cd8271471a5979d84915c1ee57eea7e0a69c893225fc418db66883b0e2a7", size = 32548 }, + { url = "https://files.pythonhosted.org/packages/f1/10/ee383b3450f7f13fbe234668fcf143b9a1e916e0cb84f5267ff3a5c6ad60/pybcj-1.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e96ae14062bdcddc3197300e6ee4efa6fbc6749be917db934eac66d0daaecb68", size = 23743 }, + { url = "https://files.pythonhosted.org/packages/1f/90/2d2851d694a7dc3640c6309af0e534cab145e7463bc408114db7eaa20115/pybcj-1.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a54ebdc8423ba99d75372708a882fcfc3b14d9d52cf195295ad53e5a47dab37f", size = 23971 }, + { url = "https://files.pythonhosted.org/packages/9a/29/8326dbfea26d643d5a95f836103ac278eb297143d881188d94b987e3a520/pybcj-1.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3602be737c6e9553c45ae89e6b0e556f64f34dabf27d5260317d1824d31b79d3", size = 49937 }, + { url = "https://files.pythonhosted.org/packages/ac/b2/26fa2cba6bc488380515929757cafbdbf01f30184a1aa11ef7ee35bb21a2/pybcj-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63dd2ca52a48841f561bfec0fa3f208d375b0a8dcd3d7b236459e683ae29221d", size = 49517 }, + { url = "https://files.pythonhosted.org/packages/58/7b/d5e39a73202eb7b67793a4313ae5a85bdbf1470899dc2d3119c6a2414e9b/pybcj-1.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:8204a714029784b1a08a3d790430d80b423b68615c5b1e67aabca5bd5419b77d", size = 54141 }, + { url = "https://files.pythonhosted.org/packages/8e/f0/5abc858fe9d07338e485e86b2d82d1f1a0aa36c2af5271156c1140d04d15/pybcj-1.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:fde2376b180ae2620c102fbc3ef06638d306feae83964aaa5051ecbdda54845a", size = 53499 }, + { url = "https://files.pythonhosted.org/packages/9f/ac/4bad26429aab693235035f813dc60ff00ff5164acbbb98c4e26f190a21cd/pybcj-1.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:3b8d7810fb587adbffba025330cf212d9bbed8f29559656d05cb6609673f306a", size = 24686 }, +] + [[package]] name = "pycparser" version = "2.22" @@ -1371,6 +1602,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552 }, ] +[[package]] +name = "pycryptodomex" +version = "3.21.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/11/dc/e66551683ade663b5f07d7b3bc46434bf703491dbd22ee12d1f979ca828f/pycryptodomex-3.21.0.tar.gz", hash = "sha256:222d0bd05381dd25c32dd6065c071ebf084212ab79bab4599ba9e6a3e0009e6c", size = 4818543 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/22/5e/99f217d9881eead69607a2248dd7bbdf610837d7f5ad53f45a6cb71bbbfb/pycryptodomex-3.21.0-cp36-abi3-macosx_10_9_universal2.whl", hash = "sha256:34325b84c8b380675fd2320d0649cdcbc9cf1e0d1526edbe8fce43ed858cdc7e", size = 2499490 }, + { url = "https://files.pythonhosted.org/packages/ce/8f/4d0e2a859a6470289d64e39b419f01d2494dfa2e4995342d50f6c2834237/pycryptodomex-3.21.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:103c133d6cd832ae7266feb0a65b69e3a5e4dbbd6f3a3ae3211a557fd653f516", size = 1638037 }, + { url = "https://files.pythonhosted.org/packages/0c/9e/6e748c1fa814c956d356f93cf7192b19487ca56fc9e2a0bcde2bbc057601/pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77ac2ea80bcb4b4e1c6a596734c775a1615d23e31794967416afc14852a639d3", size = 2172279 }, + { url = "https://files.pythonhosted.org/packages/46/3f/f5bef92b11750af9e3516d4e69736eeeff20a2818d34611508bef5a7b381/pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9aa0cf13a1a1128b3e964dc667e5fe5c6235f7d7cfb0277213f0e2a783837cc2", size = 2258130 }, + { url = "https://files.pythonhosted.org/packages/de/4d/f0c65afd64ce435fd0547187ce6f99dfb37cdde16b05b57bca9f5c06966e/pycryptodomex-3.21.0-cp36-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:46eb1f0c8d309da63a2064c28de54e5e614ad17b7e2f88df0faef58ce192fc7b", size = 2297719 }, + { url = "https://files.pythonhosted.org/packages/1c/6a/2a1a101b0345ee70376ba93df8de6c8c01aac8341fda02970800873456a7/pycryptodomex-3.21.0-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:cc7e111e66c274b0df5f4efa679eb31e23c7545d702333dfd2df10ab02c2a2ce", size = 2164079 }, + { url = "https://files.pythonhosted.org/packages/3d/00/90a15f16c234815b660303c2d7266b41b401ea2605f3a90373e9d425e39f/pycryptodomex-3.21.0-cp36-abi3-musllinux_1_2_i686.whl", hash = "sha256:770d630a5c46605ec83393feaa73a9635a60e55b112e1fb0c3cea84c2897aa0a", size = 2333060 }, + { url = "https://files.pythonhosted.org/packages/61/74/49f5d20c514ccc631b940cc9dfec45dcce418dc84a98463a2e2ebec33904/pycryptodomex-3.21.0-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:52e23a0a6e61691134aa8c8beba89de420602541afaae70f66e16060fdcd677e", size = 2257982 }, + { url = "https://files.pythonhosted.org/packages/92/4b/d33ef74e2cc0025a259936661bb53432c5bbbadc561c5f2e023bcd73ce4c/pycryptodomex-3.21.0-cp36-abi3-win32.whl", hash = "sha256:a3d77919e6ff56d89aada1bd009b727b874d464cb0e2e3f00a49f7d2e709d76e", size = 1779052 }, + { url = "https://files.pythonhosted.org/packages/5b/be/7c991840af1184009fc86267160948350d1bf875f153c97bb471ad944e40/pycryptodomex-3.21.0-cp36-abi3-win_amd64.whl", hash = "sha256:b0e9765f93fe4890f39875e6c90c96cb341767833cfa767f41b490b506fa9ec0", size = 1816307 }, + { url = "https://files.pythonhosted.org/packages/af/ac/24125ad36778914a36f08d61ba5338cb9159382c638d9761ee19c8de822c/pycryptodomex-3.21.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:feaecdce4e5c0045e7a287de0c4351284391fe170729aa9182f6bd967631b3a8", size = 1694999 }, + { url = "https://files.pythonhosted.org/packages/93/73/be7a54a5903508070e5508925ba94493a1f326cfeecfff750e3eb250ea28/pycryptodomex-3.21.0-pp27-pypy_73-win32.whl", hash = "sha256:365aa5a66d52fd1f9e0530ea97f392c48c409c2f01ff8b9a39c73ed6f527d36c", size = 1769437 }, + { url = "https://files.pythonhosted.org/packages/e5/9f/39a6187f3986841fa6a9f35c6fdca5030ef73ff708b45a993813a51d7d10/pycryptodomex-3.21.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash 
= "sha256:3efddfc50ac0ca143364042324046800c126a1d63816d532f2e19e6f2d8c0c31", size = 1619607 }, + { url = "https://files.pythonhosted.org/packages/f8/70/60bb08e9e9841b18d4669fb69d84b64ce900aacd7eb0ebebd4c7b9bdecd3/pycryptodomex-3.21.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df2608682db8279a9ebbaf05a72f62a321433522ed0e499bc486a6889b96bf3", size = 1653571 }, + { url = "https://files.pythonhosted.org/packages/c9/6f/191b73509291c5ff0dddec9cc54797b1d73303c12b2e4017b24678e57099/pycryptodomex-3.21.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5823d03e904ea3e53aebd6799d6b8ec63b7675b5d2f4a4bd5e3adcb512d03b37", size = 1691548 }, + { url = "https://files.pythonhosted.org/packages/2d/c7/a0d3356f3074ac548afefa515ff46f3bea011deca607faf1c09b26dd5330/pycryptodomex-3.21.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:27e84eeff24250ffec32722334749ac2a57a5fd60332cd6a0680090e7c42877e", size = 1792099 }, + { url = "https://files.pythonhosted.org/packages/55/ee/9349856ee02826899fdc489016756865158217909a82dcc74cc4d55d33af/pycryptodomex-3.21.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8ef436cdeea794015263853311f84c1ff0341b98fc7908e8a70595a68cefd971", size = 1619490 }, + { url = "https://files.pythonhosted.org/packages/07/93/e68fac121fcf761fd7a85a27f024c9238217e4d943c861a856ca354f412e/pycryptodomex-3.21.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a1058e6dfe827f4209c5cae466e67610bcd0d66f2f037465daa2a29d92d952b", size = 1653481 }, + { url = "https://files.pythonhosted.org/packages/b8/47/8a39243d09fd294c339c59834ba3c92715584f3ed0d92b6bacb26f803ce0/pycryptodomex-3.21.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9ba09a5b407cbb3bcb325221e346a140605714b5e880741dc9a1e9ecf1688d42", size = 1691442 }, + { url = "https://files.pythonhosted.org/packages/21/1c/f8860c558b44776573acd719c1e86fec14d42f29cf248eaba9c770151d14/pycryptodomex-3.21.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:8a9d8342cf22b74a746e3c6c9453cb0cfbb55943410e3a2619bd9164b48dc9d9", size = 1791966 }, +] + [[package]] name = "pydrive2" version = "1.20.0" @@ -1416,6 +1675,68 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/0c/0e3c05b1c87bb6a1c76d281b0f35e78d2d80ac91b5f8f524cebf77f51049/pyparsing-3.1.4-py3-none-any.whl", hash = "sha256:a6a7ee4235a3f944aa1fa2249307708f893fe5717dc603503c6c7969c070fb7c", size = 104100 }, ] +[[package]] +name = "pyppmd" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/39/c8/9039c7503577de08a3f4c81e7619583efdc16030da6d1a25268d3dca49c8/pyppmd-1.1.0.tar.gz", hash = "sha256:1d38ce2e4b7eb84b53bc8a52380b94f66ba6c39328b8800b30c2b5bf31693973", size = 1348949 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6e/10/b19621035862e2ae12a1ba14c5b5c0a0befb27906bc00691642d7bdbdce6/pyppmd-1.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5cd428715413fe55abf79dc9fc54924ba7e518053e1fc0cbdf80d0d99cf1442", size = 75756 }, + { url = "https://files.pythonhosted.org/packages/85/4a/a7c172cd431c4e1ddf9be349dc4bcfea81c2a236d2fe51bbfdcd697af55a/pyppmd-1.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0e96cc43f44b7658be2ea764e7fa99c94cb89164dbb7cdf209178effc2168319", size = 47347 }, + { url = 
"https://files.pythonhosted.org/packages/0d/32/f7357e0412e977ede4d63ba8bf55d014e5ea5b311818b2b0a1fee6d91baa/pyppmd-1.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dd20142869094bceef5ab0b160f4fff790ad1f612313a1e3393a51fc3ba5d57e", size = 46640 }, + { url = "https://files.pythonhosted.org/packages/b5/8e/1f416819f0aab17de47b15b72d0e9b05e2bf795c6e28d9f403ac01398b74/pyppmd-1.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4f9b51e45c11e805e74ea6f6355e98a6423b5bbd92f45aceee24761bdc3d3b8", size = 135666 }, + { url = "https://files.pythonhosted.org/packages/73/ac/7d07d3ac6874f235554de392de08e6a369001db43cd6a619af4fbe02fb55/pyppmd-1.1.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:459f85e928fb968d0e34fb6191fd8c4e710012d7d884fa2b317b2e11faac7c59", size = 132892 }, + { url = "https://files.pythonhosted.org/packages/09/76/61db4268a439cfba8736b14130d928d199633fab2360a2c5043332a427d2/pyppmd-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f73cf2aaf60477eef17f5497d14b6099d8be9748390ad2b83d1c88214d050c05", size = 138901 }, + { url = "https://files.pythonhosted.org/packages/8b/9c/546729489ae07c0d7c2bfe37c69ae1cd3ce35a18ab000480ea4e8f12754f/pyppmd-1.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:2ea3ae0e92c0b5345cd3a4e145e01bbd79c2d95355481ea5d833b5c0cb202a2d", size = 139725 }, + { url = "https://files.pythonhosted.org/packages/f7/db/4e734e97541554a389e7adb2a2a5c86ad8ae35c4dafe817b12fdc317de1a/pyppmd-1.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:775172c740133c0162a01c1a5443d0e312246881cdd6834421b644d89a634b91", size = 131598 }, + { url = "https://files.pythonhosted.org/packages/b1/8f/530e47290e07d2fdedfd345fc72af08226ccdd4cc913c2b895a8396c17b6/pyppmd-1.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:14421030f1d46f69829698bdd960698a3b3df0925e3c470e82cfcdd4446b7bc1", size = 142767 }, + { url = "https://files.pythonhosted.org/packages/a5/f9/16e0adfef500b171a96ed3c95f4a4d999f99cc79de3e415146808b19c2fb/pyppmd-1.1.0-cp310-cp310-win32.whl", hash = "sha256:b691264f9962532aca3bba5be848b6370e596d0a2ca722c86df388be08d0568a", size = 41283 }, + { url = "https://files.pythonhosted.org/packages/37/8d/c4846ab632e13ead87189f31bcc51fc825c75078d162a4a9dc8aed0a5b97/pyppmd-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:216b0d969a3f06e35fbfef979706d987d105fcb1e37b0b1324f01ee143719c4a", size = 46078 }, + { url = "https://files.pythonhosted.org/packages/27/0e/9db5d7c6ca3159aa0f07c0f1d5c59079176e7c57740a61aca62a39661178/pyppmd-1.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:1f8c51044ee4df1b004b10bf6b3c92f95ea86cfe1111210d303dca44a56e4282", size = 75781 }, + { url = "https://files.pythonhosted.org/packages/f0/1b/4894b5c71feee76d3dfccf4383b59841f9bfd27aecf912b6542a2ab1e073/pyppmd-1.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ac25b3a13d1ac9b8f0bde46952e10848adc79d932f2b548a6491ef8825ae0045", size = 47370 }, + { url = "https://files.pythonhosted.org/packages/50/98/57b2c281e546f682279bd4a2577045d1f6d527c8fa2151a990b2a9bc48c2/pyppmd-1.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c8d3003eebe6aabe22ba744a38a146ed58a25633420d5da882b049342b7c8036", size = 46633 }, + { url = "https://files.pythonhosted.org/packages/06/72/b7e37aa69b7a105bcc119bc171437fbcb104aef2568b68ec8ed21a3fcdd1/pyppmd-1.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c520656bc12100aa6388df27dd7ac738577f38bf43f4a4bea78e1861e579ea5", size = 138233 }, + { url = 
"https://files.pythonhosted.org/packages/60/73/4f53a3c7730e1cba3f210b35ed6779e0fe302739196f43452664e079c0b5/pyppmd-1.1.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8c2a3e807028159a705951f5cb5d005f94caed11d0984e59cc50506de543e22d", size = 135486 }, + { url = "https://files.pythonhosted.org/packages/31/7c/956ebf1f07506bb59e6f13ef068d91f1bec828758d399b455b175b668f6c/pyppmd-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec8a2447e69444703e2b273247bfcd4b540ec601780eff07da16344c62d2993d", size = 141183 }, + { url = "https://files.pythonhosted.org/packages/73/b4/4863499e012c555f4619dbebc5b83d79818e0161d9b6fb8b1e709fb1d6c7/pyppmd-1.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b9e0c8053e69cad6a92a0889b3324f567afc75475b4f54727de553ac4fc85780", size = 141752 }, + { url = "https://files.pythonhosted.org/packages/b4/cc/44e175222b31f86d0192d1d0d2c46c4bf0e933c9a06a65ff39596ad05666/pyppmd-1.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:5938d256e8d2a2853dc3af8bb58ae6b4a775c46fc891dbe1826a0b3ceb624031", size = 133921 }, + { url = "https://files.pythonhosted.org/packages/f1/d9/2f2e222d43ab274909e8dcd16d25cd4cc0245a8d59f93f8d6397cd4dc49f/pyppmd-1.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1ce5822d8bea920856232ccfb3c26b56b28b6846ea1b0eb3d5cb9592a026649e", size = 145191 }, + { url = "https://files.pythonhosted.org/packages/6d/e7/1214571442624e2314ed1ed5ba0081358335fc760fb455c3d8df83b118c6/pyppmd-1.1.0-cp311-cp311-win32.whl", hash = "sha256:2a9e894750f2a52b03e3bc0d7cf004d96c3475a59b1af7e797d808d7d29c9ffe", size = 41286 }, + { url = "https://files.pythonhosted.org/packages/8e/7f/d3cc8443bd2b56bc54ea205dcf73d70ef8d4342096ff33fc8719956f45e9/pyppmd-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:969555c72e72fe2b4dd944127521a8f2211caddb5df452bbc2506b5adfac539e", size = 46087 }, + { url = "https://files.pythonhosted.org/packages/bf/0b/4c8e3a92c4366a9aa2d801ab4bd7ba72bd1d214da890dd91ab4d73e52878/pyppmd-1.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9d6ef8fd818884e914bc209f7961c9400a4da50d178bba25efcef89f09ec9169", size = 76116 }, + { url = "https://files.pythonhosted.org/packages/e1/0b/45fdf5a28c810ed4d3c0cb05ae5346e2972cdbfe89f374b263e07c5b820d/pyppmd-1.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:95f28e2ecf3a9656bd7e766aaa1162b6872b575627f18715f8b046e8617c124a", size = 47633 }, + { url = "https://files.pythonhosted.org/packages/56/a4/4aa1d36d98f3786c8b12ac96ac8234d7dc3c2a9e8f5174a5698f424099ec/pyppmd-1.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:37f3557ea65ee417abcdf5f49d35df00bb9f6f252639cae57aeefcd0dd596133", size = 46704 }, + { url = "https://files.pythonhosted.org/packages/d9/70/a49389a6666f670db5ecc7caa37030c9a9abfeea455c387172584551a271/pyppmd-1.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e84b25d088d7727d50218f57f92127cdb839acd6ec3de670b6680a4cf0b2d2a", size = 139145 }, + { url = "https://files.pythonhosted.org/packages/30/4c/f08cdf618744a3cce0da106ecf6e427b24d27b0bb1484afc40b88ca23a39/pyppmd-1.1.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:99ed42891986dac8c2ecf52bddfb777900233d867aa18849dbba6f3335600466", size = 136618 }, + { url = "https://files.pythonhosted.org/packages/bb/e0/afc0fb971c893e9e72cc8d70df93c50b3f3ebb12b4bdb21f869b775faf7e/pyppmd-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6fe69b82634488ada75ba07efb90cd5866fa3d64a2c12932b6e8ae207a14e5f", size = 142757 
}, + { url = "https://files.pythonhosted.org/packages/26/b2/793e92c7a66de0b0b8d777c3c4df3ee5a5bec7fbaf0b69ab7374cefefa43/pyppmd-1.1.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:60981ffde1fe6ade750b690b35318c41a1160a8505597fda2c39a74409671217", size = 142749 }, + { url = "https://files.pythonhosted.org/packages/5e/6e/a1bf750bc7ed025a06600c65917d02e3c6dea7dfa728746c7251d4910d37/pyppmd-1.1.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:46e8240315476f57aac23d71e6de003e122b65feba7c68f4cc46a089a82a7cd4", size = 135033 }, + { url = "https://files.pythonhosted.org/packages/1e/ee/4a12a4b1990f1fabb77f9ef94d2cd6c795690eec79ad135b8236dc59dbd2/pyppmd-1.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:c0308e2e76ecb4c878a18c2d7a7c61dbca89b4ef138f65d5f5ead139154dcdea", size = 146510 }, + { url = "https://files.pythonhosted.org/packages/04/cd/a6571420345315f5340ac10897726303ae07260cb025dc4a60371d1e8b97/pyppmd-1.1.0-cp312-cp312-win32.whl", hash = "sha256:b4fa4c27dc1314d019d921f2aa19e17f99250557e7569eeb70e180558f46af74", size = 41332 }, + { url = "https://files.pythonhosted.org/packages/c0/a4/af77129d671d6adcc6c82e1b0f03f0ad0b70c44ac70ed4c72b5c8952553b/pyppmd-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:c269d21e15f4175df27cf00296476097af76941f948734c642d7fb6e85b9b3b9", size = 46193 }, + { url = "https://files.pythonhosted.org/packages/b4/10/144f811290a36d6cf5b5c8ae9b68533abe4bea160285be73435d55c361e1/pyppmd-1.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:2aeea1bf585c6b8771fa43a6abd704da92f8a46a6d0020953af15d7f3c82e48c", size = 75760 }, + { url = "https://files.pythonhosted.org/packages/dc/2b/242b6ba7938e77f14dadc0e5d638288b5588f0aff5e5d2c0428726606e5e/pyppmd-1.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7759bdb137694d4ab0cfa5ff2c75c212d90714c7da93544694f68001a0c38e12", size = 47339 }, + { url = "https://files.pythonhosted.org/packages/ec/80/91a13a5d0da916e7243f66839941976b2729bac1d3ca1737c20f1d59b216/pyppmd-1.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:db64a4fe956a2e700a737a1d019f526e6ccece217c163b28b354a43464cc495b", size = 46636 }, + { url = "https://files.pythonhosted.org/packages/aa/18/f6126af21186eee49f9aa090c36acc9d2bccef4c7d077d23b2f24dfb804c/pyppmd-1.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f788ae8f5a9e79cd777b7969d3401b2a2b87f47abe306c2a03baca30595e9bd", size = 135484 }, + { url = "https://files.pythonhosted.org/packages/03/55/ebfeb5d1085f8a2bd03f498aa7e6ef9635380bf1f88badd1f3b944198ada/pyppmd-1.1.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:324a178935c140210fca2043c688b77e79281da8172d2379a06e094f41735851", size = 132741 }, + { url = "https://files.pythonhosted.org/packages/df/92/f0a7a6e372c4bd659b5528ff179676522aa72bd8c7a071e757a490ff988e/pyppmd-1.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:363030bbcb7902fb9eeb59ffc262581ca5dd7790ba950328242fd2491c54d99b", size = 138761 }, + { url = "https://files.pythonhosted.org/packages/43/32/0f32a70ef3fbe287dc53fd86408e9c2f60515ab356bd728bde9fcebcb598/pyppmd-1.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:31b882584f86440b0ff7906385c9f9d9853e5799197abaafdae2245f87d03f01", size = 139568 }, + { url = "https://files.pythonhosted.org/packages/fd/02/730882ea61653af4ef22b3621e9a03e85e61c90884fb47df4495706439f5/pyppmd-1.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:b991b4501492ec3380b605fe30bee0b61480d305e98519d81c2a658b2de01593", size = 131441 }, + { url = 
"https://files.pythonhosted.org/packages/9e/cd/624b6582766b8e60c4356ebcfe73504c81d1395f36d0d409ff081be56e3d/pyppmd-1.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b6108044d943b826f97a9e79201242f61392d6c1fadba463b2069c4e6bc961e1", size = 142558 }, + { url = "https://files.pythonhosted.org/packages/65/46/1908b1ef3ba6f1450bc1a3e45a6b39cfe4b33456a312d1add0041ba1bbe4/pyppmd-1.1.0-cp39-cp39-win32.whl", hash = "sha256:c45ce2968b7762d2cacf622b0a8f260295c6444e0883fd21a21017e3eaef16ed", size = 41284 }, + { url = "https://files.pythonhosted.org/packages/63/29/081b03f989deb7ce2f70461dac12ecc422e9abef6b7b7a1933945c96b06f/pyppmd-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:f5289f32ab4ec5f96a95da51309abd1769f928b0bff62047b3bc25c878c16ccb", size = 46071 }, + { url = "https://files.pythonhosted.org/packages/6a/e2/1d5fbd6dde1234b635000072c8d1d87c7ed3acf01a3c4aa8082504d58bc5/pyppmd-1.1.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ad5da9f7592158e6b6b51d7cd15e536d8b23afbb4d22cba4e5744c7e0a3548b1", size = 41505 }, + { url = "https://files.pythonhosted.org/packages/24/66/9215c5dda61b3aa3259902a586dacd198b4b0793ab99228734091b5e7fa7/pyppmd-1.1.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc6543e7d12ef0a1466d291d655e3d6bca59c7336dbb53b62ccdd407822fb52b", size = 44814 }, + { url = "https://files.pythonhosted.org/packages/1a/87/cc2aa429688f238ae30f26b8334194a21e25643d3257c9e5b14cccdc578e/pyppmd-1.1.0-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a5e4008a45910e3c8c227f6f240de67eb14454c015dc3d8060fc41e230f395d3", size = 43629 }, + { url = "https://files.pythonhosted.org/packages/9f/96/cd3f64f6bdce091ffb6d2c1c23dc91e8b94e312a5d08cd648625555fb69e/pyppmd-1.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9301fa39d1fb0ed09a10b4c5d7f0074113e96a1ead16ba7310bedf95f7ef660c", size = 43911 }, + { url = "https://files.pythonhosted.org/packages/e6/ab/02ab90e2dddf2dd55e30e64fa0509627c6e0c86b26503a6df95ae55b1e45/pyppmd-1.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:59521a3c6028da0cb5780ba16880047b00163432a6b975da2f6123adfc1b0be8", size = 42427 }, + { url = "https://files.pythonhosted.org/packages/fa/0e/05db05c0da6a9bbb1f32de107b9f92e95ca9cb407c2082c7a0bee0a8868b/pyppmd-1.1.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:cce8cd2d4ceebe2dbf41db6dfebe4c2e621314b3af8a2df2cba5eb5fa277f122", size = 41500 }, + { url = "https://files.pythonhosted.org/packages/ba/89/48b01bcab274c59548897de336b807777b8e4abbd0465ed37c04152d13e2/pyppmd-1.1.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:62e57927dbcb91fb6290a41cd83743b91b9d85858efb16a0dd34fac208ee1c6b", size = 44810 }, + { url = "https://files.pythonhosted.org/packages/a9/e4/bf4aa2305eda1bdf530d8127e07dd324ae923e664db3aa7bf153f1d749ec/pyppmd-1.1.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:435317949a6f35e54cdf08e0af6916ace427351e7664ac1593980114668f0aaa", size = 43625 }, + { url = "https://files.pythonhosted.org/packages/fc/be/33478c13fad90049c35034d9f7de658353f82fc2cd91c57efd904b71ebb7/pyppmd-1.1.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f66b0d0e32b8fb8707f1d2552f13edfc2917e8ed0bdf4d62e2ce190d2c70834", size = 43909 }, + { url = "https://files.pythonhosted.org/packages/60/34/922d8ca6879f08d17f9771e6ef65e9491c2ebbd48934997a3ff01285e55e/pyppmd-1.1.0-pp39-pypy39_pp73-win_amd64.whl", hash = 
"sha256:650a663a591e06fb8096c213f4070b158981c8c3bf9c166ce7e4c360873f2750", size = 42422 }, +] + [[package]] name = "pyright" version = "1.1.373" @@ -1648,6 +1969,96 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ca/63/27e6142b4f67a442ee480986ca5b88edb01462dd2319843057683a5148bd/pyzmq-26.2.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:4f78c88905461a9203eac9faac157a2a0dbba84a0fd09fd29315db27be40af9f", size = 550757 }, ] +[[package]] +name = "pyzstd" +version = "0.16.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/62/14/878fee4072cecb1cc6e061c7d0d933e481389c27de939538c9cc3f18894a/pyzstd-0.16.2.tar.gz", hash = "sha256:179c1a2ea1565abf09c5f2fd72f9ce7c54b2764cf7369e05c0bfd8f1f67f63d2", size = 789505 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3d/a9/efad061c5a982f859ba8bf5de565d73567f87ad8bba3364fe28e9a8672b6/pyzstd-0.16.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:637376c8f8cbd0afe1cab613f8c75fd502bd1016bf79d10760a2d5a00905fe62", size = 372191 }, + { url = "https://files.pythonhosted.org/packages/b6/36/eb6dcfacb273ca13dfa20d296f27ffd0a6c53677965f868625edf764b71e/pyzstd-0.16.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3e7a7118cbcfa90ca2ddbf9890c7cb582052a9a8cf2b7e2c1bbaf544bee0f16a", size = 295083 }, + { url = "https://files.pythonhosted.org/packages/fb/76/a7862487402123f221439808ed50915e00cfc8e1df7365af366610176347/pyzstd-0.16.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a74cb1ba05876179525144511eed3bd5a509b0ab2b10632c1215a85db0834dfd", size = 390166 }, + { url = "https://files.pythonhosted.org/packages/b8/52/1e1ab63026d67f18b9841285576d59bb799b838a5de4f852ad9e054674a1/pyzstd-0.16.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7c084dde218ffbf112e507e72cbf626b8f58ce9eb23eec129809e31037984662", size = 472043 }, + { url = "https://files.pythonhosted.org/packages/0d/24/14c8948b9d16d399ff80504bc404bb091b0eb5339f6fbdad0481da751c09/pyzstd-0.16.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4646459ebd3d7a59ddbe9312f020bcf7cdd1f059a2ea07051258f7af87a0b31", size = 415258 }, + { url = "https://files.pythonhosted.org/packages/6b/3e/e4c7f449af9d19975ff5d333a58330317cf8b05fe4754106c694a29e7c25/pyzstd-0.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14bfc2833cc16d7657fc93259edeeaa793286e5031b86ca5dc861ba49b435fce", size = 413680 }, + { url = "https://files.pythonhosted.org/packages/10/09/8918853028cf593c141456b9a42d68420beec3f16a8cc4f1aa5d0b8b0c84/pyzstd-0.16.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f27d488f19e5bf27d1e8aa1ae72c6c0a910f1e1ffbdf3c763d02ab781295dd27", size = 412630 }, + { url = "https://files.pythonhosted.org/packages/47/20/5a4c899530571e0e8ecdcb9dc7e3fc38491d4b342fbd7d8413805c88013b/pyzstd-0.16.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:91e134ca968ff7dcfa8b7d433318f01d309b74ee87e0d2bcadc117c08e1c80db", size = 404980 }, + { url = "https://files.pythonhosted.org/packages/0a/1d/aeeeebb702d3500a01b5b1029ba1716aea3afa75e8aacb904806b3f1afe5/pyzstd-0.16.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:6b5f64cd3963c58b8f886eb6139bb8d164b42a74f8a1bb95d49b4804f4592d61", size = 418000 }, + { url = "https://files.pythonhosted.org/packages/fc/0c/66ca36d24ad97af40a8fe8de9e3f316a5f4fd2fb3cab8634a2f7da5571c8/pyzstd-0.16.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = 
"sha256:0b4a8266871b9e0407f9fd8e8d077c3558cf124d174e6357b523d14f76971009", size = 485576 }, + { url = "https://files.pythonhosted.org/packages/39/66/6c1de1347de94aa85f60e854cccae0948bda2eda2351e4d47c8bb0a7cf18/pyzstd-0.16.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:1bb19f7acac30727354c25125922aa59f44d82e0e6a751df17d0d93ff6a73853", size = 564542 }, + { url = "https://files.pythonhosted.org/packages/6d/46/75365a3ab279d58e69d410ce0a21527e689fa651837227e23dee294d096f/pyzstd-0.16.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3008325b7368e794d66d4d98f2ee1d867ef5afd09fd388646ae02b25343c420d", size = 430619 }, + { url = "https://files.pythonhosted.org/packages/0d/62/17bf81d42acbd39bffdea559b6fbd7ec331cd74bc52f249e536fefe5480d/pyzstd-0.16.2-cp310-cp310-win32.whl", hash = "sha256:66f2d5c0bbf5bf32c577aa006197b3525b80b59804450e2c32fbcc2d16e850fd", size = 218224 }, + { url = "https://files.pythonhosted.org/packages/f7/b6/281245890df08a567186c6e262c43d68581291cca107c8d7304c37708e46/pyzstd-0.16.2-cp310-cp310-win_amd64.whl", hash = "sha256:5fe5f5459ebe1161095baa7a86d04ab625b35148f6c425df0347ed6c90a2fd58", size = 245012 }, + { url = "https://files.pythonhosted.org/packages/10/5a/19d7aec81853f6dc53eabad388227e3beecfaca4788af23b8807a0ea2112/pyzstd-0.16.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1c1bdbe7f01c7f37d5cd07be70e32a84010d7dfd6677920c0de04cf7d245b60d", size = 372192 }, + { url = "https://files.pythonhosted.org/packages/29/35/2eb025e6a0fff49b5de8bea20e82e4d7d5456e634bf3809123fbe5e5f194/pyzstd-0.16.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1882a3ceaaf9adc12212d587d150ec5e58cfa9a765463d803d739abbd3ac0f7a", size = 295084 }, + { url = "https://files.pythonhosted.org/packages/04/1f/03785d7ff1ce73b9347533f798cb27afa57768e66012f97b18b7b7303158/pyzstd-0.16.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea46a8b9d60f6a6eba29facba54c0f0d70328586f7ef0da6f57edf7e43db0303", size = 390167 }, + { url = "https://files.pythonhosted.org/packages/b7/59/e307622115a2df30075efbd28933dc0ad8f2007c5ba5a3eb49c956de3d56/pyzstd-0.16.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d7865bc06589cdcecdede0deefe3da07809d5b7ad9044c224d7b2a0867256957", size = 472038 }, + { url = "https://files.pythonhosted.org/packages/97/21/870fda5454240089e9c37625320580d392b03beaeae4889c67c0a21c4d34/pyzstd-0.16.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:52f938a65b409c02eb825e8c77fc5ea54508b8fc44b5ce226db03011691ae8cc", size = 415217 }, + { url = "https://files.pythonhosted.org/packages/3c/35/b33faeeb9c96fddd08bf7871c9f5c4638c32ad79227155922fd4a63190c5/pyzstd-0.16.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e97620d3f53a0282947304189deef7ca7f7d0d6dfe15033469dc1c33e779d5e5", size = 413714 }, + { url = "https://files.pythonhosted.org/packages/aa/a3/b9058dd43eb52025a2ca78946dcb9ef9d8984acac172a698bcf12712217c/pyzstd-0.16.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7c40e9983d017108670dc8df68ceef14c7c1cf2d19239213274783041d0e64c", size = 412568 }, + { url = "https://files.pythonhosted.org/packages/12/31/fe7d462c912f2040775bfa2af4327f9fcebb16e8fa9c3bfa058bc1306722/pyzstd-0.16.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7cd4b3b2c6161066e4bde6af1cf78ed3acf5d731884dd13fdf31f1db10830080", size = 404988 }, + { url = 
"https://files.pythonhosted.org/packages/48/4c/582aca0e5210436499bce1639a8d15da3f76f8d5827da1aa3eeb2c4e271c/pyzstd-0.16.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:454f31fd84175bb203c8c424f2255a343fa9bd103461a38d1bf50487c3b89508", size = 417961 }, + { url = "https://files.pythonhosted.org/packages/39/e9/54f53641ff10b4ea18d3ba159b03bd07e6ae5a5b7ae01f1329b0c35b8ca2/pyzstd-0.16.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:5ef754a93743f08fb0386ce3596780bfba829311b49c8f4107af1a4bcc16935d", size = 485587 }, + { url = "https://files.pythonhosted.org/packages/ce/65/25243b3fea9e52a20bfece1b12e3d3ee3125f17b1735aab08cb9a7a760b4/pyzstd-0.16.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:be81081db9166e10846934f0e3576a263cbe18d81eca06e6a5c23533f8ce0dc6", size = 564543 }, + { url = "https://files.pythonhosted.org/packages/3b/3c/324b8ddca55b4b073b413cea3e0587af3c8153ccf7d6d63ed294831f2095/pyzstd-0.16.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:738bcb2fa1e5f1868986f5030955e64de53157fa1141d01f3a4daf07a1aaf644", size = 430628 }, + { url = "https://files.pythonhosted.org/packages/db/a1/aca18925e23bceb833fc742ebaf87aa9d1ba8b178f0332bd108fc8966482/pyzstd-0.16.2-cp311-cp311-win32.whl", hash = "sha256:0ea214c9b97046867d1657d55979021028d583704b30c481a9c165191b08d707", size = 218215 }, + { url = "https://files.pythonhosted.org/packages/c0/7f/0f5d1d1891e6c6e14d846d2881a06ab7e5e97cabeb5e1e9e53debec4091a/pyzstd-0.16.2-cp311-cp311-win_amd64.whl", hash = "sha256:c17c0fc02f0e75b0c7cd21f8eaf4c6ce4112333b447d93da1773a5f705b2c178", size = 245055 }, + { url = "https://files.pythonhosted.org/packages/28/15/20046759d138733e7150afa6aa15f322022d7587968e2dbd5b36fbf8aa86/pyzstd-0.16.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d4081fd841a9efe9ded7290ee7502dbf042c4158b90edfadea3b8a072c8ec4e1", size = 373230 }, + { url = "https://files.pythonhosted.org/packages/51/8d/55b536edaecf19d2f8dbd8fbaefd184f2f9cc6b71d241caa6d86bed96813/pyzstd-0.16.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fd3fa45d2aeb65367dd702806b2e779d13f1a3fa2d13d5ec777cfd09de6822de", size = 295699 }, + { url = "https://files.pythonhosted.org/packages/11/14/086e7f690154c6f3d9bdb46da26a4cd3c9e0b284346ce10943711ca48c32/pyzstd-0.16.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8b5f0d2c07994a5180d8259d51df6227a57098774bb0618423d7eb4a7303467", size = 390556 }, + { url = "https://files.pythonhosted.org/packages/90/d2/c6d854705d6fa0ad876209b4ba796ab31d85b710d1459029f2cb41085a8d/pyzstd-0.16.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:60c9d25b15c7ae06ed5d516d096a0d8254f9bed4368b370a09cccf191eaab5cb", size = 472928 }, + { url = "https://files.pythonhosted.org/packages/aa/38/f97dd871e446adc834349caa605dbaf5bac86763a255f62c809cc2459c85/pyzstd-0.16.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:29acf31ce37254f6cad08deb24b9d9ba954f426fa08f8fae4ab4fdc51a03f4ae", size = 416057 }, + { url = "https://files.pythonhosted.org/packages/53/be/0c5ad7bf29dc890f6a3303760b9802aeeafa4e3ffb598de625f501986bfe/pyzstd-0.16.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ec77612a17697a9f7cf6634ffcee616eba9b997712fdd896e77fd19ab3a0618", size = 414613 }, + { url = "https://files.pythonhosted.org/packages/1f/1a/d3a1edcd59e2f62a35ac6257d2b86a2c872ae9a8e925380620a8db0d9a9a/pyzstd-0.16.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:313ea4974be93be12c9a640ab40f0fc50a023178aae004a8901507b74f190173", size = 413236 }, + { url = "https://files.pythonhosted.org/packages/f2/8d/912430c2310466c14a89a5a529b72eddef7e73fa733806dbe0b030cf3495/pyzstd-0.16.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e91acdefc8c2c6c3b8d5b1b5fe837dce4e591ecb7c0a2a50186f552e57d11203", size = 405536 }, + { url = "https://files.pythonhosted.org/packages/9e/83/4edb419a13b9d1e1debc01e88084eba93a5f7c10ef198da11f6782857c73/pyzstd-0.16.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:929bd91a403539e72b5b5cb97f725ac4acafe692ccf52f075e20cd9bf6e5493d", size = 419145 }, + { url = "https://files.pythonhosted.org/packages/8f/e9/62a169eddc37aefac480ee3b3318c221f6731e1e342dafd9e05b7fdaa7c5/pyzstd-0.16.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:740837a379aa32d110911ebcbbc524f9a9b145355737527543a884bd8777ca4f", size = 487157 }, + { url = "https://files.pythonhosted.org/packages/57/9d/5949f2a0144d1f99fab7914f854b582d2784c73139cc190e603e4d6b7b37/pyzstd-0.16.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:adfc0e80dd157e6d1e0b0112c8ecc4b58a7a23760bd9623d74122ef637cfbdb6", size = 565918 }, + { url = "https://files.pythonhosted.org/packages/de/ce/647b9c7602ac477c9e62cf9399810f72bb5dba8f508e7cdf8be1d260e6f9/pyzstd-0.16.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:79b183beae1c080ad3dca39019e49b7785391947f9aab68893ad85d27828c6e7", size = 431373 }, + { url = "https://files.pythonhosted.org/packages/8b/fb/4141e3d4549eea26e5a59ec723eade271980816cb2ed7613df855baa672f/pyzstd-0.16.2-cp312-cp312-win32.whl", hash = "sha256:b8d00631a3c466bc313847fab2a01f6b73b3165de0886fb03210e08567ae3a89", size = 218541 }, + { url = "https://files.pythonhosted.org/packages/51/b9/e1373b179129c2095d70bd1df02a51d388f4c7e4ecb62acb4e5e9570269b/pyzstd-0.16.2-cp312-cp312-win_amd64.whl", hash = "sha256:c0d43764e9a60607f35d8cb3e60df772a678935ab0e02e2804d4147377f4942c", size = 245320 }, + { url = "https://files.pythonhosted.org/packages/66/10/cc7c764c7673f1af1728abdcf58e58f88ef5d44ab4500677a2b7b4c01e7d/pyzstd-0.16.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3ae9ae7ad730562810912d7ecaf1fff5eaf4c726f4b4dfe04784ed5f06d7b91f", size = 373223 }, + { url = "https://files.pythonhosted.org/packages/3f/a7/bcaf7d635ee929dd4d08ae1c35101892db56a11542471eecfbf46b9dd988/pyzstd-0.16.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2ce8d3c213f76a564420f3d0137066ac007ce9fb4e156b989835caef12b367a7", size = 295701 }, + { url = "https://files.pythonhosted.org/packages/93/49/a604113a2f3135b29371a894c0faad22d7ea3f7b58f38d77baad8a817483/pyzstd-0.16.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c2c14dac23c865e2d78cebd9087e148674b7154f633afd4709b4cd1520b99a61", size = 392395 }, + { url = "https://files.pythonhosted.org/packages/b0/38/886ecf3ebb13a4b6e3ee85f448f54eef37a5ae2b453bd9d5d9edc909e119/pyzstd-0.16.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4527969d66a943e36ef374eda847e918077de032d58b5df84d98ffd717b6fa77", size = 474523 }, + { url = "https://files.pythonhosted.org/packages/14/98/121da6ac072c00090c218b4888ef00ead15979f09a657d9a5ff770d6bb17/pyzstd-0.16.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd8256149b88e657e99f31e6d4b114c8ff2935951f1d8bb8e1fe501b224999c0", size = 417974 }, + { url = 
"https://files.pythonhosted.org/packages/b6/ba/56652a67c0bcfaceb2945e5f07d5aa21af86e07cf33d1ae47bb3529a56c3/pyzstd-0.16.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5bd1f1822d65c9054bf36d35307bf8ed4aa2d2d6827431761a813628ff671b1d", size = 414587 }, + { url = "https://files.pythonhosted.org/packages/cc/30/cab6f45101f0113ced609ef65482aedd276e0f022d9f25a327d4284142f5/pyzstd-0.16.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f6733f4d373ec9ad2c1976cf06f973a3324c1f9abe236d114d6bb91165a397d", size = 415071 }, + { url = "https://files.pythonhosted.org/packages/6d/44/2187fc8a46662926943aeb16d639dd4f3d06267c7e8abb2c6f97700ab11c/pyzstd-0.16.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7bec165ab6524663f00b69bfefd13a46a69fed3015754abaf81b103ec73d92c6", size = 407835 }, + { url = "https://files.pythonhosted.org/packages/de/d5/6edca97d5453cba820d2ad5630e6ec1fcfad66f69af5ad7d6c688ea301be/pyzstd-0.16.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:e4460fa6949aac6528a1ad0de8871079600b12b3ef4db49316306786a3598321", size = 421755 }, + { url = "https://files.pythonhosted.org/packages/54/c1/1a0339e014ed97f4e6fd9166b0409ceda8f32e28e8ecda70fd7bb0915566/pyzstd-0.16.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:75df79ea0315c97d88337953a17daa44023dbf6389f8151903d371513f503e3c", size = 489174 }, + { url = "https://files.pythonhosted.org/packages/07/01/c65f2c9f0b902b33efcb0bdf3cbd07fc828fda6ff6333189eb71cf7acc60/pyzstd-0.16.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:93e1d45f4a196afb6f18682c79bdd5399277ead105b67f30b35c04c207966071", size = 573025 }, + { url = "https://files.pythonhosted.org/packages/a7/54/7ab9cc54171b7f8bb97cfd1c1aa7fcb706a4babeb629732529d8111bc4e6/pyzstd-0.16.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:075e18b871f38a503b5d23e40a661adfc750bd4bd0bb8b208c1e290f3ceb8fa2", size = 429582 }, + { url = "https://files.pythonhosted.org/packages/6c/a5/f9c950bb378dd1335bc4cc56444ec2ab40b1dab085c5798c5d16a9bf9d0b/pyzstd-0.16.2-cp313-cp313-win32.whl", hash = "sha256:9e4295eb299f8d87e3487852bca033d30332033272a801ca8130e934475e07a9", size = 218544 }, + { url = "https://files.pythonhosted.org/packages/9a/df/a15b9a8a59cd9908ae2b70bce2cb4ac3e2d7da11414ee0d0ceb46e4d0439/pyzstd-0.16.2-cp313-cp313-win_amd64.whl", hash = "sha256:18deedc70f858f4cf574e59f305d2a0678e54db2751a33dba9f481f91bc71c28", size = 245313 }, + { url = "https://files.pythonhosted.org/packages/e0/38/43002103a545bc953e532973596e905550e9626973c1b282e04e01038ac6/pyzstd-0.16.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a9892b707ef52f599098b1e9528df0e7849c5ec01d3e8035fb0e67de4b464839", size = 372192 }, + { url = "https://files.pythonhosted.org/packages/61/be/28dfeba9dbad8ed19d6aefa0d6623d1ee97e83c6c1e97910439428655f28/pyzstd-0.16.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4fbd647864341f3c174c4a6d7f20e6ea6b4be9d840fb900dc0faf0849561badc", size = 295080 }, + { url = "https://files.pythonhosted.org/packages/63/c2/c7e5244f2dde72df3fb2b7b952e8d01bac20cd78dc0d585d0a060ca565b0/pyzstd-0.16.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20ac2c15656cc6194c4fed1cb0e8159f9394d4ea1d58be755448743d2ec6c9c4", size = 390165 }, + { url = "https://files.pythonhosted.org/packages/ff/30/52560cb88179fa3ff7536429c0d7b83aeecea86ecb2d180a4afc991502e5/pyzstd-0.16.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:b239fb9a20c1be3374b9a2bd183ba624fd22ad7a3f67738c0d80cda68b4ae1d3", size = 472040 }, + { url = "https://files.pythonhosted.org/packages/69/a7/ab1e19626da5a8ff58493d6928d9d0da4931034e7a124949bf1a1705daaf/pyzstd-0.16.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cc52400412cdae2635e0978b8d6bcc0028cc638fdab2fd301f6d157675d26896", size = 415255 }, + { url = "https://files.pythonhosted.org/packages/28/0d/bf7c9388fe43c7051a2ced4645e58a493a35c62e68307b5aaf0fb129b008/pyzstd-0.16.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b766a6aeb8dbb6c46e622e7a1aebfa9ab03838528273796941005a5ce7257b1", size = 413679 }, + { url = "https://files.pythonhosted.org/packages/58/2a/1e0738740a8bd2b1f4a74be86297c5776936b66b3a5340d8e4ae84c5844f/pyzstd-0.16.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abd4b8676052f9d59579242bf3cfe5fd02532b6a9a93ab7737c118ae3b8509dc", size = 412623 }, + { url = "https://files.pythonhosted.org/packages/23/d5/7cbfbebbb3ffccb0626fc2fab622fb5a10cf66c2c60481f51e46a92eb2c5/pyzstd-0.16.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1c6c0a677aac7c0e3d2d2605d4d68ffa9893fdeeb2e071040eb7c8750969d463", size = 404981 }, + { url = "https://files.pythonhosted.org/packages/a7/b0/6ac198c753cc135357630e856f40f5998c2d28609713ae2830c679e8248c/pyzstd-0.16.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:15f9c2d612e7e2023d68d321d1b479846751f792af89141931d44e82ae391394", size = 417997 }, + { url = "https://files.pythonhosted.org/packages/c6/8f/0e5685efbf24ae62e135549e37947ca7919616b81108584112e25dd1a55a/pyzstd-0.16.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:11740bff847aad23beef4085a1bb767d101895881fe891f0a911aa27d43c372c", size = 485576 }, + { url = "https://files.pythonhosted.org/packages/30/d6/bf2f05752082967ac748d7c2d7c5a71097ac6fc1b902b5d34764cd0c12f7/pyzstd-0.16.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:b9067483ebe860e4130a03ee665b3d7be4ec1608b208e645d5e7eb3492379464", size = 564538 }, + { url = "https://files.pythonhosted.org/packages/d8/97/1081cc3cbf5eeb6cf4e385226e9989fdebb61f8e48baa210eb774145e667/pyzstd-0.16.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:988f0ba19b14c2fe0afefc444ac1edfb2f497b7d7c3212b2f587504cc2ec804e", size = 430615 }, + { url = "https://files.pythonhosted.org/packages/e0/a7/2a82fbb248b951434306dd77e969fb99305968904c9a7494574d696b1392/pyzstd-0.16.2-cp39-cp39-win32.whl", hash = "sha256:8855acb1c3e3829030b9e9e9973b19e2d70f33efb14ad5c474b4d086864c959c", size = 218215 }, + { url = "https://files.pythonhosted.org/packages/9d/bf/e529ff84b87c8f978ab35906921ac54841270562e65bcb5d0dd9d3240204/pyzstd-0.16.2-cp39-cp39-win_amd64.whl", hash = "sha256:018e88378df5e76f5e1d8cf4416576603b6bc4a103cbc66bb593eaac54c758de", size = 245047 }, + { url = "https://files.pythonhosted.org/packages/f9/ad/c09fb722c12a82b826c97efc50a919e229bfbaf644f5a140adcd71941473/pyzstd-0.16.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4b631117b97a42ff6dfd0ffc885a92fff462d7c34766b28383c57b996f863338", size = 364187 }, + { url = "https://files.pythonhosted.org/packages/57/f9/93175fe72f85fb675fe04abca296fe583112a25d0ec7faa026288d9463c2/pyzstd-0.16.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:56493a3fbe1b651a02102dd0902b0aa2377a732ff3544fb6fb3f114ca18db52f", size = 279825 }, + { url = 
"https://files.pythonhosted.org/packages/8a/de/0b40acf76d7ed1f7975877535e004de85ec2e869632754b5d4d389258b8a/pyzstd-0.16.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1eae9bdba4a1e5d3181331f403114ff5b8ce0f4b569f48eba2b9beb2deef1e4", size = 321313 }, + { url = "https://files.pythonhosted.org/packages/41/5e/00102bacd1a7c957c88098f3ae2cdac17842ac0f94d2e685ff5b75a05730/pyzstd-0.16.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1be6972391c8aeecc7e61feb96ffc8e77a401bcba6ed994e7171330c45a1948", size = 344376 }, + { url = "https://files.pythonhosted.org/packages/a3/95/27a7da3dbd4460cd9432bdc22d9d5f8ec77c86275d069020fa74ea280f7f/pyzstd-0.16.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:761439d687e3a5687c2ff5c6a1190e1601362a4a3e8c6c82ff89719d51d73e19", size = 328591 }, + { url = "https://files.pythonhosted.org/packages/c2/03/8f4d5fd45f6bfad66d67cdf583492a9f52a21049f60e6b36a7e9f8aa7adc/pyzstd-0.16.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f5fbdb8cf31b60b2dc586fecb9b73e2f172c21a0b320ed275f7b8d8a866d9003", size = 240786 }, + { url = "https://files.pythonhosted.org/packages/91/f6/bd63e2587e0ec40abd9f92278a442bc28b7ff109e418d1240ee2eb6536aa/pyzstd-0.16.2-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:183f26e34f9becf0f2db38be9c0bfb136753d228bcb47c06c69175901bea7776", size = 364180 }, + { url = "https://files.pythonhosted.org/packages/ac/13/d4c68ad926e79d734f57b26d49447908e8dab7f5c066d3a013b0d0cfa2be/pyzstd-0.16.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:88318b64b5205a67748148d6d244097fa6cf61fcea02ad3435511b9e7155ae16", size = 279816 }, + { url = "https://files.pythonhosted.org/packages/b2/ba/76f0b75ec9e9fc3914496e036f99f345d5e0a99cb7070341f9becdaba2b8/pyzstd-0.16.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:73142aa2571b6480136a1865ebda8257e09eabbc8bcd54b222202f6fa4febe1e", size = 321308 }, + { url = "https://files.pythonhosted.org/packages/a6/ea/9fe52bd777f33f007287f1a37bada7af5cf33d64904360c17bb64fefca21/pyzstd-0.16.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d3f8877c29a97f1b1bba16f3d3ab01ad10ad3da7bad317aecf36aaf8848b37c", size = 344368 }, + { url = "https://files.pythonhosted.org/packages/cc/c0/509077f73fc8e156ceeefb41d4b7e04aceb71b2339084fcd62d0ad3bfd75/pyzstd-0.16.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1f25754562473ac7de856b8331ebd5964f5d85601045627a5f0bb0e4e899990", size = 328585 }, + { url = "https://files.pythonhosted.org/packages/14/74/a854ada61bf4c3c2ad239ec2bd1ff73cc0d718ccbcc56e3ced94e878fd50/pyzstd-0.16.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:6ce17e84310080c55c02827ad9bb17893c00a845c8386a328b346f814aabd2c1", size = 240783 }, +] + [[package]] name = "requests" version = "2.32.3" @@ -1860,6 +2271,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bf/65/813fc133609ebcb1299be6a42e5aea99d6344afb35ccb43f67e7daaa3b92/structlog-24.4.0-py3-none-any.whl", hash = "sha256:597f61e80a91cc0749a9fd2a098ed76715a1c8a01f73e336b746504d1aad7610", size = 67180 }, ] +[[package]] +name = "texttable" +version = "1.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1c/dc/0aff23d6036a4d3bf4f1d8c8204c5c79c4437e25e0ae94ffe4bbb55ee3c2/texttable-1.7.0.tar.gz", hash = 
"sha256:2d2068fb55115807d3ac77a4ca68fa48803e84ebb0ee2340f858107a36522638", size = 12831 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/24/99/4772b8e00a136f3e01236de33b0efda31ee7077203ba5967fcc76da94d65/texttable-1.7.0-py2.py3-none-any.whl", hash = "sha256:72227d592c82b3d7f672731ae73e4d1f88cd8e2ef5b075a7a7f01a23a3743917", size = 10768 }, +] + [[package]] name = "tomli" version = "2.0.1" diff --git a/lib/repack/owid/repack/__init__.py b/lib/repack/owid/repack/__init__.py index 847a751453f..270b5b1cb51 100644 --- a/lib/repack/owid/repack/__init__.py +++ b/lib/repack/owid/repack/__init__.py @@ -59,10 +59,12 @@ def repack_frame( def repack_series(s: pd.Series) -> pd.Series: - if s.dtype.name in ("Int64", "int64", "UInt64", "uint64"): - return shrink_integer(s) + dtype_name = s.dtype.name.replace("[pyarrow]", "").replace("[pyarrow_numpy]", "").lower() - if s.dtype.name in ("object", "string", "float64", "Float64"): + if dtype_name in ("int64", "uint64"): + return shrink_integer(s.astype("Int64")) + + if dtype_name in ("object", "str", "string", "float64"): for strategy in [to_int, to_float, to_category]: try: return strategy(s) @@ -72,11 +74,22 @@ def repack_series(s: pd.Series) -> pd.Series: return s +def _to_float(s: pd.Series) -> pd.Series: + """Convert series to Float64. Replace numpy NaNs with NA. This can + happen when original series is an object and contains 'nan' string.""" + r = s.astype("Float64") + if s.dtype == "object": + r = r.mask(np.isnan(r), pd.NA) + return r + + def to_int(s: pd.Series) -> pd.Series: # values could be integers or strings - v = s.astype("float64").astype("Int64") + s = _to_float(s) + v = s.astype("Int64") - if not series_eq(v, s, cast=float): + # casting to float converts strings to floats, that doesn't work with float64[pyarrow] + if not series_eq(v, s): raise ValueError() # it's an integer, now pack it smaller @@ -87,21 +100,16 @@ def shrink_integer(s: pd.Series) -> pd.Series: """ Take an Int64 series and make it as small as possible. """ - assert s.dtype.name in ("Int64", "int64", "UInt64", "uint64") + assert s.dtype == "Int64" if s.isnull().all(): # shrink all NaNs to Int8 return s.astype("Int8") - elif s.isnull().any(): + else: if s.min() < 0: series = ["Int32", "Int16", "Int8"] else: series = ["UInt32", "UInt16", "UInt8"] - else: - if s.min() < 0: - series = ["int32", "int16", "int8"] - else: - series = ["uint32", "uint16", "uint8"] for dtype in series: v = s.astype(dtype) @@ -114,11 +122,20 @@ def shrink_integer(s: pd.Series) -> pd.Series: def to_float(s: pd.Series) -> pd.Series: - options = ["float32", "float64"] + return shrink_float(_to_float(s)) + + +def shrink_float(s: pd.Series) -> pd.Series: + """ + Take a Float64 series and make it as small as possible. 
+ """ + assert s.dtype.name.replace("[pyarrow]", "") in ("float64", "Float64", "double"), s.dtype + + options = ["Float32", "Float64"] for dtype in options: v = s.astype(dtype) - if series_eq(s, v, float): + if series_eq(s, v): return v raise ValueError() @@ -133,7 +150,7 @@ def to_category(s: pd.Series) -> pd.Series: return s.astype("category") -def series_eq(lhs: pd.Series, rhs: pd.Series, cast: Any, rtol: float = 1e-5, atol: float = 1e-8) -> bool: +def series_eq(lhs: pd.Series, rhs: pd.Series, rtol: float = 1e-5, atol: float = 1e-8) -> bool: """ Check that series are equal, but unlike normal floating point checks where NaN != NaN, we want missing or null values to be reported as equal to each @@ -144,11 +161,33 @@ def series_eq(lhs: pd.Series, rhs: pd.Series, cast: Any, rtol: float = 1e-5, ato if len(lhs) != len(rhs): return False - # improve performance by calling native astype method - if cast == float: - func = lambda s: s.astype(float) # noqa: E731 + return np.allclose(lhs, rhs, rtol=rtol, atol=atol, equal_nan=True) + + +def _safe_dtype(dtype: Any) -> str: + """Determine the appropriate dtype string based on pandas dtype.""" + if pd.api.types.is_integer_dtype(dtype): + return "Int64" + elif pd.api.types.is_float_dtype(dtype): + return "Float64" + elif pd.api.types.is_bool_dtype(dtype): + return "boolean" + elif isinstance(dtype, pd.CategoricalDtype): + return "string[pyarrow]" + elif dtype == object: + return "string[pyarrow]" + else: + return dtype + + +def to_safe_types(t: pd.DataFrame) -> pd.DataFrame: + """Convert numeric columns to Float64 and Int64 and categorical + columns to string[pyarrow].""" + t = t.astype({col: _safe_dtype(t[col].dtype) for col in t.columns}) + + if isinstance(t.index, pd.MultiIndex): + t.index = t.index.set_levels([level.astype(_safe_dtype(level.dtype)) for level in t.index.levels]) else: - # NOTE: this would be extremely slow in practice - func = lambda s: s.apply(cast) # noqa: E731 + t.index = t.index.astype(_safe_dtype(t.index.dtype)) - return np.allclose(func(lhs), func(rhs), rtol=rtol, atol=atol, equal_nan=True) + return t diff --git a/lib/repack/pyproject.toml b/lib/repack/pyproject.toml index 9dc2139168d..fea1f71e18c 100644 --- a/lib/repack/pyproject.toml +++ b/lib/repack/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "owid-repack" -version = "0.1.3" +version = "0.1.4" description = "Pack Pandas data frames into smaller, more memory-efficient data types." 
authors = [ {name = "Our World in Data", email = "tech@ourworldindata.org"}, @@ -9,7 +9,9 @@ license = "MIT" requires-python = ">=3.9" dependencies = [ "numpy>=1.24.0", - "pandas>=2.2.1" + "pandas>=2.2.3", + # there are problems with installing 18.0.0, but we should undo this once it's fixed + "pyarrow>=10.0.1,<18.0.0", ] [tool.uv] @@ -17,6 +19,7 @@ dev-dependencies = [ "pytest>=7.2.0", "pyright==1.1.373", "ruff==0.1.6", + "ipdb>=0.13.13", ] [tool.ruff] diff --git a/lib/repack/tests/test_repack.py b/lib/repack/tests/test_repack.py index 633a82ca2e9..54a29d24ee4 100644 --- a/lib/repack/tests/test_repack.py +++ b/lib/repack/tests/test_repack.py @@ -16,8 +16,8 @@ def test_repack_non_object_columns(): df2 = df.copy() df2 = repack.repack_frame(df2, {}) - assert df2.myint.dtype.name == "uint8" - assert df2.myfloat.dtype.name == "float32" + assert df2.myint.dtype.name == "UInt8" + assert df2.myfloat.dtype.name == "Float32" assert_frame_equal(df, df2, check_dtype=False) @@ -27,6 +27,9 @@ def test_repack_object_columns(): "myint": [1, 2, None, 3], "myfloat": [1.2, 2.0, 3.0, None], "mycat": ["a", None, "b", "c"], + "myintstr": [1, 2, 3, "4"], + "nans1": [1, 2, 3, pd.NA], + "nans2": [1, 2.1, 3, np.nan], }, dtype="object", ) @@ -35,8 +38,11 @@ def test_repack_object_columns(): df_repack = repack.repack_frame(df_repack) assert df_repack.myint.dtype.name == "UInt8" - assert df_repack.myfloat.dtype.name == "float32" + assert df_repack.myfloat.dtype.name == "Float32" assert df_repack.mycat.dtype.name == "category" + assert df_repack.myintstr.dtype.name == "UInt8" + assert df_repack.nans1.dtype.name == "UInt8" + assert df_repack.nans2.dtype.name == "Float32" def test_repack_frame_with_index(): @@ -63,13 +69,13 @@ def test_repack_integer_strings(): def test_repack_float_strings(): s = pd.Series(["10", "22.2", "30"]) v = repack.repack_series(s) - assert v.dtype.name == "float32" + assert v.dtype.name == "Float32" def test_repack_uint64(): s = pd.Series([10, 20], dtype="uint64") v = repack.repack_series(s) - assert v.dtype.name == "uint8" + assert v.dtype.name == "UInt8" def test_repack_int8_boundaries(): @@ -78,15 +84,15 @@ def test_repack_int8_boundaries(): # check the lower boundary s[0] = info.min - assert repack.repack_series(s).dtype.name == "int8" + assert repack.repack_series(s).dtype.name == "Int8" s[0] -= 1 - assert repack.repack_series(s).dtype.name == "int16" + assert repack.repack_series(s).dtype.name == "Int16" # check the upper boundary s[0] = info.max - assert repack.repack_series(s).dtype.name == "int8" + assert repack.repack_series(s).dtype.name == "Int8" s[0] += 1 - assert repack.repack_series(s).dtype.name == "int16" + assert repack.repack_series(s).dtype.name == "Int16" def test_repack_int16_boundaries(): @@ -95,15 +101,15 @@ def test_repack_int16_boundaries(): # check the lower boundary s[0] = info.min - assert repack.repack_series(s).dtype.name == "int16" + assert repack.repack_series(s).dtype.name == "Int16" s[0] -= 1 - assert repack.repack_series(s).dtype.name == "int32" + assert repack.repack_series(s).dtype.name == "Int32" # check the upper boundary s[0] = info.max - assert repack.repack_series(s).dtype.name == "int16" + assert repack.repack_series(s).dtype.name == "Int16" s[0] += 1 - assert repack.repack_series(s).dtype.name == "int32" + assert repack.repack_series(s).dtype.name == "Int32" def test_repack_int32_boundaries(): @@ -112,15 +118,15 @@ def test_repack_int32_boundaries(): # check the lower boundary s[0] = info.min - assert repack.repack_series(s).dtype.name == "int32" + 
assert repack.repack_series(s).dtype.name == "Int32" s[0] -= 1 - assert repack.repack_series(s).dtype.name == "int64" + assert repack.repack_series(s).dtype.name == "Int64" # check the upper boundary s[0] = info.max - assert repack.repack_series(s).dtype.name == "int32" + assert repack.repack_series(s).dtype.name == "Int32" s[0] += 1 - assert repack.repack_series(s).dtype.name == "int64" + assert repack.repack_series(s).dtype.name == "Int64" def test_repack_uint_boundaries(): @@ -128,27 +134,27 @@ def test_repack_uint_boundaries(): # uint8 info: Any = np.iinfo(np.uint8) s[0] = info.max - assert repack.repack_series(s).dtypes.name == "uint8" + assert repack.repack_series(s).dtypes.name == "UInt8" s[0] += 1 - assert repack.repack_series(s).dtypes.name == "uint16" + assert repack.repack_series(s).dtypes.name == "UInt16" # uint16 info2: Any = np.iinfo(np.uint16) s[0] = info2.max - assert repack.repack_series(s).dtypes.name == "uint16" + assert repack.repack_series(s).dtypes.name == "UInt16" s[0] += 1 - assert repack.repack_series(s).dtypes.name == "uint32" + assert repack.repack_series(s).dtypes.name == "UInt32" # uint32 info3: Any = np.iinfo(np.uint32) s[0] = info3.max - assert repack.repack_series(s).dtypes.name == "uint32" + assert repack.repack_series(s).dtypes.name == "UInt32" # we don't bother using uint64, we just use int64 s[0] += 1 - assert repack.repack_series(s).dtypes.name == "int64" + assert repack.repack_series(s).dtypes.name == "Int64" def test_repack_int(): @@ -160,7 +166,7 @@ def test_repack_int(): def test_repack_int_no_null(): s = pd.Series([1, 2, 3]).astype("object") v = repack.repack_series(s) - assert v.dtype == "uint8" + assert v.dtype == "UInt8" def test_repack_float_to_int(): @@ -174,12 +180,24 @@ def test_repack_float_object_to_float32(): s = pd.Series([1, 2, None, 3.3], dtype="object") v = repack.repack_series(s) - assert v.dtype == "float32" + assert v.dtype == "Float32" + + +def test_repack_object_with_nan_string(): + s = pd.Series([1, 2, "nan"], dtype="object") + v = repack.repack_series(s) + assert v.dtype == "UInt8" + assert v.isnull().sum() == 1 + + s = pd.Series([1, 2.2, "nan"], dtype="object") + v = repack.repack_series(s) + assert v.dtype == "Float32" + assert v.isnull().sum() == 1 def test_repack_category(): s = pd.Series(["a", "b", "c", None]) - assert s.dtype == np.object_ + assert s.dtype == np.object_ or s.dtype == "str" v = repack.repack_series(s) assert v.dtype == "category" @@ -188,13 +206,13 @@ def test_repack_category(): def test_shrink_integers_uint8(): s = pd.Series([1, 2, 3], dtype="Int64") v = repack.shrink_integer(s) - assert v.dtype.name == "uint8" + assert v.dtype.name == "UInt8" def test_shrink_integers_int8(): s = pd.Series([1, 2, 3, -3], dtype="Int64") v = repack.shrink_integer(s) - assert v.dtype.name == "int8" + assert v.dtype.name == "Int8" def test_repack_frame_keep_dtypes(): @@ -204,7 +222,7 @@ def test_repack_frame_keep_dtypes(): df2 = repack.repack_frame(df2, dtypes={"myint": float}) assert df2.myint.dtype.name == "float64" - assert df2.myfloat.dtype.name == "float32" + assert df2.myfloat.dtype.name == "Float32" def test_repack_int64_all_nans(): @@ -222,11 +240,15 @@ def test_repack_float64_all_nans(): def test_series_eq(): a = pd.Series([1, np.nan], dtype="float64") b = pd.Series([2, np.nan], dtype="float64") - assert not repack.series_eq(a, b, cast=float) + assert not repack.series_eq(a, b) a = pd.Series([1, np.nan], dtype="float64") b = pd.Series([1, np.nan], dtype="float64") - assert repack.series_eq(a, b, cast=float) + assert 
repack.series_eq(a, b) + + a = pd.Series([1, np.nan], dtype="float64") + b = pd.Series([1, np.nan], dtype="float64").astype("Float64") + assert repack.series_eq(a, b) def test_repack_object_np_str(): @@ -238,7 +260,7 @@ def test_repack_object_np_str(): def test_repack_with_inf(): s = pd.Series([0, np.inf], dtype=object) v = repack.repack_series(s) - assert v.dtype.name == "float32" + assert v.dtype.name == "Float32" def test_repack_with_datetime(): @@ -253,3 +275,79 @@ def test_repack_string_type(): v = repack.repack_series(s) assert v.dtype == "category" + + +def test_to_safe_types(): + # Create a DataFrame with various dtypes + df = pd.DataFrame( + { + "int_col": [1, 2, 3], + "float_col": [1.1, 2.2, 3.3], + "cat_col": pd.Categorical(["a", "b", "c"]), + "object_col": ["x", "y", "z"], + } + ) + + # Set an index with integer dtype + df.set_index("int_col", inplace=True) + + # Apply the to_safe_types function + df_safe = repack.to_safe_types(df) + + # Check that the dtypes have been converted appropriately + assert df_safe.index.dtype == "Int64" + assert df_safe["float_col"].dtype == "Float64" + assert df_safe["cat_col"].dtype == "string[pyarrow]" + assert df_safe["object_col"].dtype == "string[pyarrow]" + + +def test_to_safe_types_multiindex(): + # Create a DataFrame with MultiIndex + df = pd.DataFrame( + { + "int_col": [1, 2, 3], + "cat_col": pd.Categorical(["a", "b", "c"]), + "float_col": [1.1, 2.2, 3.3], + } + ) + df.set_index(["int_col", "cat_col"], inplace=True) + + # Apply the to_safe_types function + df_safe = repack.to_safe_types(df) + + # Check index levels + assert df_safe.index.levels[0].dtype == "Int64" # type: ignore + assert df_safe.index.levels[1].dtype == "string[pyarrow]" # type: ignore + # Check column dtype + assert df_safe["float_col"].dtype == "Float64" + + +def test_to_safe_types_with_nan(): + # Create a DataFrame with NaN values + df = pd.DataFrame( + { + "int_col": [1, 2, 3], + "int_with_nan": [1, np.nan, 3], + "nullable_int_with_nan": [1, np.nan, 3], + "float_col": [1.1, np.nan, 3.3], + "cat_col": pd.Categorical(["a", None, "c"]), + } + ) + df.set_index("float_col", inplace=True) + df["nullable_int_with_nan"] = df["nullable_int_with_nan"].astype("Int32") + + # Apply the to_safe_types function + df_safe = repack.to_safe_types(df) + + # Check that NaN values are handled correctly + assert df_safe.index.dtype == "Float64" + assert df_safe["int_col"].dtype == "Int64" + # NOTE: ints with nans end up as floats, but if they are nullable ints they remain as Int64 (which is our case + # since we store everything as nullable types) + assert df_safe["int_with_nan"].dtype == "Float64" + assert df_safe["nullable_int_with_nan"].dtype == "Int64" + assert df_safe["cat_col"].dtype == "string[pyarrow]" + + # Ensure that the NA value in 'cat_col' remains pd.NA and not the string "NA" + assert pd.isna(df_safe["cat_col"].iloc[1]) + assert df_safe["cat_col"].iloc[1] is pd.NA diff --git a/lib/repack/uv.lock b/lib/repack/uv.lock index d4f9195e965..1e54dea67bf 100644 --- a/lib/repack/uv.lock +++ b/lib/repack/uv.lock @@ -6,6 +6,18 @@ resolution-markers = [ "python_full_version >= '3.12'", ] +[[package]] +name = "asttokens" +version = "2.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/45/1d/f03bcb60c4a3212e15f99a56085d93093a497718adf828d050b9d675da81/asttokens-2.4.1.tar.gz", hash = "sha256:b03869718ba9a6eb027e134bfdf69f38a236d681c83c160d510768af11254ba0", size = 62284 } +wheels = [ + { 
url = "https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl", hash = "sha256:051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24", size = 27764 }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -15,6 +27,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, ] +[[package]] +name = "decorator" +version = "5.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/66/0c/8d907af351aa16b42caae42f9d6aa37b900c67308052d10fdce809f8d952/decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330", size = 35016 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186", size = 9073 }, +] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -24,6 +45,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453 }, ] +[[package]] +name = "executing" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8c/e3/7d45f492c2c4a0e8e0fad57d081a7c8a0286cdd86372b070cca1ec0caa1e/executing-2.1.0.tar.gz", hash = "sha256:8ea27ddd260da8150fa5a708269c4a10e76161e2496ec3e587da9e3c0fe4b9ab", size = 977485 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/fd/afcd0496feca3276f509df3dbd5dae726fcc756f1a08d9e25abe1733f962/executing-2.1.0-py2.py3-none-any.whl", hash = "sha256:8d63781349375b5ebccc3142f4b30350c0cd9c79f921cde38be2be4637e98eaf", size = 25805 }, +] + [[package]] name = "iniconfig" version = "2.0.0" @@ -33,6 +63,66 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, ] +[[package]] +name = "ipdb" +version = "0.13.13" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "decorator" }, + { name = "ipython" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/1b/7e07e7b752017f7693a0f4d41c13e5ca29ce8cbcfdcc1fd6c4ad8c0a27a0/ipdb-0.13.13.tar.gz", hash = "sha256:e3ac6018ef05126d442af680aad863006ec19d02290561ac88b8b1c0b0cfc726", size = 17042 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/4c/b075da0092003d9a55cf2ecc1cae9384a1ca4f650d51b00fc59875fe76f6/ipdb-0.13.13-py3-none-any.whl", hash = "sha256:45529994741c4ab6d2388bfa5d7b725c2cf7fe9deffabdb8a6113aa5ed449ed4", size = 12130 }, +] + +[[package]] +name = "ipython" +version = "8.18.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "decorator" }, + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "jedi" }, + { name = 
"matplotlib-inline" }, + { name = "pexpect", marker = "sys_platform != 'win32'" }, + { name = "prompt-toolkit" }, + { name = "pygments" }, + { name = "stack-data" }, + { name = "traitlets" }, + { name = "typing-extensions", marker = "python_full_version < '3.10'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/b9/3ba6c45a6df813c09a48bac313c22ff83efa26cbb55011218d925a46e2ad/ipython-8.18.1.tar.gz", hash = "sha256:ca6f079bb33457c66e233e4580ebfc4128855b4cf6370dddd73842a9563e8a27", size = 5486330 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/6b/d9fdcdef2eb6a23f391251fde8781c38d42acd82abe84d054cb74f7863b0/ipython-8.18.1-py3-none-any.whl", hash = "sha256:e8267419d72d81955ec1177f8a29aaa90ac80ad647499201119e2f05e99aa397", size = 808161 }, +] + +[[package]] +name = "jedi" +version = "0.19.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "parso" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d6/99/99b493cec4bf43176b678de30f81ed003fd6a647a301b9c927280c600f0a/jedi-0.19.1.tar.gz", hash = "sha256:cf0496f3651bc65d7174ac1b7d043eff454892c708a87d1b683e57b569927ffd", size = 1227821 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl", hash = "sha256:e983c654fe5c02867aef4cdfce5a2fbb4a50adc0af145f70504238f18ef5e7e0", size = 1569361 }, +] + +[[package]] +name = "matplotlib-inline" +version = "0.1.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/99/5b/a36a337438a14116b16480db471ad061c36c3694df7c2084a0da7ba538b7/matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90", size = 8159 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899 }, +] + [[package]] name = "nodeenv" version = "1.9.1" @@ -87,15 +177,17 @@ wheels = [ [[package]] name = "owid-repack" -version = "0.1.3" +version = "0.1.4" source = { editable = "." 
} dependencies = [ { name = "numpy" }, { name = "pandas" }, + { name = "pyarrow" }, ] [package.dev-dependencies] dev = [ + { name = "ipdb" }, { name = "pyright" }, { name = "pytest" }, { name = "ruff" }, @@ -104,11 +196,13 @@ dev = [ [package.metadata] requires-dist = [ { name = "numpy", specifier = ">=1.24.0" }, - { name = "pandas", specifier = "==2.2.1" }, + { name = "pandas", specifier = ">=2.2.3" }, + { name = "pyarrow", specifier = ">=10.0.1,<18.0.0" }, ] [package.metadata.requires-dev] dev = [ + { name = "ipdb", specifier = ">=0.13.13" }, { name = "pyright", specifier = "==1.1.373" }, { name = "pytest", specifier = ">=7.2.0" }, { name = "ruff", specifier = "==0.1.6" }, @@ -125,7 +219,7 @@ wheels = [ [[package]] name = "pandas" -version = "2.2.1" +version = "2.2.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy" }, @@ -133,36 +227,70 @@ dependencies = [ { name = "pytz" }, { name = "tzdata" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/3d/59/2afa81b9fb300c90531803c0fd43ff4548074fa3e8d0f747ef63b3b5e77a/pandas-2.2.1.tar.gz", hash = "sha256:0ab90f87093c13f3e8fa45b48ba9f39181046e8f3317d3aadb2fffbb1b978572", size = 4395256 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/88/39/f4495f8ab5a58b1eeee06b5abd811e0a93f7b75acdc89380797f99bdf91a/pandas-2.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8df8612be9cd1c7797c93e1c5df861b2ddda0b48b08f2c3eaa0702cf88fb5f88", size = 12543124 }, - { url = "https://files.pythonhosted.org/packages/4f/19/0ae5f1557badfcae1052c1397041a2c5441e9f31e1c7b0cce7f8bc585f4e/pandas-2.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0f573ab277252ed9aaf38240f3b54cfc90fff8e5cab70411ee1d03f5d51f3944", size = 11285572 }, - { url = "https://files.pythonhosted.org/packages/5d/d2/df8047f8c3648eb6b3ee86ef7ee811ad01e55b47a14ea02fe36d601e12cd/pandas-2.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f02a3a6c83df4026e55b63c1f06476c9aa3ed6af3d89b4f04ea656ccdaaaa359", size = 15629656 }, - { url = "https://files.pythonhosted.org/packages/19/df/8d789d96a9e338cf28cb7978fa93ef5da53137624b7ef032f30748421c2b/pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c38ce92cb22a4bea4e3929429aa1067a454dcc9c335799af93ba9be21b6beb51", size = 13024911 }, - { url = "https://files.pythonhosted.org/packages/11/a1/9d5505c6c56740f7ed8bd78c8756fb76aeff1c706b30e6930ddf90693aee/pandas-2.2.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c2ce852e1cf2509a69e98358e8458775f89599566ac3775e70419b98615f4b06", size = 16275635 }, - { url = "https://files.pythonhosted.org/packages/d6/99/378e9108cf3562c7c6294249f1bfd3be08325af5e96af435fb221dd1c320/pandas-2.2.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:53680dc9b2519cbf609c62db3ed7c0b499077c7fefda564e330286e619ff0dd9", size = 13880082 }, - { url = "https://files.pythonhosted.org/packages/93/26/2a695303a4a3194014dca7cb5d5ce08f0d2c6baa344fb5f562c642e77b2b/pandas-2.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:94e714a1cca63e4f5939cdce5f29ba8d415d85166be3441165edd427dc9f6bc0", size = 11592785 }, - { url = "https://files.pythonhosted.org/packages/f1/8b/617792ad1feef330e87d7459584a1f91aa8aea373d8b168ac5d24fddd808/pandas-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f821213d48f4ab353d20ebc24e4faf94ba40d76680642fb7ce2ea31a3ad94f9b", size = 12564385 }, - { url = 
"https://files.pythonhosted.org/packages/a5/78/1d859bfb619c067e3353ed079248ae9532c105c4e018fa9a776d04b34572/pandas-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c70e00c2d894cb230e5c15e4b1e1e6b2b478e09cf27cc593a11ef955b9ecc81a", size = 11303028 }, - { url = "https://files.pythonhosted.org/packages/91/bf/8c57707e440f944ba2cf3d6f6ae6c29883fac20fbe5d2ad485229149f273/pandas-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e97fbb5387c69209f134893abc788a6486dbf2f9e511070ca05eed4b930b1b02", size = 15594865 }, - { url = "https://files.pythonhosted.org/packages/d4/47/1ccf9f62d2674d3ca3e95452c5f9dd114234d1535dec77c96528bf6a31fc/pandas-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:101d0eb9c5361aa0146f500773395a03839a5e6ecde4d4b6ced88b7e5a1a6403", size = 13034628 }, - { url = "https://files.pythonhosted.org/packages/e3/da/9522ba4b32b20a344c37a970d7835d261df1427d943e02d48820253833ee/pandas-2.2.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7d2ed41c319c9fb4fd454fe25372028dfa417aacb9790f68171b2e3f06eae8cd", size = 16243608 }, - { url = "https://files.pythonhosted.org/packages/e0/c3/da6ffa0d3d510c378f6e46496cf7f84f35e15836d0de4e9880f40247eb60/pandas-2.2.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:af5d3c00557d657c8773ef9ee702c61dd13b9d7426794c9dfeb1dc4a0bf0ebc7", size = 13884355 }, - { url = "https://files.pythonhosted.org/packages/61/11/1812ef6cbd7433ad240f72161ce5f84c4c450cede4db080365d371d29117/pandas-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:06cf591dbaefb6da9de8472535b185cba556d0ce2e6ed28e21d919704fef1a9e", size = 11602637 }, - { url = "https://files.pythonhosted.org/packages/ed/b9/660353ce2b1bd5b6e0f5c992836d91909c0da1ccb59c16565ad0a37e839d/pandas-2.2.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:88ecb5c01bb9ca927ebc4098136038519aa5d66b44671861ffab754cae75102c", size = 12493183 }, - { url = "https://files.pythonhosted.org/packages/19/4e/6a7f400d4b65f82e37eefa7dbbe3e6f0a4fa542ca7ebb68c787eeebdc497/pandas-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:04f6ec3baec203c13e3f8b139fb0f9f86cd8c0b94603ae3ae8ce9a422e9f5bee", size = 11335860 }, - { url = "https://files.pythonhosted.org/packages/d7/2b/3e00e92a6b430313da68b15e925c6dba05f672d716cf3b02bcd3d0381974/pandas-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a935a90a76c44fe170d01e90a3594beef9e9a6220021acfb26053d01426f7dc2", size = 15189183 }, - { url = "https://files.pythonhosted.org/packages/78/f4/19f1dda9ab1eaa38301e445925f92b303d415d4c4115e56c0d62774421f7/pandas-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c391f594aae2fd9f679d419e9a4d5ba4bce5bb13f6a989195656e7dc4b95c8f0", size = 12742656 }, - { url = "https://files.pythonhosted.org/packages/6f/cd/8b84912b5bfab19b1fcea2f732d2e3a2d134d558f141e9dffa5dbfd9d23b/pandas-2.2.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9d1265545f579edf3f8f0cb6f89f234f5e44ba725a34d86535b1a1d38decbccc", size = 15861331 }, - { url = "https://files.pythonhosted.org/packages/11/e7/65bf50aff86da6554cdffdcd87ced857c79a29dfaf1d85fdf97955d76d02/pandas-2.2.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11940e9e3056576ac3244baef2fedade891977bcc1cb7e5cc8f8cc7d603edc89", size = 13410754 }, - { url = "https://files.pythonhosted.org/packages/71/00/6beaeeba7f075d15ea167a5caa039b861e58ff2f58a5b659abb9b544c8f6/pandas-2.2.1-cp312-cp312-win_amd64.whl", hash = 
"sha256:4acf681325ee1c7f950d058b05a820441075b0dd9a2adf5c4835b9bc056bf4fb", size = 11478767 }, - { url = "https://files.pythonhosted.org/packages/1a/f6/621a5a90727c839aafd4a2e40f8fab4645efb534f96454d31a257ce693ed/pandas-2.2.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9bd8a40f47080825af4317d0340c656744f2bfdb6819f818e6ba3cd24c0e1397", size = 12561981 }, - { url = "https://files.pythonhosted.org/packages/bc/57/8c61a6b2f9798349748701938dfed6d645bd329bfd96245ad98245238b6f/pandas-2.2.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:df0c37ebd19e11d089ceba66eba59a168242fc6b7155cba4ffffa6eccdfb8f16", size = 11301393 }, - { url = "https://files.pythonhosted.org/packages/3e/a6/6dbcb4b72687c8df8f3dca5f16b296b4ae5c9fa3084a32a165113d594b71/pandas-2.2.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:739cc70eaf17d57608639e74d63387b0d8594ce02f69e7a0b046f117974b3019", size = 15646609 }, - { url = "https://files.pythonhosted.org/packages/1a/5e/71bb0eef0dc543f7516d9ddeca9ee8dc98207043784e3f7e6c08b4a6b3d9/pandas-2.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9d3558d263073ed95e46f4650becff0c5e1ffe0fc3a015de3c79283dfbdb3df", size = 13040474 }, - { url = "https://files.pythonhosted.org/packages/60/f0/765326197f1759004d07a3e5e060cecfc90fd7af22eadd4cb02ef5e74555/pandas-2.2.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4aa1d8707812a658debf03824016bf5ea0d516afdea29b7dc14cf687bc4d4ec6", size = 16261844 }, - { url = "https://files.pythonhosted.org/packages/5f/96/0f208a3f7bb6f930060c1930fe4d2d24ce491d044a6ace1cb6cc52d3a319/pandas-2.2.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:76f27a809cda87e07f192f001d11adc2b930e93a2b0c4a236fde5429527423be", size = 13914313 }, - { url = "https://files.pythonhosted.org/packages/41/a3/349df1721beb447142b8b11e27875a3da00f85d713f1a4bed0afb3a62e14/pandas-2.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:1ba21b1d5c0e43416218db63037dbe1a01fc101dc6e6024bcad08123e48004ab", size = 11610656 }, +sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/70/c853aec59839bceed032d52010ff5f1b8d87dc3114b762e4ba2727661a3b/pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5", size = 12580827 }, + { url = "https://files.pythonhosted.org/packages/99/f2/c4527768739ffa4469b2b4fff05aa3768a478aed89a2f271a79a40eee984/pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348", size = 11303897 }, + { url = "https://files.pythonhosted.org/packages/ed/12/86c1747ea27989d7a4064f806ce2bae2c6d575b950be087837bdfcabacc9/pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed", size = 66480908 }, + { url = "https://files.pythonhosted.org/packages/44/50/7db2cd5e6373ae796f0ddad3675268c8d59fb6076e66f0c339d61cea886b/pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57", size = 13064210 }, + { url = 
"https://files.pythonhosted.org/packages/61/61/a89015a6d5536cb0d6c3ba02cebed51a95538cf83472975275e28ebf7d0c/pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42", size = 16754292 }, + { url = "https://files.pythonhosted.org/packages/ce/0d/4cc7b69ce37fac07645a94e1d4b0880b15999494372c1523508511b09e40/pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f", size = 14416379 }, + { url = "https://files.pythonhosted.org/packages/31/9e/6ebb433de864a6cd45716af52a4d7a8c3c9aaf3a98368e61db9e69e69a9c/pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645", size = 11598471 }, + { url = "https://files.pythonhosted.org/packages/a8/44/d9502bf0ed197ba9bf1103c9867d5904ddcaf869e52329787fc54ed70cc8/pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039", size = 12602222 }, + { url = "https://files.pythonhosted.org/packages/52/11/9eac327a38834f162b8250aab32a6781339c69afe7574368fffe46387edf/pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd", size = 11321274 }, + { url = "https://files.pythonhosted.org/packages/45/fb/c4beeb084718598ba19aa9f5abbc8aed8b42f90930da861fcb1acdb54c3a/pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698", size = 15579836 }, + { url = "https://files.pythonhosted.org/packages/cd/5f/4dba1d39bb9c38d574a9a22548c540177f78ea47b32f99c0ff2ec499fac5/pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc", size = 13058505 }, + { url = "https://files.pythonhosted.org/packages/b9/57/708135b90391995361636634df1f1130d03ba456e95bcf576fada459115a/pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3", size = 16744420 }, + { url = "https://files.pythonhosted.org/packages/86/4a/03ed6b7ee323cf30404265c284cee9c65c56a212e0a08d9ee06984ba2240/pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32", size = 14440457 }, + { url = "https://files.pythonhosted.org/packages/ed/8c/87ddf1fcb55d11f9f847e3c69bb1c6f8e46e2f40ab1a2d2abadb2401b007/pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5", size = 11617166 }, + { url = "https://files.pythonhosted.org/packages/17/a3/fb2734118db0af37ea7433f57f722c0a56687e14b14690edff0cdb4b7e58/pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9", size = 12529893 }, + { url = "https://files.pythonhosted.org/packages/e1/0c/ad295fd74bfac85358fd579e271cded3ac969de81f62dd0142c426b9da91/pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4", size = 11363475 }, + { url = "https://files.pythonhosted.org/packages/c6/2a/4bba3f03f7d07207481fed47f5b35f556c7441acddc368ec43d6643c5777/pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3", size = 15188645 }, + 
{ url = "https://files.pythonhosted.org/packages/38/f8/d8fddee9ed0d0c0f4a2132c1dfcf0e3e53265055da8df952a53e7eaf178c/pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319", size = 12739445 }, + { url = "https://files.pythonhosted.org/packages/20/e8/45a05d9c39d2cea61ab175dbe6a2de1d05b679e8de2011da4ee190d7e748/pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8", size = 16359235 }, + { url = "https://files.pythonhosted.org/packages/1d/99/617d07a6a5e429ff90c90da64d428516605a1ec7d7bea494235e1c3882de/pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a", size = 14056756 }, + { url = "https://files.pythonhosted.org/packages/29/d4/1244ab8edf173a10fd601f7e13b9566c1b525c4f365d6bee918e68381889/pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13", size = 11504248 }, + { url = "https://files.pythonhosted.org/packages/64/22/3b8f4e0ed70644e85cfdcd57454686b9057c6c38d2f74fe4b8bc2527214a/pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015", size = 12477643 }, + { url = "https://files.pythonhosted.org/packages/e4/93/b3f5d1838500e22c8d793625da672f3eec046b1a99257666c94446969282/pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28", size = 11281573 }, + { url = "https://files.pythonhosted.org/packages/f5/94/6c79b07f0e5aab1dcfa35a75f4817f5c4f677931d4234afcd75f0e6a66ca/pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0", size = 15196085 }, + { url = "https://files.pythonhosted.org/packages/e8/31/aa8da88ca0eadbabd0a639788a6da13bb2ff6edbbb9f29aa786450a30a91/pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24", size = 12711809 }, + { url = "https://files.pythonhosted.org/packages/ee/7c/c6dbdb0cb2a4344cacfb8de1c5808ca885b2e4dcfde8008266608f9372af/pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659", size = 16356316 }, + { url = "https://files.pythonhosted.org/packages/57/b7/8b757e7d92023b832869fa8881a992696a0bfe2e26f72c9ae9f255988d42/pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb", size = 14022055 }, + { url = "https://files.pythonhosted.org/packages/3b/bc/4b18e2b8c002572c5a441a64826252ce5da2aa738855747247a971988043/pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d", size = 11481175 }, + { url = "https://files.pythonhosted.org/packages/76/a3/a5d88146815e972d40d19247b2c162e88213ef51c7c25993942c39dbf41d/pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468", size = 12615650 }, + { url = "https://files.pythonhosted.org/packages/9c/8c/f0fd18f6140ddafc0c24122c8a964e48294acc579d47def376fef12bcb4a/pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18", size = 
11290177 }, + { url = "https://files.pythonhosted.org/packages/ed/f9/e995754eab9c0f14c6777401f7eece0943840b7a9fc932221c19d1abee9f/pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2", size = 14651526 }, + { url = "https://files.pythonhosted.org/packages/25/b0/98d6ae2e1abac4f35230aa756005e8654649d305df9a28b16b9ae4353bff/pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4", size = 11871013 }, + { url = "https://files.pythonhosted.org/packages/cc/57/0f72a10f9db6a4628744c8e8f0df4e6e21de01212c7c981d31e50ffc8328/pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d", size = 15711620 }, + { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436 }, + { url = "https://files.pythonhosted.org/packages/ca/8c/8848a4c9b8fdf5a534fe2077af948bf53cd713d77ffbcd7bd15710348fd7/pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39", size = 12595535 }, + { url = "https://files.pythonhosted.org/packages/9c/b9/5cead4f63b6d31bdefeb21a679bc5a7f4aaf262ca7e07e2bc1c341b68470/pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30", size = 11319822 }, + { url = "https://files.pythonhosted.org/packages/31/af/89e35619fb573366fa68dc26dad6ad2c08c17b8004aad6d98f1a31ce4bb3/pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c", size = 15625439 }, + { url = "https://files.pythonhosted.org/packages/3d/dd/bed19c2974296661493d7acc4407b1d2db4e2a482197df100f8f965b6225/pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c", size = 13068928 }, + { url = "https://files.pythonhosted.org/packages/31/a3/18508e10a31ea108d746c848b5a05c0711e0278fa0d6f1c52a8ec52b80a5/pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea", size = 16783266 }, + { url = "https://files.pythonhosted.org/packages/c4/a5/3429bd13d82bebc78f4d78c3945efedef63a7cd0c15c17b2eeb838d1121f/pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761", size = 14450871 }, + { url = "https://files.pythonhosted.org/packages/2f/49/5c30646e96c684570925b772eac4eb0a8cb0ca590fa978f56c5d3ae73ea1/pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e", size = 11618011 }, +] + +[[package]] +name = "parso" +version = "0.8.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/66/94/68e2e17afaa9169cf6412ab0f28623903be73d1b32e208d9e8e541bb086d/parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d", size = 400609 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650 }, +] + +[[package]] +name = "pexpect" +version = "4.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ptyprocess" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772 }, ] [[package]] @@ -174,6 +302,84 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, ] +[[package]] +name = "prompt-toolkit" +version = "3.0.48" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2d/4f/feb5e137aff82f7c7f3248267b97451da3644f6cdc218edfe549fb354127/prompt_toolkit-3.0.48.tar.gz", hash = "sha256:d6623ab0477a80df74e646bdbc93621143f5caf104206aa29294d53de1a03d90", size = 424684 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl", hash = "sha256:f49a827f90062e411f1ce1f854f2aedb3c23353244f8108b89283587397ac10e", size = 386595 }, +] + +[[package]] +name = "ptyprocess" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993 }, +] + +[[package]] +name = "pure-eval" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842 }, +] + +[[package]] +name = "pyarrow" +version = "17.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/27/4e/ea6d43f324169f8aec0e57569443a38bab4b398d09769ca64f7b4d467de3/pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28", size = 1112479 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/39/5d/78d4b040bc5ff2fc6c3d03e80fca396b742f6c125b8af06bcf7427f931bc/pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07", size = 28994846 }, + { url = "https://files.pythonhosted.org/packages/3b/73/8ed168db7642e91180330e4ea9f3ff8bab404678f00d32d7df0871a4933b/pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655", size = 27165908 }, + { url = "https://files.pythonhosted.org/packages/81/36/e78c24be99242063f6d0590ef68c857ea07bdea470242c361e9a15bd57a4/pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545", size = 39264209 }, + { url = "https://files.pythonhosted.org/packages/18/4c/3db637d7578f683b0a8fb8999b436bdbedd6e3517bd4f90c70853cf3ad20/pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2", size = 39862883 }, + { url = "https://files.pythonhosted.org/packages/81/3c/0580626896c842614a523e66b351181ed5bb14e5dfc263cd68cea2c46d90/pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8", size = 38723009 }, + { url = "https://files.pythonhosted.org/packages/ee/fb/c1b47f0ada36d856a352da261a44d7344d8f22e2f7db3945f8c3b81be5dd/pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047", size = 39855626 }, + { url = "https://files.pythonhosted.org/packages/19/09/b0a02908180a25d57312ab5919069c39fddf30602568980419f4b02393f6/pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087", size = 25147242 }, + { url = "https://files.pythonhosted.org/packages/f9/46/ce89f87c2936f5bb9d879473b9663ce7a4b1f4359acc2f0eb39865eaa1af/pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977", size = 29028748 }, + { url = "https://files.pythonhosted.org/packages/8d/8e/ce2e9b2146de422f6638333c01903140e9ada244a2a477918a368306c64c/pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3", size = 27190965 }, + { url = "https://files.pythonhosted.org/packages/3b/c8/5675719570eb1acd809481c6d64e2136ffb340bc387f4ca62dce79516cea/pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15", size = 39269081 }, + { url = "https://files.pythonhosted.org/packages/5e/78/3931194f16ab681ebb87ad252e7b8d2c8b23dad49706cadc865dff4a1dd3/pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597", size = 39864921 }, + { url = "https://files.pythonhosted.org/packages/d8/81/69b6606093363f55a2a574c018901c40952d4e902e670656d18213c71ad7/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420", size = 38740798 }, + { url = "https://files.pythonhosted.org/packages/4c/21/9ca93b84b92ef927814cb7ba37f0774a484c849d58f0b692b16af8eebcfb/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = 
"sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4", size = 39871877 }, + { url = "https://files.pythonhosted.org/packages/30/d1/63a7c248432c71c7d3ee803e706590a0b81ce1a8d2b2ae49677774b813bb/pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03", size = 25151089 }, + { url = "https://files.pythonhosted.org/packages/d4/62/ce6ac1275a432b4a27c55fe96c58147f111d8ba1ad800a112d31859fae2f/pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22", size = 29019418 }, + { url = "https://files.pythonhosted.org/packages/8e/0a/dbd0c134e7a0c30bea439675cc120012337202e5fac7163ba839aa3691d2/pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053", size = 27152197 }, + { url = "https://files.pythonhosted.org/packages/cb/05/3f4a16498349db79090767620d6dc23c1ec0c658a668d61d76b87706c65d/pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a", size = 39263026 }, + { url = "https://files.pythonhosted.org/packages/c2/0c/ea2107236740be8fa0e0d4a293a095c9f43546a2465bb7df34eee9126b09/pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc", size = 39880798 }, + { url = "https://files.pythonhosted.org/packages/f6/b0/b9164a8bc495083c10c281cc65064553ec87b7537d6f742a89d5953a2a3e/pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a", size = 38715172 }, + { url = "https://files.pythonhosted.org/packages/f1/c4/9625418a1413005e486c006e56675334929fad864347c5ae7c1b2e7fe639/pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b", size = 39874508 }, + { url = "https://files.pythonhosted.org/packages/ae/49/baafe2a964f663413be3bd1cf5c45ed98c5e42e804e2328e18f4570027c1/pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7", size = 25099235 }, + { url = "https://files.pythonhosted.org/packages/43/e0/a898096d35be240aa61fb2d54db58b86d664b10e1e51256f9300f47565e8/pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb", size = 29007881 }, + { url = "https://files.pythonhosted.org/packages/59/22/f7d14907ed0697b5dd488d393129f2738629fa5bcba863e00931b7975946/pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df", size = 27178117 }, + { url = "https://files.pythonhosted.org/packages/bf/ee/661211feac0ed48467b1d5c57298c91403809ec3ab78b1d175e1d6ad03cf/pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687", size = 39273896 }, + { url = "https://files.pythonhosted.org/packages/af/61/bcd9b58e38ead6ad42b9ed00da33a3f862bc1d445e3d3164799c25550ac2/pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b", size = 39875438 }, + { url = 
"https://files.pythonhosted.org/packages/75/63/29d1bfcc57af73cde3fc3baccab2f37548de512dbe0ab294b033cd203516/pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5", size = 38735092 }, + { url = "https://files.pythonhosted.org/packages/39/f4/90258b4de753df7cc61cefb0312f8abcf226672e96cc64996e66afce817a/pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda", size = 39867610 }, + { url = "https://files.pythonhosted.org/packages/e7/f6/b75d4816c32f1618ed31a005ee635dd1d91d8164495d94f2ea092f594661/pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204", size = 25148611 }, +] + +[[package]] +name = "pygments" +version = "2.18.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8e/62/8336eff65bcbc8e4cb5d05b55faf041285951b6e80f33e2bff2024788f31/pygments-2.18.0.tar.gz", hash = "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199", size = 4891905 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a", size = 1205513 }, +] + [[package]] name = "pyright" version = "1.1.373" @@ -257,6 +463,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254", size = 11053 }, ] +[[package]] +name = "stack-data" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "asttokens" }, + { name = "executing" }, + { name = "pure-eval" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521 }, +] + [[package]] name = "tomli" version = "2.0.1" @@ -266,6 +486,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/97/75/10a9ebee3fd790d20926a90a2547f0bf78f371b2f13aa822c759680ca7b9/tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", size = 12757 }, ] +[[package]] +name = "traitlets" +version = "5.14.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359 }, +] + +[[package]] +name = "typing-extensions" +version = "4.12.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/df/db/f35a00659bc03fec321ba8bce9420de607a1d37f8342eee1863174c69557/typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8", size = 85321 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 }, +] + [[package]] name = "tzdata" version = "2024.1" @@ -274,3 +512,12 @@ sdist = { url = "https://files.pythonhosted.org/packages/74/5b/e025d02cb3b66b7b7 wheels = [ { url = "https://files.pythonhosted.org/packages/65/58/f9c9e6be752e9fcb8b6a0ee9fb87e6e7a1f6bcab2cdc73f02bb7ba91ada0/tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252", size = 345370 }, ] + +[[package]] +name = "wcwidth" +version = "0.2.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6c/63/53559446a878410fc5a5974feb13d31d78d752eb18aeba59c7fef1af7598/wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5", size = 101301 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166 }, +] diff --git a/lib/walden/uv.lock b/lib/walden/uv.lock index e917e84e394..3a8d2678812 100644 --- a/lib/walden/uv.lock +++ b/lib/walden/uv.lock @@ -1441,7 +1441,7 @@ requires-dist = [ { name = "colorama", specifier = ">=0.4.4" }, { name = "gdown", specifier = ">=4.5.2" }, { name = "gsheets", specifier = ">=0.6.1" }, - { name = "pandas", specifier = "==2.2.1" }, + { name = "pandas", specifier = ">=2.2.1" }, { name = "pyarrow", specifier = ">=10.0.1" }, { name = "pydrive2", specifier = ">=1.15.0" }, { name = "structlog", specifier = ">=21.5.0" }, @@ -1467,21 +1467,24 @@ dev = [ [[package]] name = "owid-repack" -version = "0.1.3" +version = "0.1.4" source = { editable = "../repack" } dependencies = [ { name = "numpy" }, { name = "pandas" }, + { name = "pyarrow" }, ] [package.metadata] requires-dist = [ { name = "numpy", specifier = ">=1.24.0" }, - { name = "pandas", specifier = "==2.2.1" }, + { name = "pandas", specifier = ">=2.2.3" }, + { name = "pyarrow", specifier = ">=10.0.1,<18.0.0" }, ] [package.metadata.requires-dev] dev = [ + { name = "ipdb", specifier = ">=0.13.13" }, { name = "pyright", specifier = "==1.1.373" }, { name = "pytest", specifier = ">=7.2.0" }, { name = "ruff", specifier = "==0.1.6" }, @@ -1498,7 +1501,7 @@ wheels = [ [[package]] name = "pandas" -version = "2.2.1" +version = "2.2.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy" }, @@ -1506,36 +1509,49 @@ dependencies = [ { name = "pytz" }, { name = "tzdata" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/3d/59/2afa81b9fb300c90531803c0fd43ff4548074fa3e8d0f747ef63b3b5e77a/pandas-2.2.1.tar.gz", hash = "sha256:0ab90f87093c13f3e8fa45b48ba9f39181046e8f3317d3aadb2fffbb1b978572", size = 4395256 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/88/39/f4495f8ab5a58b1eeee06b5abd811e0a93f7b75acdc89380797f99bdf91a/pandas-2.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8df8612be9cd1c7797c93e1c5df861b2ddda0b48b08f2c3eaa0702cf88fb5f88", size = 
12543124 }, - { url = "https://files.pythonhosted.org/packages/4f/19/0ae5f1557badfcae1052c1397041a2c5441e9f31e1c7b0cce7f8bc585f4e/pandas-2.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0f573ab277252ed9aaf38240f3b54cfc90fff8e5cab70411ee1d03f5d51f3944", size = 11285572 }, - { url = "https://files.pythonhosted.org/packages/5d/d2/df8047f8c3648eb6b3ee86ef7ee811ad01e55b47a14ea02fe36d601e12cd/pandas-2.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f02a3a6c83df4026e55b63c1f06476c9aa3ed6af3d89b4f04ea656ccdaaaa359", size = 15629656 }, - { url = "https://files.pythonhosted.org/packages/19/df/8d789d96a9e338cf28cb7978fa93ef5da53137624b7ef032f30748421c2b/pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c38ce92cb22a4bea4e3929429aa1067a454dcc9c335799af93ba9be21b6beb51", size = 13024911 }, - { url = "https://files.pythonhosted.org/packages/11/a1/9d5505c6c56740f7ed8bd78c8756fb76aeff1c706b30e6930ddf90693aee/pandas-2.2.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c2ce852e1cf2509a69e98358e8458775f89599566ac3775e70419b98615f4b06", size = 16275635 }, - { url = "https://files.pythonhosted.org/packages/d6/99/378e9108cf3562c7c6294249f1bfd3be08325af5e96af435fb221dd1c320/pandas-2.2.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:53680dc9b2519cbf609c62db3ed7c0b499077c7fefda564e330286e619ff0dd9", size = 13880082 }, - { url = "https://files.pythonhosted.org/packages/93/26/2a695303a4a3194014dca7cb5d5ce08f0d2c6baa344fb5f562c642e77b2b/pandas-2.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:94e714a1cca63e4f5939cdce5f29ba8d415d85166be3441165edd427dc9f6bc0", size = 11592785 }, - { url = "https://files.pythonhosted.org/packages/f1/8b/617792ad1feef330e87d7459584a1f91aa8aea373d8b168ac5d24fddd808/pandas-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f821213d48f4ab353d20ebc24e4faf94ba40d76680642fb7ce2ea31a3ad94f9b", size = 12564385 }, - { url = "https://files.pythonhosted.org/packages/a5/78/1d859bfb619c067e3353ed079248ae9532c105c4e018fa9a776d04b34572/pandas-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c70e00c2d894cb230e5c15e4b1e1e6b2b478e09cf27cc593a11ef955b9ecc81a", size = 11303028 }, - { url = "https://files.pythonhosted.org/packages/91/bf/8c57707e440f944ba2cf3d6f6ae6c29883fac20fbe5d2ad485229149f273/pandas-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e97fbb5387c69209f134893abc788a6486dbf2f9e511070ca05eed4b930b1b02", size = 15594865 }, - { url = "https://files.pythonhosted.org/packages/d4/47/1ccf9f62d2674d3ca3e95452c5f9dd114234d1535dec77c96528bf6a31fc/pandas-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:101d0eb9c5361aa0146f500773395a03839a5e6ecde4d4b6ced88b7e5a1a6403", size = 13034628 }, - { url = "https://files.pythonhosted.org/packages/e3/da/9522ba4b32b20a344c37a970d7835d261df1427d943e02d48820253833ee/pandas-2.2.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7d2ed41c319c9fb4fd454fe25372028dfa417aacb9790f68171b2e3f06eae8cd", size = 16243608 }, - { url = "https://files.pythonhosted.org/packages/e0/c3/da6ffa0d3d510c378f6e46496cf7f84f35e15836d0de4e9880f40247eb60/pandas-2.2.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:af5d3c00557d657c8773ef9ee702c61dd13b9d7426794c9dfeb1dc4a0bf0ebc7", size = 13884355 }, - { url = "https://files.pythonhosted.org/packages/61/11/1812ef6cbd7433ad240f72161ce5f84c4c450cede4db080365d371d29117/pandas-2.2.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:06cf591dbaefb6da9de8472535b185cba556d0ce2e6ed28e21d919704fef1a9e", size = 11602637 }, - { url = "https://files.pythonhosted.org/packages/ed/b9/660353ce2b1bd5b6e0f5c992836d91909c0da1ccb59c16565ad0a37e839d/pandas-2.2.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:88ecb5c01bb9ca927ebc4098136038519aa5d66b44671861ffab754cae75102c", size = 12493183 }, - { url = "https://files.pythonhosted.org/packages/19/4e/6a7f400d4b65f82e37eefa7dbbe3e6f0a4fa542ca7ebb68c787eeebdc497/pandas-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:04f6ec3baec203c13e3f8b139fb0f9f86cd8c0b94603ae3ae8ce9a422e9f5bee", size = 11335860 }, - { url = "https://files.pythonhosted.org/packages/d7/2b/3e00e92a6b430313da68b15e925c6dba05f672d716cf3b02bcd3d0381974/pandas-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a935a90a76c44fe170d01e90a3594beef9e9a6220021acfb26053d01426f7dc2", size = 15189183 }, - { url = "https://files.pythonhosted.org/packages/78/f4/19f1dda9ab1eaa38301e445925f92b303d415d4c4115e56c0d62774421f7/pandas-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c391f594aae2fd9f679d419e9a4d5ba4bce5bb13f6a989195656e7dc4b95c8f0", size = 12742656 }, - { url = "https://files.pythonhosted.org/packages/6f/cd/8b84912b5bfab19b1fcea2f732d2e3a2d134d558f141e9dffa5dbfd9d23b/pandas-2.2.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9d1265545f579edf3f8f0cb6f89f234f5e44ba725a34d86535b1a1d38decbccc", size = 15861331 }, - { url = "https://files.pythonhosted.org/packages/11/e7/65bf50aff86da6554cdffdcd87ced857c79a29dfaf1d85fdf97955d76d02/pandas-2.2.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11940e9e3056576ac3244baef2fedade891977bcc1cb7e5cc8f8cc7d603edc89", size = 13410754 }, - { url = "https://files.pythonhosted.org/packages/71/00/6beaeeba7f075d15ea167a5caa039b861e58ff2f58a5b659abb9b544c8f6/pandas-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:4acf681325ee1c7f950d058b05a820441075b0dd9a2adf5c4835b9bc056bf4fb", size = 11478767 }, - { url = "https://files.pythonhosted.org/packages/1a/f6/621a5a90727c839aafd4a2e40f8fab4645efb534f96454d31a257ce693ed/pandas-2.2.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9bd8a40f47080825af4317d0340c656744f2bfdb6819f818e6ba3cd24c0e1397", size = 12561981 }, - { url = "https://files.pythonhosted.org/packages/bc/57/8c61a6b2f9798349748701938dfed6d645bd329bfd96245ad98245238b6f/pandas-2.2.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:df0c37ebd19e11d089ceba66eba59a168242fc6b7155cba4ffffa6eccdfb8f16", size = 11301393 }, - { url = "https://files.pythonhosted.org/packages/3e/a6/6dbcb4b72687c8df8f3dca5f16b296b4ae5c9fa3084a32a165113d594b71/pandas-2.2.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:739cc70eaf17d57608639e74d63387b0d8594ce02f69e7a0b046f117974b3019", size = 15646609 }, - { url = "https://files.pythonhosted.org/packages/1a/5e/71bb0eef0dc543f7516d9ddeca9ee8dc98207043784e3f7e6c08b4a6b3d9/pandas-2.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9d3558d263073ed95e46f4650becff0c5e1ffe0fc3a015de3c79283dfbdb3df", size = 13040474 }, - { url = "https://files.pythonhosted.org/packages/60/f0/765326197f1759004d07a3e5e060cecfc90fd7af22eadd4cb02ef5e74555/pandas-2.2.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4aa1d8707812a658debf03824016bf5ea0d516afdea29b7dc14cf687bc4d4ec6", size = 16261844 }, - { url = 
"https://files.pythonhosted.org/packages/5f/96/0f208a3f7bb6f930060c1930fe4d2d24ce491d044a6ace1cb6cc52d3a319/pandas-2.2.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:76f27a809cda87e07f192f001d11adc2b930e93a2b0c4a236fde5429527423be", size = 13914313 }, - { url = "https://files.pythonhosted.org/packages/41/a3/349df1721beb447142b8b11e27875a3da00f85d713f1a4bed0afb3a62e14/pandas-2.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:1ba21b1d5c0e43416218db63037dbe1a01fc101dc6e6024bcad08123e48004ab", size = 11610656 }, +sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/70/c853aec59839bceed032d52010ff5f1b8d87dc3114b762e4ba2727661a3b/pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5", size = 12580827 }, + { url = "https://files.pythonhosted.org/packages/99/f2/c4527768739ffa4469b2b4fff05aa3768a478aed89a2f271a79a40eee984/pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348", size = 11303897 }, + { url = "https://files.pythonhosted.org/packages/ed/12/86c1747ea27989d7a4064f806ce2bae2c6d575b950be087837bdfcabacc9/pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed", size = 66480908 }, + { url = "https://files.pythonhosted.org/packages/44/50/7db2cd5e6373ae796f0ddad3675268c8d59fb6076e66f0c339d61cea886b/pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57", size = 13064210 }, + { url = "https://files.pythonhosted.org/packages/61/61/a89015a6d5536cb0d6c3ba02cebed51a95538cf83472975275e28ebf7d0c/pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42", size = 16754292 }, + { url = "https://files.pythonhosted.org/packages/ce/0d/4cc7b69ce37fac07645a94e1d4b0880b15999494372c1523508511b09e40/pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f", size = 14416379 }, + { url = "https://files.pythonhosted.org/packages/31/9e/6ebb433de864a6cd45716af52a4d7a8c3c9aaf3a98368e61db9e69e69a9c/pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645", size = 11598471 }, + { url = "https://files.pythonhosted.org/packages/a8/44/d9502bf0ed197ba9bf1103c9867d5904ddcaf869e52329787fc54ed70cc8/pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039", size = 12602222 }, + { url = "https://files.pythonhosted.org/packages/52/11/9eac327a38834f162b8250aab32a6781339c69afe7574368fffe46387edf/pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd", size = 11321274 }, + { url = "https://files.pythonhosted.org/packages/45/fb/c4beeb084718598ba19aa9f5abbc8aed8b42f90930da861fcb1acdb54c3a/pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698", size = 15579836 }, + { url = 
"https://files.pythonhosted.org/packages/cd/5f/4dba1d39bb9c38d574a9a22548c540177f78ea47b32f99c0ff2ec499fac5/pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc", size = 13058505 }, + { url = "https://files.pythonhosted.org/packages/b9/57/708135b90391995361636634df1f1130d03ba456e95bcf576fada459115a/pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3", size = 16744420 }, + { url = "https://files.pythonhosted.org/packages/86/4a/03ed6b7ee323cf30404265c284cee9c65c56a212e0a08d9ee06984ba2240/pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32", size = 14440457 }, + { url = "https://files.pythonhosted.org/packages/ed/8c/87ddf1fcb55d11f9f847e3c69bb1c6f8e46e2f40ab1a2d2abadb2401b007/pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5", size = 11617166 }, + { url = "https://files.pythonhosted.org/packages/17/a3/fb2734118db0af37ea7433f57f722c0a56687e14b14690edff0cdb4b7e58/pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9", size = 12529893 }, + { url = "https://files.pythonhosted.org/packages/e1/0c/ad295fd74bfac85358fd579e271cded3ac969de81f62dd0142c426b9da91/pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4", size = 11363475 }, + { url = "https://files.pythonhosted.org/packages/c6/2a/4bba3f03f7d07207481fed47f5b35f556c7441acddc368ec43d6643c5777/pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3", size = 15188645 }, + { url = "https://files.pythonhosted.org/packages/38/f8/d8fddee9ed0d0c0f4a2132c1dfcf0e3e53265055da8df952a53e7eaf178c/pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319", size = 12739445 }, + { url = "https://files.pythonhosted.org/packages/20/e8/45a05d9c39d2cea61ab175dbe6a2de1d05b679e8de2011da4ee190d7e748/pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8", size = 16359235 }, + { url = "https://files.pythonhosted.org/packages/1d/99/617d07a6a5e429ff90c90da64d428516605a1ec7d7bea494235e1c3882de/pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a", size = 14056756 }, + { url = "https://files.pythonhosted.org/packages/29/d4/1244ab8edf173a10fd601f7e13b9566c1b525c4f365d6bee918e68381889/pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13", size = 11504248 }, + { url = "https://files.pythonhosted.org/packages/64/22/3b8f4e0ed70644e85cfdcd57454686b9057c6c38d2f74fe4b8bc2527214a/pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015", size = 12477643 }, + { url = "https://files.pythonhosted.org/packages/e4/93/b3f5d1838500e22c8d793625da672f3eec046b1a99257666c94446969282/pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28", size = 11281573 }, + 
{ url = "https://files.pythonhosted.org/packages/f5/94/6c79b07f0e5aab1dcfa35a75f4817f5c4f677931d4234afcd75f0e6a66ca/pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0", size = 15196085 }, + { url = "https://files.pythonhosted.org/packages/e8/31/aa8da88ca0eadbabd0a639788a6da13bb2ff6edbbb9f29aa786450a30a91/pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24", size = 12711809 }, + { url = "https://files.pythonhosted.org/packages/ee/7c/c6dbdb0cb2a4344cacfb8de1c5808ca885b2e4dcfde8008266608f9372af/pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659", size = 16356316 }, + { url = "https://files.pythonhosted.org/packages/57/b7/8b757e7d92023b832869fa8881a992696a0bfe2e26f72c9ae9f255988d42/pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb", size = 14022055 }, + { url = "https://files.pythonhosted.org/packages/3b/bc/4b18e2b8c002572c5a441a64826252ce5da2aa738855747247a971988043/pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d", size = 11481175 }, + { url = "https://files.pythonhosted.org/packages/76/a3/a5d88146815e972d40d19247b2c162e88213ef51c7c25993942c39dbf41d/pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468", size = 12615650 }, + { url = "https://files.pythonhosted.org/packages/9c/8c/f0fd18f6140ddafc0c24122c8a964e48294acc579d47def376fef12bcb4a/pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18", size = 11290177 }, + { url = "https://files.pythonhosted.org/packages/ed/f9/e995754eab9c0f14c6777401f7eece0943840b7a9fc932221c19d1abee9f/pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2", size = 14651526 }, + { url = "https://files.pythonhosted.org/packages/25/b0/98d6ae2e1abac4f35230aa756005e8654649d305df9a28b16b9ae4353bff/pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4", size = 11871013 }, + { url = "https://files.pythonhosted.org/packages/cc/57/0f72a10f9db6a4628744c8e8f0df4e6e21de01212c7c981d31e50ffc8328/pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d", size = 15711620 }, + { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436 }, + { url = "https://files.pythonhosted.org/packages/ca/8c/8848a4c9b8fdf5a534fe2077af948bf53cd713d77ffbcd7bd15710348fd7/pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39", size = 12595535 }, + { url = "https://files.pythonhosted.org/packages/9c/b9/5cead4f63b6d31bdefeb21a679bc5a7f4aaf262ca7e07e2bc1c341b68470/pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30", size = 11319822 }, + { url = "https://files.pythonhosted.org/packages/31/af/89e35619fb573366fa68dc26dad6ad2c08c17b8004aad6d98f1a31ce4bb3/pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c", size = 15625439 }, + { url = "https://files.pythonhosted.org/packages/3d/dd/bed19c2974296661493d7acc4407b1d2db4e2a482197df100f8f965b6225/pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c", size = 13068928 }, + { url = "https://files.pythonhosted.org/packages/31/a3/18508e10a31ea108d746c848b5a05c0711e0278fa0d6f1c52a8ec52b80a5/pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea", size = 16783266 }, + { url = "https://files.pythonhosted.org/packages/c4/a5/3429bd13d82bebc78f4d78c3945efedef63a7cd0c15c17b2eeb838d1121f/pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761", size = 14450871 }, + { url = "https://files.pythonhosted.org/packages/2f/49/5c30646e96c684570925b772eac4eb0a8cb0ca590fa978f56c5d3ae73ea1/pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e", size = 11618011 }, ] [[package]] diff --git a/mkdocs.yml b/mkdocs.yml index 8ff65ffa179..004773e1d26 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -93,6 +93,8 @@ extra: link: https://ourworldindata.org - icon: fontawesome/brands/instagram link: https://instagram.com/ourworldindata + - icon: fontawesome/brands/bluesky + link: https://bsky.app/profile/ourworldindata.org - icon: fontawesome/brands/x-twitter link: https://twitter.com/ourworldindata @@ -112,8 +114,7 @@ markdown_extensions: - pymdownx.critic - pymdownx.emoji: emoji_index: !!python/name:materialx.emoji.twemoji - emoji_generator: - !!python/name:materialx.emoji.to_svg # The Details extension supercharges the Admonition extension, making the resulting call-outs collapsible, allowing them to be opened and closed by the user. ref: https://squidfunk.github.io/mkdocs-material/setup/extensions/python-markdown-extensions/#details + emoji_generator: !!python/name:materialx.emoji.to_svg # The Details extension supercharges the Admonition extension, making the resulting call-outs collapsible, allowing them to be opened and closed by the user. 
ref: https://squidfunk.github.io/mkdocs-material/setup/extensions/python-markdown-extensions/#details - pymdownx.details @@ -149,9 +150,12 @@ plugins: - git-authors: show_email_address: false # authorship_threshold_percent: 1 - # show_contribution: true + show_contribution: true # show_line_count: true # count_empty_lines: true + ignore_authors: + - owidbot + sort_authors_by: contribution - git-revision-date-localized - tags: tags_file: tags.md @@ -205,23 +209,34 @@ nav: - Contributing: "contributing.md" - Guides: - "guides/index.md" - - Data work: + - Adding data: - "guides/data-work/index.md" - - Adding data: "guides/data-work/add-data.md" + - New data: "guides/data-work/add-data.md" - Updating data: "guides/data-work/update-data.md" - Update charts: "guides/data-work/update-charts.md" - - Wizard: "guides/wizard.md" - - CLI: "guides/etl-cli.md" - - Harmonize country names: "guides/harmonize-countries.md" - - Using different environments: "guides/environment.md" - - Staging servers: "guides/staging-servers.md" - - Private dataset import to ETL: "guides/private-import.md" - - Automate regular updates: "guides/auto-regular-updates.md" - - Backport a dataset to ETL: "guides/backport.md" - - Metadata in data pages: "guides/metadata-play.md" - - Edit the documentation: "dev/docs.md" - - OpenAI setup: "guides/openai.md" - - Sharing with external people: "guides/sharing-external.md" + - Export data: "guides/data-work/export-data.md" + - Admin: + - Wizard: "guides/wizard.md" + - CLI: "guides/etl-cli.md" + - Tools: + - Harmonize country names: "guides/harmonize-countries.md" + - Backport from database: "guides/backport.md" + - Regular updates: "guides/auto-regular-updates.md" + - Pull requests: "guides/pull-requests.md" + - Servers & settings: + - Upgrade Python version: "guides/upgrade-python-version.md" + - Environments: "guides/environment.md" + - Staging servers: "guides/staging-servers.md" + - Public servers: "guides/sharing-external.md" + - Private datasets: "guides/private-import.md" + - Types in Tables: "guides/types-tables.md" + - OpenAI setup: "guides/openai.md" + + - Tips when working with ETL: "guides/etl-tips.md" + + - Others: + - Edit the documentation: "dev/docs.md" + - Metadata in data pages: "guides/metadata-play.md" - Design principles: - Design principles & workflow: architecture/index.md diff --git a/pyproject.toml b/pyproject.toml index 0f976621e21..462eaa42eea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "" authors = [ {name = "Our World in Data", email = "tech@ourworldindata.org"}, ] -requires-python = ">=3.10, <3.12" +requires-python = ">=3.10, <3.13" dependencies = [ "click>=8.0.1", "PyYAML>=6.0.1", @@ -32,8 +32,7 @@ dependencies = [ "ruamel.yaml>=0.17.21", "gitpython>=3.1.30", "rapidfuzz>=2.13.7", - # pinned because of frictionless, remove this pin when frictionless is updated - "fsspec==2022.11.0", + "fsspec>=2022.11.0", "openai>=1.3.6", "pdfplumber>=0.9.0", "pyhumps>=3.8.0", @@ -51,21 +50,22 @@ dependencies = [ "rioxarray>=0.15.1", "html2text>=2020.1.16", "pygithub>=2.3.0", - "pandas==2.2.2", + "pandas==2.2.3", "sqlalchemy>=2.0.30", "pymysql>=1.1.1", "tiktoken>=0.7.0", "earthengine-api>=0.1.411", "python-docx>=1.1.2", "h5netcdf>=1.3.0", - # we're not yet ready for frictionless 5.x.x as it raises an error for data://open_numbers/open_numbers/latest/bp__energy - "frictionless[pandas]>=4.40.8, <5.0.0", + "frictionless[pandas]>=5.0.3", "owid-catalog", "owid-datautils", "owid-repack", "walden", "deprecated>=1.2.14", "scikit-learn>=1.5.2", + 
"geopy>=2.4.1", + "py7zr>=0.22.0", ] [tool.uv.sources] @@ -92,10 +92,11 @@ dev-dependencies = [ "boto3-stubs[s3]>=1.34.154", "gspread>=5.12.4", "jsonref>=1.1.0", + "mkdocs-material>=9.5.34", "mkdocs-jupyter>=0.24.8", "mkdocs-exclude>=1.0.2", "mkdocs-gen-files>=0.5.0", - "mkdocs-git-authors-plugin>=0.7.2", + "mkdocs-git-authors-plugin>=0.9.2", "mkdocs-git-revision-date-localized-plugin>=1.2.6", "mkdocs-click>=0.8.1", "mkdocs-glightbox>=0.3.7", @@ -117,7 +118,7 @@ api = [ "joblib>=1.3.2", ] wizard = [ - "streamlit>=1.39.0", + "streamlit>=1.40.0", "streamlit-aggrid>=0.3.4.post3", "streamlit-ace>=0.1.1", "streamlit-extras>=0.3.6", @@ -127,12 +128,19 @@ wizard = [ "pyproj>=3.6.1", "streamlit-feedback>=0.1.3", "statsmodels>=0.14.4", + # Starting from PyTorch 2.3.0, the minimum requirement for macOS is macOS 11.0+ ARM64. Some poor people on the team still use it. Jeez... + # Torch doesn't work with python 3.13 + # error: distribution torch==2.2.2 @ registry+https://pypi.org/simple can't be installed because it doesn't have a source distribution or wheel for the current platform + "torch<2.3.0", + "sentence-transformers>=2.2.2", + "moviepy>=2.1.1", ] [project.scripts] etl = 'apps.cli:cli' etlwiz = 'apps.wizard.cli:cli' etlr = 'etl.command:main_cli' +etlp = 'apps.pr.cli:cli' etl-wizard = 'apps.wizard.cli:cli' compare = 'etl.compare:cli' backport = 'apps.backport.backport:backport_cli' @@ -157,8 +165,6 @@ build-backend = "hatchling.build" [tool.pyright] exclude = [ "lib/", - "etl/steps/archive", - "etl/snapshots/archive", "apps/wizard/etl_steps/cookiecutter/", "apps/wizard/etl_steps/cookiecutter/snapshot/**", "**/node_modules", diff --git a/schemas/dataset-schema.json b/schemas/dataset-schema.json index 1c7f117b72c..98f56ba4b7b 100644 --- a/schemas/dataset-schema.json +++ b/schemas/dataset-schema.json @@ -1072,11 +1072,6 @@ "default": false, "description": "Exclude entities that do not belong in any color group" }, - "hasChartTab": { - "type": "boolean", - "default": true, - "description": "Whether to show the (non-map) chart tab" - }, "data": { "type": "object", "description": "Obsolete name - used only to store the available entities", @@ -1287,21 +1282,24 @@ ] ] }, - "type": { - "type": "string", - "description": "Which type of chart should be shown (hasMapChart can be used to always also show a map chart)", - "default": "LineChart", - "enum": [ - "LineChart", - "ScatterPlot", - "StackedArea", - "DiscreteBar", - "StackedDiscreteBar", - "SlopeChart", - "StackedBar", - "WorldMap", - "Marimekko" - ] + "chartTypes": { + "type": "array", + "description": "Which types of chart should be shown", + "default": ["LineChart"], + "items": { + "type": "string", + "enum": [ + "LineChart", + "ScatterPlot", + "StackedArea", + "DiscreteBar", + "StackedDiscreteBar", + "SlopeChart", + "StackedBar", + "WorldMap", + "Marimekko" + ] + } }, "hasMapTab": { "type": "boolean", diff --git a/schemas/definitions.json b/schemas/definitions.json index f1b0ed711c2..42a7b2810d3 100644 --- a/schemas/definitions.json +++ b/schemas/definitions.json @@ -663,7 +663,19 @@ "description": "We keep display for the time being as the 'less powerful sibling' of grapher config.", "properties": { "isProjection": { - "type": "boolean", + "oneOf": [ + { + "type": "boolean" + }, + { + "type": "string", + "pattern": "<%" + }, + { + "type": "string", + "pattern": "^\\{.*\\}$" + } + ], "default": false, "description": "Indicates if this time series is a forward projection (if so then this is rendered differently in e.g. line charts)." 
}, diff --git a/schemas/multidim-schema.json b/schemas/multidim-schema.json index c11d4f366d4..6a4817ff093 100644 --- a/schemas/multidim-schema.json +++ b/schemas/multidim-schema.json @@ -139,8 +139,8 @@ "type": "string", "description": "Url of the concrete schema version to use to validate this document", "format": "uri", - "default": "https://files.ourworldindata.org/schemas/grapher-schema.005.json", - "const": "https://files.ourworldindata.org/schemas/grapher-schema.005.json" + "default": "https://files.ourworldindata.org/schemas/grapher-schema.006.json", + "const": "https://files.ourworldindata.org/schemas/grapher-schema.006.json" }, "id": { "type": "integer", @@ -267,11 +267,6 @@ "default": false, "description": "Exclude entities that do not belong in any color group" }, - "hasChartTab": { - "type": "boolean", - "default": true, - "description": "Whether to show the (non-map) chart tab" - }, "hideLegend": { "type": "boolean", "default": false @@ -564,20 +559,23 @@ "type": "string", "description": "Big title text of the chart" }, - "type": { - "type": "string", - "description": "Which type of chart should be shown (hasMapChart can be used to always also show a map chart)", - "default": "LineChart", - "enum": [ - "LineChart", - "ScatterPlot", - "StackedArea", - "DiscreteBar", - "StackedDiscreteBar", - "SlopeChart", - "StackedBar", - "Marimekko" - ] + "chartTypes": { + "type": "array", + "description": "Which types of chart should be shown", + "default": ["LineChart"], + "items": { + "type": "string", + "enum": [ + "LineChart", + "ScatterPlot", + "StackedArea", + "DiscreteBar", + "StackedDiscreteBar", + "SlopeChart", + "StackedBar", + "Marimekko" + ] + } }, "hasMapTab": { "type": "boolean", @@ -960,4 +958,4 @@ "additionalProperties": false } } -} \ No newline at end of file +} diff --git a/snapshots/antibiotics/2024-10-18/who_glass.py b/snapshots/antibiotics/2024-10-18/who_glass.py new file mode 100644 index 00000000000..af5e7d20118 --- /dev/null +++ b/snapshots/antibiotics/2024-10-18/who_glass.py @@ -0,0 +1,117 @@ +"""Script to create a snapshot of dataset.""" + +import os +import tempfile +import time +import zipfile +from pathlib import Path + +import click +import requests +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait +from structlog import get_logger + +from etl.snapshot import Snapshot + +log = get_logger() +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/who_glass.zip") + file_path = get_shiny_data() + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload, filename=file_path) + os.remove(file_path) + + +def get_shiny_data() -> str: + """ + Get data from https://worldhealthorg.shinyapps.io/glass-dashboard/_w_679389fb/#!/amr + Specifically - Global maps of testing coverage by infectious syndrome. + This script downloads data for multiple years and syndromes and stores them in a zip file. 
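+    Note: the dashboard is a Shiny app, so this function drives the year and syndrome dropdowns with Selenium and follows the download link that the app generates for each year/syndrome combination.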
+ """ + + years = range(2016, 2023) + syndromes = ["URINE", "BLOOD", "STOOL", "UROGENITAL"] + + # Set up the driver (ensure you have ChromeDriver or another driver installed) + driver = webdriver.Chrome() + wait = WebDriverWait(driver, 20) + + with tempfile.TemporaryDirectory() as temp_dir: + for syndrome in syndromes: + log.info(f"Downloading data for syndrome: {syndrome}") + for year in years: + log.info(f"Downloading data for year: {year}") + + # Open the webpage + driver.get("https://worldhealthorg.shinyapps.io/glass-dashboard/_w_679389fb/#!/amr") + + # Scroll to the section where the dropdowns are located + section = wait.until(EC.presence_of_element_located((By.ID, "plot-amr-3"))) + time.sleep(1) + driver.execute_script("arguments[0].scrollIntoView(true);", section) + time.sleep(1) + # Wait for the year dropdown to become visible and interactable + year_dropdown = wait.until( + EC.presence_of_element_located((By.XPATH, '//*[@id="amr-gc_infsys-year-select-selectized"]')) + ) + driver.execute_script("arguments[0].click();", year_dropdown) + time.sleep(1) + + # Select the year + option_year = wait.until(EC.element_to_be_clickable((By.XPATH, f'//div[@data-value="{year}"]'))) + driver.execute_script("arguments[0].click();", option_year) + time.sleep(1) + + # Wait for the syndrome dropdown to become visible and interactable + syndrome_dropdown = wait.until( + EC.presence_of_element_located((By.XPATH, '//*[@id="amr-gc_infsys-infsys-select-selectized"]')) + ) + driver.execute_script("arguments[0].click();", syndrome_dropdown) + + # Select the syndrome + option_syndrome = wait.until(EC.element_to_be_clickable((By.XPATH, f'//div[@data-value="{syndrome}"]'))) + driver.execute_script("arguments[0].click();", option_syndrome) + + # Trigger the download button + download_link_element = wait.until( + EC.presence_of_element_located((By.XPATH, '//*[@id="amr-gc_infsys-dl-data"]')) + ) + + # Get the href attribute of the download link + download_link = download_link_element.get_attribute("href") + + if download_link: + response = requests.get(download_link) + file_path = os.path.join(temp_dir, f"{syndrome}_{year}.csv") + # Save the CSV content to a file in the temporary directory + with open(file_path, "wb") as file: + file.write(response.content) + log.info(f"Downloaded {syndrome}_{year}.csv to {file_path}") + else: + log.error(f"No download link found for {syndrome} in {year}.") + + # Zip all downloaded files + zip_file_path = "downloaded_data.zip" + with zipfile.ZipFile(zip_file_path, "w") as zipf: + for foldername, subfolders, filenames in os.walk(temp_dir): + for filename in filenames: + file_path = os.path.join(foldername, filename) + zipf.write(file_path, os.path.basename(file_path)) + + driver.quit() + return zip_file_path + + +if __name__ == "__main__": + main() diff --git a/snapshots/antibiotics/2024-10-18/who_glass.zip.dvc b/snapshots/antibiotics/2024-10-18/who_glass.zip.dvc new file mode 100644 index 00000000000..2e245ff7341 --- /dev/null +++ b/snapshots/antibiotics/2024-10-18/who_glass.zip.dvc @@ -0,0 +1,33 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Global Antimicrobial Resistance and Use Surveillance System (GLASS) + description: |- + GLASS provides a standardized approach to the collection, analysis, interpretation and sharing of data by countries and seeks to actively support capacity building and monitor the status of existing and new national surveillance systems. 
Furthermore, GLASS promotes a shift from surveillance approaches based solely on laboratory data to a system that includes epidemiological, clinical, and population-level data. GLASS has been conceived to progressively incorporate data from surveillance of AMR in humans, such as monitoring of resistance and the use of antimicrobial medicines, including AMR in the food chain and in the environment. + date_published: "2024-09-16" + title_snapshot: Global AMR data + description_snapshot: |- + The global AMR data dashboard describes the implementation status, quality assurance, and standards of national AMR surveillance systems in 2022. It also describes the progress in global coverage of GLASS-AMR since its initiation (2016), and presents global antibiotic resistance estimates for the latest available calendar year (2022). Filters allow users to also access the estimates from the previous years (2020-2021). The main unit of observation for AMR data are bacteriologically confirmed infections (BCIs) with interpretable antibiotic susceptibility test (AST) results. The latter are presented for infectious syndromes, bacterial pathogens and antibiotics under surveillance. The reader is referred to previous report editions and the manual for early implementation for a detailed description of GLASS methods. The 2024 GLASS report uses mid-year population estimates of the United Nations World Population Prospects 2024, to calculate testing coverage per million population each year. + + # Citation + producer: World Health Organization + citation_full: |- + Global AMR data - Global Antimicrobial Resistance and Use Surveillance System (GLASS), World Health Organization (2024) + + attribution_short: WHO + + # Files + url_main: https://worldhealthorg.shinyapps.io/glass-dashboard/_w_679389fb/#!/amr + date_accessed: 2024-10-18 + + # License + license: + name: © 2024 WHO + url: https://www.who.int/about/policies/terms-of-use + +outs: + - md5: b1c4d820b37417b77214db814348233f + size: 55169 + path: who_glass.zip diff --git a/snapshots/antibiotics/2024-10-18/who_glass_by_antibiotic.py b/snapshots/antibiotics/2024-10-18/who_glass_by_antibiotic.py new file mode 100644 index 00000000000..cc02bd701bd --- /dev/null +++ b/snapshots/antibiotics/2024-10-18/who_glass_by_antibiotic.py @@ -0,0 +1,39 @@ +"""Script to create a snapshot of dataset. + +To download the data, visit: + +https://worldhealthorg.shinyapps.io/glass-dashboard/_w_679389fb/#!/amr + +and go to the section called 'Global maps of testing coverage by bacterial pathogen and antibiotic group'. + +Download each slice of the data (this is quite time-consuming, but I tried and failed to automate it using Selenium; Shiny apps are not fun!) into a folder structured like: bloodstream/acinetobacter_spp/carbapenems/2022.csv + +Then zip this up and upload the file to snapshot. + + +""" + + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/who_glass_by_antibiotic.zip") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3.
+ snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/antibiotics/2024-10-18/who_glass_by_antibiotic.zip.dvc b/snapshots/antibiotics/2024-10-18/who_glass_by_antibiotic.zip.dvc new file mode 100644 index 00000000000..b52081a7cc3 --- /dev/null +++ b/snapshots/antibiotics/2024-10-18/who_glass_by_antibiotic.zip.dvc @@ -0,0 +1,32 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Global Antimicrobial Resistance and Use Surveillance System (GLASS) - by antibiotic + description: |- + GLASS provides a standardized approach to the collection, analysis, interpretation and sharing of data by countries and seeks to actively support capacity building and monitor the status of existing and new national surveillance systems. Furthermore, GLASS promotes a shift from surveillance approaches based solely on laboratory data to a system that includes epidemiological, clinical, and population-level data. GLASS has been conceived to progressively incorporate data from surveillance of AMR in humans, such as monitoring of resistance and the use of antimicrobial medicines, including AMR in the food chain and in the environment. + date_published: "2024-09-16" + title_snapshot: Global AMR data + description_snapshot: |- + The global AMR data dashboard describes the implementation status, quality assurance, and standards of national AMR surveillance systems in 2022. It also describes the progress in global coverage of GLASS-AMR since its initiation (2016), and presents global antibiotic resistance estimates for the latest available calendar year (2022). Filters allow users to also access the estimates from the previous years (2020-2021). The main unit of observation for AMR data are bacteriologically confirmed infections (BCIs) with interpretable antibiotic susceptibility test (AST) results. The latter are presented for infectious syndromes, bacterial pathogens and antibiotics under surveillance. The reader is referred to previous report editions and the manual for early implementation for a detailed description of GLASS methods. The 2024 GLASS report uses mid-year population estimates of the United Nations World Population Prospects 2024, to calculate testing coverage per million population each year. + + # Citation + producer: World Health Organization + citation_full: |- + Global AMR data - Global Antimicrobial Resistance and Use Surveillance System (GLASS), World Health Organization (2024) + + attribution_short: WHO + + # Files + url_main: https://worldhealthorg.shinyapps.io/glass-dashboard/_w_679389fb/#!/amr + date_accessed: 2024-10-21 + + # License + license: + name: © 2024 WHO + url: https://www.who.int/about/policies/terms-of-use +outs: + - md5: 6a875574098f305ecfb6a564e64a5a30 + size: 312785 + path: who_glass_by_antibiotic.zip diff --git a/snapshots/antibiotics/2024-11-12/antimicrobial_usage.py b/snapshots/antibiotics/2024-11-12/antimicrobial_usage.py new file mode 100644 index 00000000000..ab102d101d2 --- /dev/null +++ b/snapshots/antibiotics/2024-11-12/antimicrobial_usage.py @@ -0,0 +1,24 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. 
+SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/antimicrobial_usage.xlsx") + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/antibiotics/2024-11-12/antimicrobial_usage.xlsx.dvc b/snapshots/antibiotics/2024-11-12/antimicrobial_usage.xlsx.dvc new file mode 100644 index 00000000000..c7a9190de57 --- /dev/null +++ b/snapshots/antibiotics/2024-11-12/antimicrobial_usage.xlsx.dvc @@ -0,0 +1,31 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: WHO GLASS + description: |- + Inappropriate use of antimicrobials in humans is a primary driver of antimicrobial resistance (AMR) emergence and spread. In 2020, WHO launched GLASS antimicrobial use (GLASS AMU), previously called GLASS AMC, to monitor the quantity and types of antimicrobials used at the national and global levels. WHO invites Countries, Territories and Areas (CTAs) to enrol in GLASS AMU and commit to building or strengthening their national AMU surveillance system and, when ready, to reporting their national AMU data. Data calls are opened every year. + date_published: "2024-09-16" + title_snapshot: WHO GLASS - Global Antimicrobial Use data + + # Citation + producer: WHO Global Antimicrobial Resistance and Use Surveillance System (GLASS) + citation_full: |- + Antimicrobial Resistance and Use Surveillance System (GLASS) 2024: Antimicrobial Use data contextual information and antimicrobial use estimates by ATC4 subgroup and AWaRe, 2016-2022. Geneva, World Health Organization; 2024. + attribution_short: WHO GLASS + + # Files + url_main: https://worldhealthorg.shinyapps.io/glass-dashboard/_w_053a572c/#!/amu + url_download: https://worldhealthorg.shinyapps.io/glass-dashboard/_w_053a572c/data/global/GLASS-AMU_2016-2022_dataset.xlsx + date_accessed: 2024-11-12 + + # License + license: + name: © 2024 WHO + url: https://www.who.int/about/policies/terms-of-use + +outs: + - md5: 8ccea8361fd446d6aa95804c3449e4e5 + size: 1370160 + path: antimicrobial_usage.xlsx diff --git a/snapshots/antibiotics/2024-11-15/testing_coverage.py b/snapshots/antibiotics/2024-11-15/testing_coverage.py new file mode 100644 index 00000000000..92854b7640b --- /dev/null +++ b/snapshots/antibiotics/2024-11-15/testing_coverage.py @@ -0,0 +1,35 @@ +"""Script to create a snapshot of dataset. + +To download the data, visit: + +1. https://worldhealthorg.shinyapps.io/glass-dashboard/_w_c75b737c/#!/amr + +2. Scroll to the 'Testing coverage by infectious syndrome' section and download the data for each region. + +3. Then zip this up and upload the file to snapshot. + +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot.
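+    # The URI below resolves to the matching metadata file added alongside this script (snapshots/antibiotics/2024-11-15/testing_coverage.zip.dvc), which records the origin, license and file checksum.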
+ snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/testing_coverage.zip") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/antibiotics/2024-11-15/testing_coverage.zip.dvc b/snapshots/antibiotics/2024-11-15/testing_coverage.zip.dvc new file mode 100644 index 00000000000..d7d76640fc1 --- /dev/null +++ b/snapshots/antibiotics/2024-11-15/testing_coverage.zip.dvc @@ -0,0 +1,29 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Global Antimicrobial Resistance and Use Surveillance System (GLASS) - Testing Coverage + description: |- + GLASS provides a standardized approach to the collection, analysis, interpretation and sharing of data by countries and seeks to actively support capacity building and monitor the status of existing and new national surveillance systems. Furthermore, GLASS promotes a shift from surveillance approaches based solely on laboratory data to a system that includes epidemiological, clinical, and population-level data. GLASS has been conceived to progressively incorporate data from surveillance of AMR in humans, such as monitoring of resistance and the use of antimicrobial medicines, including AMR in the food chain and in the environment. + date_published: "2024-09-16" + + # Citation + producer: World Health Organization + citation_full: |- + Global AMR data - Global Antimicrobial Resistance and Use Surveillance System (GLASS), World Health Organization (2024) + attribution_short: WHO + + # Files + url_main: https://worldhealthorg.shinyapps.io/glass-dashboard/_w_679389fb/#!/amr + date_accessed: 2024-11-15 + + # License + license: + name: © 2024 WHO + url: https://www.who.int/about/policies/terms-of-use + +outs: + - md5: 89f2880de706ea7a31a5da8a22ca6fc1 + size: 17256 + path: testing_coverage.zip diff --git a/snapshots/antibiotics/2024-11-20/microbe.py b/snapshots/antibiotics/2024-11-20/microbe.py new file mode 100644 index 00000000000..3332fd8a741 --- /dev/null +++ b/snapshots/antibiotics/2024-11-20/microbe.py @@ -0,0 +1,25 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/microbe.zip") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. 
+ snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/antibiotics/2024-11-20/microbe.zip.dvc b/snapshots/antibiotics/2024-11-20/microbe.zip.dvc new file mode 100644 index 00000000000..8593fe51b47 --- /dev/null +++ b/snapshots/antibiotics/2024-11-20/microbe.zip.dvc @@ -0,0 +1,31 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Neonatal deaths by pathogen + description: |- + The MICROBE (Measuring Infectious Causes and Resistance Outcomes for Burden Estimation) tool visualizes the fatal and nonfatal health outcomes of infections, pathogens, and antimicrobial resistance across different countries and regions. The tool shows estimates from a novel estimation method, [published in The Lancet](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)01867-1/fulltext), including the burden of infections and their underlying pathogens, as well as the burden of pathogens that are both susceptible and resistant to antibiotics. This tool is useful for understanding the burden of these outcomes, as well as illustrating how they nest together. The tabs explore different health outcomes by geography, age and sex. All tabs include a bar visualization for comparison, as well as a map view for a global perspective. + date_published: "2024-09-28" + + # Citation + producer: Institute for Health Metrics and Evaluation (IHME); University of Oxford + citation_full: |- + Institute for Health Metrics and Evaluation (IHME), University of Oxford. MICROBE. Seattle, WA: IHME, University of Washington, 2024. Available from [https://vizhub.healthdata.org/microbe](https://vizhub.healthdata.org/microbe) + attribution_short: MICROBE + + # Files + url_main: https://vizhub.healthdata.org/microbe/ + date_accessed: 2024-11-20 + + # License + license: + name: IHME's Free-of-Charge Non-commercial User Agreement + url: https://www.healthdata.org/Data-tools-practices/data-practices/ihme-free-charge-non-commercial-user-agreement + + + is_public: false +outs: + - md5: e8d3cdcd212a28c245ef9d9539fe8c8a + size: 118009 + path: microbe.zip diff --git a/snapshots/antibiotics/2024-11-20/pathogen_bloodstream.csv.dvc b/snapshots/antibiotics/2024-11-20/pathogen_bloodstream.csv.dvc new file mode 100644 index 00000000000..7850fa88488 --- /dev/null +++ b/snapshots/antibiotics/2024-11-20/pathogen_bloodstream.csv.dvc @@ -0,0 +1,31 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Neonatal deaths from bloodstream infections by pathogen + description: |- + The MICROBE (Measuring Infectious Causes and Resistance Outcomes for Burden Estimation) tool visualizes the fatal and nonfatal health outcomes of infections, pathogens, and antimicrobial resistance across different countries and regions. The tool shows estimates from a novel estimation method, [published in The Lancet](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)01867-1/fulltext), including the burden of infections and their underlying pathogens, as well as the burden of pathogens that are both susceptible and resistant to antibiotics. This tool is useful for understanding the burden of these outcomes, as well as illustrating how they nest together. The tabs explore different health outcomes by geography, age and sex. All tabs include a bar visualization for comparison, as well as a map view for a global perspective.
+ date_published: "2024-09-28" + + # Citation + producer: Institute for Health Metrics and Evaluation (IHME); University of Oxford + citation_full: |- + Institute for Health Metrics and Evaluation (IHME), University of Oxford. MICROBE. Seattle, WA: IHME, University of Washington, 2024. Available from [https://vizhub.healthdata.org/microbe](https://vizhub.healthdata.org/microbe) + attribution_short: MICROBE + + # Files + url_main: https://vizhub.healthdata.org/microbe/ + date_accessed: 2024-11-20 + + # License + license: + name: IHME's Free-of-Charge Non-commercial User Agreement + url: https://www.healthdata.org/Data-tools-practices/data-practices/ihme-free-charge-non-commercial-user-agreement + + + is_public: false +outs: + - md5: 72e4f5e570394ca8204e628e3a8f1ca3 + size: 6467 + path: pathogen_bloodstream.csv diff --git a/snapshots/antibiotics/2024-11-20/pathogen_bloodstream.py b/snapshots/antibiotics/2024-11-20/pathogen_bloodstream.py new file mode 100644 index 00000000000..db667fe903f --- /dev/null +++ b/snapshots/antibiotics/2024-11-20/pathogen_bloodstream.py @@ -0,0 +1,25 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/pathogen_bloodstream.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/antibiotics/2024-11-24/total_syndrome.csv.dvc b/snapshots/antibiotics/2024-11-24/total_syndrome.csv.dvc new file mode 100644 index 00000000000..06b71675cf5 --- /dev/null +++ b/snapshots/antibiotics/2024-11-24/total_syndrome.csv.dvc @@ -0,0 +1,33 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Neonatal deaths from infections by syndrome + description: |- + The MICROBE (Measuring Infectious Causes and Resistance Outcomes for Burden Estimation) tool visualizes the fatal and nonfatal health outcomes of infections, pathogens, and antimicrobial resistance across different countries and regions. The tool shows estimates from a novel estimation method, [published in The Lancet](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)01867-1/fulltext), including the burden of infections and their underlying pathogens, as well as the burden of pathogens that are both susceptible and resistant to antibiotics. This tool is useful for understanding the burden of these outcomes, as well as illustrating how they nest together. The tabs explore different health outcomes by geography, age and sex. All tabs include a bar visualization for comparison, as well as a map view for a global perspective. + + date_published: "2024-09-28" + + # Citation + producer: Institute for Health Metrics and Evaluation (IHME); University of Oxford + citation_full: |- + Institute for Health Metrics and Evaluation (IHME), University of Oxford. MICROBE. Seattle, WA: IHME, University of Washington, 2024.
Available from [https://vizhub.healthdata.org/microbe](https://vizhub.healthdata.org/microbe) + + attribution_short: MICROBE + + # Files + url_main: https://vizhub.healthdata.org/microbe/ + date_accessed: 2024-11-24 + + # License + license: + name: IHME's Free-of-Charge Non-commercial User Agreement + url: https://www.healthdata.org/Data-tools-practices/data-practices/ihme-free-charge-non-commercial-user-agreement + + + is_public: false +outs: + - md5: bb68023abf4d3344e1d62919af169628 + size: 3320 + path: total_syndrome.csv diff --git a/snapshots/antibiotics/2024-11-24/total_syndrome.py b/snapshots/antibiotics/2024-11-24/total_syndrome.py new file mode 100644 index 00000000000..4aa7d7ce79b --- /dev/null +++ b/snapshots/antibiotics/2024-11-24/total_syndrome.py @@ -0,0 +1,25 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/total_syndrome.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/antibiotics/2024-12-02/microbe_amr.csv.dvc b/snapshots/antibiotics/2024-12-02/microbe_amr.csv.dvc new file mode 100644 index 00000000000..3e75a57c69e --- /dev/null +++ b/snapshots/antibiotics/2024-12-02/microbe_amr.csv.dvc @@ -0,0 +1,31 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Neonatal deaths from infections attributed to antimicrobial resistance by syndrome + description: |- + The MICROBE (Measuring Infectious Causes and Resistance Outcomes for Burden Estimation) tool visualizes the fatal and nonfatal health outcomes of infections, pathogens, and antimicrobial resistance across different countries and regions. The tool shows estimates from a novel estimation method, [published in The Lancet](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)01867-1/fulltext), including the burden of infections and their underlying pathogens, as well as the burden of pathogens that are both susceptible and resistant to antibiotics. This tool is useful for understanding the burden of these outcomes, as well as illustrating how they nest together. The tabs explore different health outcomes by geography, age and sex. All tabs include a bar visualization for comparison, as well as a map view for a global perspective. + date_published: "2024-09-28" + + # Citation + producer: Institute for Health Metrics and Evaluation (IHME); University of Oxford + citation_full: |- + Institute for Health Metrics and Evaluation (IHME), University of Oxford. MICROBE. Seattle, WA: IHME, University of Washington, 2024.
diff --git a/snapshots/antibiotics/2024-12-02/microbe_amr.csv.dvc b/snapshots/antibiotics/2024-12-02/microbe_amr.csv.dvc
new file mode 100644
index 00000000000..3e75a57c69e
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-02/microbe_amr.csv.dvc
@@ -0,0 +1,31 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Neonatal deaths from infections attributed to antimicrobial resistance by syndrome
+    description: |-
+      The MICROBE (Measuring Infectious Causes and Resistance Outcomes for Burden Estimation) tool visualizes the fatal and nonfatal health outcomes of infections, pathogens, and antimicrobial resistance across different countries and regions. The tool shows a novel estimation method, [published in The Lancet](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)01867-1/fulltext), including the burden of infections and their underlying pathogens, as well as the burden of pathogens that are both susceptible and resistant to antibiotics. This tool is useful for understanding the burden of these outcomes, as well as illustrating how they nest together. The tabs explore different health outcomes by geography, age and sex. All tabs include a bar visualization for comparison, as well as a map view for a global perspective.
+    date_published: "2024-09-28"
+
+    # Citation
+    producer: Institute for Health Metrics and Evaluation (IHME); University of Oxford
+    citation_full: |-
+      Institute for Health Metrics and Evaluation (IHME), University of Oxford. MICROBE. Seattle, WA: IHME, University of Washington, 2024. Available from [https://vizhub.healthdata.org/microbe](https://vizhub.healthdata.org/microbe)
+    attribution_short: MICROBE
+
+    # Files
+    url_main: https://vizhub.healthdata.org/microbe/
+    date_accessed: 2024-12-02
+
+    # License
+    license:
+      name: IHME's Free-of-Charge Non-commercial User Agreement
+      url: https://www.healthdata.org/Data-tools-practices/data-practices/ihme-free-charge-non-commercial-user-agreement
+
+
+  is_public: false
+outs:
+  - md5: 0236cab87d4198c1ddb914b828c36eb0
+    size: 3594
+    path: microbe_amr.csv
diff --git a/snapshots/antibiotics/2024-12-02/microbe_amr.py b/snapshots/antibiotics/2024-12-02/microbe_amr.py
new file mode 100644
index 00000000000..be5d1cd965e
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-02/microbe_amr.py
@@ -0,0 +1,44 @@
+"""Script to create a snapshot of dataset.
+
+To access the data for this snapshot, follow these steps:
+
+- Go to https://vizhub.healthdata.org/microbe/
+- Click on Antimicrobial Resistance in the top tab
+- Select these options:
+  - Category: Syndromes
+  - Burden: By resistance
+  - Location: Global
+  - Age: Neonatal
+  - Counterfactual: Both (or Attributable)
+  - Year: 2021 (or whatever latest year is available)
+  - Measure: Deaths
+  - Metric: Number
+
+Download the file and upload it using the script below.
+
+
+"""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+@click.option("--path-to-file", "-f", prompt=True, type=str, help="Path to local data file.")
+def main(path_to_file: str, upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/microbe_amr.csv")
+
+    # Copy local data file to snapshots data folder, add file to DVC and upload to S3.
+    snap.create_snapshot(filename=path_to_file, upload=upload)
+
+
+if __name__ == "__main__":
+    main()
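Because each of these MICROBE exports is assembled by hand in the vizhub UI, a quick look at the downloaded CSV before snapshotting can catch a wrong menu selection. A minimal sketch (the path is illustrative, and no particular column layout is assumed):

    import pandas as pd

    # Hypothetical local path; inspect whatever the MICROBE tool downloaded.
    df = pd.read_csv("~/Downloads/microbe_amr.csv")
    print(df.shape)
    print(df.columns.tolist())
    print(df.head())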
diff --git a/snapshots/antibiotics/2024-12-02/microbe_neonatal_amr.csv.dvc b/snapshots/antibiotics/2024-12-02/microbe_neonatal_amr.csv.dvc
new file mode 100644
index 00000000000..c1e29c03deb
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-02/microbe_neonatal_amr.csv.dvc
@@ -0,0 +1,30 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Neonatal deaths from bloodstream infections attributed to antimicrobial resistance by pathogen
+    description: |-
+      The MICROBE (Measuring Infectious Causes and Resistance Outcomes for Burden Estimation) tool visualizes the fatal and nonfatal health outcomes of infections, pathogens, and antimicrobial resistance across different countries and regions. The tool shows a novel estimation method, [published in The Lancet](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)01867-1/fulltext), including the burden of infections and their underlying pathogens, as well as the burden of pathogens that are both susceptible and resistant to antibiotics. This tool is useful for understanding the burden of these outcomes, as well as illustrating how they nest together. The tabs explore different health outcomes by geography, age and sex. All tabs include a bar visualization for comparison, as well as a map view for a global perspective.
+
+    date_published: "2024-09-28"
+
+    # Citation
+    producer: Institute for Health Metrics and Evaluation (IHME); University of Oxford
+    citation_full: |-
+      Institute for Health Metrics and Evaluation (IHME), University of Oxford. MICROBE. Seattle, WA: IHME, University of Washington, 2024. Available from [https://vizhub.healthdata.org/microbe](https://vizhub.healthdata.org/microbe)
+    attribution_short: MICROBE
+    # Files
+    url_main: https://vizhub.healthdata.org/microbe/
+    date_accessed: 2024-12-02
+
+    # License
+    license:
+      name: IHME's Free-of-Charge Non-commercial User Agreement
+      url: https://www.healthdata.org/Data-tools-practices/data-practices/ihme-free-charge-non-commercial-user-agreement
+
+  is_public: false
+outs:
+  - md5: 1e718707c8e293ef851d7baf0fef36a1
+    size: 3014
+    path: microbe_neonatal_amr.csv
diff --git a/snapshots/antibiotics/2024-12-02/microbe_neonatal_amr.py b/snapshots/antibiotics/2024-12-02/microbe_neonatal_amr.py
new file mode 100644
index 00000000000..19af79c2455
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-02/microbe_neonatal_amr.py
@@ -0,0 +1,45 @@
+"""Script to create a snapshot of dataset.
+
+To access the data for this snapshot, follow these steps:
+
+- Go to https://vizhub.healthdata.org/microbe/
+- Click on Antimicrobial Resistance in the top tab
+- Select these options:
+  - Category: Pathogens
+  - Burden: By resistance
+  - Infectious syndrome: Bloodstream infections
+  - Location: Global
+  - Age: Neonatal
+  - Counterfactual: Both (or Attributable)
+  - Year: 2021 (or whatever latest year is available)
+  - Measure: Deaths
+  - Metric: Number
+
+Download the file and upload it using the script below.
+
+
+"""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+@click.option("--path-to-file", "-f", prompt=True, type=str, help="Path to local data file.")
+def main(path_to_file: str, upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/microbe_neonatal_amr.csv")
+
+    # Copy local data file to snapshots data folder, add file to DVC and upload to S3.
+    snap.create_snapshot(filename=path_to_file, upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/antibiotics/2024-12-02/total_pathogen_bloodstream.csv.dvc b/snapshots/antibiotics/2024-12-02/total_pathogen_bloodstream.csv.dvc
new file mode 100644
index 00000000000..26192a4daf4
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-02/total_pathogen_bloodstream.csv.dvc
@@ -0,0 +1,31 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Total deaths from bloodstream infections by pathogen
+    description: |-
+      The MICROBE (Measuring Infectious Causes and Resistance Outcomes for Burden Estimation) tool visualizes the fatal and nonfatal health outcomes of infections, pathogens, and antimicrobial resistance across different countries and regions. The tool shows a novel estimation method, [published in The Lancet](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)01867-1/fulltext), including the burden of infections and their underlying pathogens, as well as the burden of pathogens that are both susceptible and resistant to antibiotics. This tool is useful for understanding the burden of these outcomes, as well as illustrating how they nest together. The tabs explore different health outcomes by geography, age and sex. All tabs include a bar visualization for comparison, as well as a map view for a global perspective.
+    date_published: "2024-09-28"
+
+    # Citation
+    producer: Institute for Health Metrics and Evaluation (IHME); University of Oxford
+    citation_full: |-
+      Institute for Health Metrics and Evaluation (IHME), University of Oxford. MICROBE. Seattle, WA: IHME, University of Washington, 2024. Available from [https://vizhub.healthdata.org/microbe](https://vizhub.healthdata.org/microbe)
+    attribution_short: MICROBE
+
+    # Files
+    url_main: https://vizhub.healthdata.org/microbe/
+    date_accessed: 2024-12-02
+
+    # License
+    license:
+      name: IHME's Free-of-Charge Non-commercial User Agreement
+      url: https://www.healthdata.org/Data-tools-practices/data-practices/ihme-free-charge-non-commercial-user-agreement
+
+
+  is_public: false
+outs:
+  - md5: 04ae50c86998acf6143a1685f04068ae
+    size: 6452
+    path: total_pathogen_bloodstream.csv
diff --git a/snapshots/antibiotics/2024-12-02/total_pathogen_bloodstream.py b/snapshots/antibiotics/2024-12-02/total_pathogen_bloodstream.py
new file mode 100644
index 00000000000..564c409dd7c
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-02/total_pathogen_bloodstream.py
@@ -0,0 +1,43 @@
+"""Script to create a snapshot of dataset.
+
+To access the data for this snapshot, follow these steps:
+
+- Go to https://vizhub.healthdata.org/microbe/
+- Click on Pathogens in the top tab
+- Select these options:
+  - Infectious syndrome: Bloodstream infections
+  - Location: Global
+  - Age: All ages
+  - Sex: Both
+  - Year: 2021 (or whatever latest year is available)
+  - Measure: Deaths
+  - Metric: Number
+
+Download the file and upload it using the script below.
+
+
+"""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+@click.option("--path-to-file", "-f", prompt=True, type=str, help="Path to local data file.")
+def main(path_to_file: str, upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/total_pathogen_bloodstream.csv")
+
+    # Copy local data file to snapshots data folder, add file to DVC and upload to S3.
+    snap.create_snapshot(filename=path_to_file, upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.csv.dvc b/snapshots/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.csv.dvc
new file mode 100644
index 00000000000..8225476e843
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.csv.dvc
@@ -0,0 +1,31 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Total deaths from bloodstream infections by pathogen and resistance
+    description: |-
+      The MICROBE (Measuring Infectious Causes and Resistance Outcomes for Burden Estimation) tool visualizes the fatal and nonfatal health outcomes of infections, pathogens, and antimicrobial resistance across different countries and regions. The tool shows a novel estimation method, [published in The Lancet](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)01867-1/fulltext), including the burden of infections and their underlying pathogens, as well as the burden of pathogens that are both susceptible and resistant to antibiotics. This tool is useful for understanding the burden of these outcomes, as well as illustrating how they nest together. The tabs explore different health outcomes by geography, age and sex. All tabs include a bar visualization for comparison, as well as a map view for a global perspective.
+    date_published: "2024-09-28"
+
+    # Citation
+    producer: Institute for Health Metrics and Evaluation (IHME); University of Oxford
+    citation_full: |-
+      Institute for Health Metrics and Evaluation (IHME), University of Oxford. MICROBE. Seattle, WA: IHME, University of Washington, 2024. Available from [https://vizhub.healthdata.org/microbe](https://vizhub.healthdata.org/microbe)
+    attribution_short: MICROBE
+
+    # Files
+    url_main: https://vizhub.healthdata.org/microbe/
+    date_accessed: 2024-12-02
+
+    # License
+    license:
+      name: IHME's Free-of-Charge Non-commercial User Agreement
+      url: https://www.healthdata.org/Data-tools-practices/data-practices/ihme-free-charge-non-commercial-user-agreement
+
+
+  is_public: false
+outs:
+  - md5: d76d22fd3288eccd20d838712105d773
+    size: 3006
+    path: total_pathogen_bloodstream_amr.csv
diff --git a/snapshots/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.py b/snapshots/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.py
new file mode 100644
index 00000000000..c021512e369
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-02/total_pathogen_bloodstream_amr.py
@@ -0,0 +1,25 @@
+"""Script to create a snapshot of dataset."""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+@click.option("--path-to-file", "-f", prompt=True, type=str, help="Path to local data file.")
+def main(path_to_file: str, upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/total_pathogen_bloodstream_amr.csv")
+
+    # Copy local data file to snapshots data folder, add file to DVC and upload to S3.
+    snap.create_snapshot(filename=path_to_file, upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/antibiotics/2024-12-03/glass_enrolment.py b/snapshots/antibiotics/2024-12-03/glass_enrolment.py
new file mode 100644
index 00000000000..9c31f08a086
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-03/glass_enrolment.py
@@ -0,0 +1,30 @@
+"""Script to create a snapshot of dataset.
+
+The data should be available here: https://www.who.int/initiatives/glass/country-participation
+
+But if it is out of date (e.g. not in sync with the image on the page above), then contact glass@who.int to access the latest data.
+"""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+@click.option("--path-to-file", "-f", prompt=True, type=str, help="Path to local data file.")
+def main(path_to_file: str, upload: bool) -> None:
+    # Create a new snapshot.
+ snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/glass_enrolment.xlsx") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/antibiotics/2024-12-03/glass_enrolment.xlsx.dvc b/snapshots/antibiotics/2024-12-03/glass_enrolment.xlsx.dvc new file mode 100644 index 00000000000..a124b134985 --- /dev/null +++ b/snapshots/antibiotics/2024-12-03/glass_enrolment.xlsx.dvc @@ -0,0 +1,29 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: GLASS Country Participation + description: |- + The list of countries that are enrolled in the WHO's Global Antimicrobial Resistance and Use Surveillance System (GLASS). Countries can be enrolled in GLASS to collect and share data on antimicrobial consumption (AMC), and, or antimicrobial resistance (AMR) in line with the GLASS-AMR and GLASS-AMC methodologies, respectively. + date_published: "2024-12-03" + + # Citation + producer: World Health Organization + citation_full: |- + GLASS Country Participation (2024). Global Antimicrobial Resistance and Use Surveillance System (GLASS), World Health Organization. + attribution_short: WHO + + # Files + url_main: https://www.who.int/initiatives/glass/country-participation + date_accessed: 2024-12-03 + + # License + license: + name: © 2024 WHO + url: https://www.who.int/about/policies/terms-of-use + +outs: + - md5: d10cca0830352c8145c9feb4958b4120 + size: 88534 + path: glass_enrolment.xlsx diff --git a/snapshots/antibiotics/2024-12-04/microbe_total_pathogens.csv.dvc b/snapshots/antibiotics/2024-12-04/microbe_total_pathogens.csv.dvc new file mode 100644 index 00000000000..e904e5d6873 --- /dev/null +++ b/snapshots/antibiotics/2024-12-04/microbe_total_pathogens.csv.dvc @@ -0,0 +1,27 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Total deaths by pathogen + description: |- + The MICROBE (Measuring Infectious Causes and Resistance Outcomes for Burden Estimation) tool visualizes the fatal and nonfatal health outcomes of infections, pathogens, and antimicrobial resistance across different countries and regions. The tool shows a novel estimation method, [published in The Lancet](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)01867-1/fulltext), including the burden of infections and their underlying pathogens, as well as, the burden of pathogens that are both susceptible and resistant to antibiotics. This tool is useful for understanding the burden of these outcomes, as well as illustrating how they nest together. The tabs explore different health outcomes by geography, age and sex. All tabs include a bar visualization for comparison, as well as a map view for a global perspective. + date_published: "2024-09-28" + # Citation + producer: Institute for Health Metrics and Evaluation (IHME); University of Oxford + citation_full: |- + Institute for Health Metrics and Evaluation (IHME), University of Oxford. MICROBE. Seattle, WA: IHME, University of Washington, 2024. 
diff --git a/snapshots/antibiotics/2024-12-04/microbe_total_pathogens.csv.dvc b/snapshots/antibiotics/2024-12-04/microbe_total_pathogens.csv.dvc
new file mode 100644
index 00000000000..e904e5d6873
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-04/microbe_total_pathogens.csv.dvc
@@ -0,0 +1,27 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Total deaths by pathogen
+    description: |-
+      The MICROBE (Measuring Infectious Causes and Resistance Outcomes for Burden Estimation) tool visualizes the fatal and nonfatal health outcomes of infections, pathogens, and antimicrobial resistance across different countries and regions. The tool shows a novel estimation method, [published in The Lancet](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)01867-1/fulltext), including the burden of infections and their underlying pathogens, as well as the burden of pathogens that are both susceptible and resistant to antibiotics. This tool is useful for understanding the burden of these outcomes, as well as illustrating how they nest together. The tabs explore different health outcomes by geography, age and sex. All tabs include a bar visualization for comparison, as well as a map view for a global perspective.
+    date_published: "2024-09-28"
+    # Citation
+    producer: Institute for Health Metrics and Evaluation (IHME); University of Oxford
+    citation_full: |-
+      Institute for Health Metrics and Evaluation (IHME), University of Oxford. MICROBE. Seattle, WA: IHME, University of Washington, 2024. Available from [https://vizhub.healthdata.org/microbe](https://vizhub.healthdata.org/microbe)
+    attribution_short: MICROBE
+    # Files
+    url_main: https://vizhub.healthdata.org/microbe/
+    date_accessed: 2024-12-04
+
+    # License
+    license:
+      name: IHME's Free-of-Charge Non-commercial User Agreement
+      url: https://www.healthdata.org/Data-tools-practices/data-practices/ihme-free-charge-non-commercial-user-agreement
+  is_public: false
+outs:
+  - md5: bff21259d44b1ab5a61ecf675285d13b
+    size: 10960
+    path: microbe_total_pathogens.csv
diff --git a/snapshots/antibiotics/2024-12-04/microbe_total_pathogens.py b/snapshots/antibiotics/2024-12-04/microbe_total_pathogens.py
new file mode 100644
index 00000000000..754aaafe897
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-04/microbe_total_pathogens.py
@@ -0,0 +1,37 @@
+"""Script to create a snapshot of dataset.
+
+To download the data visit: https://vizhub.healthdata.org/microbe/
+
+- Select the 'Pathogens' tab.
+- Infectious syndrome: 'All infectious syndromes'
+- Location: 'Global'
+- Age: 'All ages'
+- Sex: 'Both'
+- Measure: 'Deaths'
+- Metric: 'Number'
+
+"""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+@click.option("--path-to-file", "-f", prompt=True, type=str, help="Path to local data file.")
+def main(path_to_file: str, upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/microbe_total_pathogens.csv")
+
+    # Download data from source, add file to DVC and upload to S3.
+    snap.create_snapshot(filename=path_to_file, upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/antibiotics/2024-12-04/microbe_total_pathogens_amr.csv.dvc b/snapshots/antibiotics/2024-12-04/microbe_total_pathogens_amr.csv.dvc
new file mode 100644
index 00000000000..77cd02a9cf8
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-04/microbe_total_pathogens_amr.csv.dvc
@@ -0,0 +1,31 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Total deaths by pathogen attributable to antimicrobial resistance
+    description: |-
+      The MICROBE (Measuring Infectious Causes and Resistance Outcomes for Burden Estimation) tool visualizes the fatal and nonfatal health outcomes of infections, pathogens, and antimicrobial resistance across different countries and regions. The tool shows a novel estimation method, [published in The Lancet](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)01867-1/fulltext), including the burden of infections and their underlying pathogens, as well as the burden of pathogens that are both susceptible and resistant to antibiotics. This tool is useful for understanding the burden of these outcomes, as well as illustrating how they nest together. The tabs explore different health outcomes by geography, age and sex. All tabs include a bar visualization for comparison, as well as a map view for a global perspective.
+    date_published: "2024-09-28"
+
+    # Citation
+    producer: Institute for Health Metrics and Evaluation (IHME); University of Oxford
+    citation_full: |-
+      Institute for Health Metrics and Evaluation (IHME), University of Oxford. MICROBE. Seattle, WA: IHME, University of Washington, 2024. Available from [https://vizhub.healthdata.org/microbe](https://vizhub.healthdata.org/microbe)
+    attribution_short: MICROBE
+
+    # Files
+    url_main: https://vizhub.healthdata.org/microbe/
+    date_accessed: 2024-12-04
+
+    # License
+    license:
+      name: IHME's Free-of-Charge Non-commercial User Agreement
+      url: https://www.healthdata.org/Data-tools-practices/data-practices/ihme-free-charge-non-commercial-user-agreement
+
+
+  is_public: false
+outs:
+  - md5: 95bd7ca4c721a4e5113fd54ee598dad3
+    size: 3989
+    path: microbe_total_pathogens_amr.csv
diff --git a/snapshots/antibiotics/2024-12-04/microbe_total_pathogens_amr.py b/snapshots/antibiotics/2024-12-04/microbe_total_pathogens_amr.py
new file mode 100644
index 00000000000..7b387e0ef02
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-04/microbe_total_pathogens_amr.py
@@ -0,0 +1,25 @@
+"""Script to create a snapshot of dataset."""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+@click.option("--path-to-file", "-f", prompt=True, type=str, help="Path to local data file.")
+def main(path_to_file: str, upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/microbe_total_pathogens_amr.csv")
+
+    # Copy local data file to snapshots data folder, add file to DVC and upload to S3.
+    snap.create_snapshot(filename=path_to_file, upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/antibiotics/2024-12-05/microbe_neonatal_total_amr.csv.dvc b/snapshots/antibiotics/2024-12-05/microbe_neonatal_total_amr.csv.dvc
new file mode 100644
index 00000000000..357627600e7
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-05/microbe_neonatal_total_amr.csv.dvc
@@ -0,0 +1,31 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Neonatal deaths by pathogen attributable to antimicrobial resistance
+    description: |-
+      The MICROBE (Measuring Infectious Causes and Resistance Outcomes for Burden Estimation) tool visualizes the fatal and nonfatal health outcomes of infections, pathogens, and antimicrobial resistance across different countries and regions. The tool shows a novel estimation method, [published in The Lancet](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)01867-1/fulltext), including the burden of infections and their underlying pathogens, as well as the burden of pathogens that are both susceptible and resistant to antibiotics. This tool is useful for understanding the burden of these outcomes, as well as illustrating how they nest together. The tabs explore different health outcomes by geography, age and sex. All tabs include a bar visualization for comparison, as well as a map view for a global perspective.
+    date_published: "2024-09-28"
+
+    # Citation
+    producer: Institute for Health Metrics and Evaluation (IHME); University of Oxford
+    citation_full: |-
+      Institute for Health Metrics and Evaluation (IHME), University of Oxford. MICROBE. Seattle, WA: IHME, University of Washington, 2024. Available from [https://vizhub.healthdata.org/microbe](https://vizhub.healthdata.org/microbe)
+    attribution_short: MICROBE
+
+    # Files
+    url_main: https://vizhub.healthdata.org/microbe/
+    date_accessed: 2024-12-05
+
+    # License
+    license:
+      name: IHME's Free-of-Charge Non-commercial User Agreement
+      url: https://www.healthdata.org/Data-tools-practices/data-practices/ihme-free-charge-non-commercial-user-agreement
+
+
+  is_public: false
+outs:
+  - md5: 992049355ab469d34378e2ae38ff2ef5
+    size: 3919
+    path: microbe_neonatal_total_amr.csv
diff --git a/snapshots/antibiotics/2024-12-05/microbe_neonatal_total_amr.py b/snapshots/antibiotics/2024-12-05/microbe_neonatal_total_amr.py
new file mode 100644
index 00000000000..212a28a0f33
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-05/microbe_neonatal_total_amr.py
@@ -0,0 +1,36 @@
+"""Script to create a snapshot of dataset.
+
+To download the data visit: https://vizhub.healthdata.org/microbe/
+
+- Select the 'Antimicrobial resistance' tab.
+- Category: 'Pathogens'
+- Location: 'Global'
+- Age: 'Neonatal'
+- Counterfactual: 'Attributable'
+- Measure: 'Deaths'
+- Metric: 'Number'
+
+"""
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+@click.option("--path-to-file", "-f", prompt=True, type=str, help="Path to local data file.")
+def main(path_to_file: str, upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/microbe_neonatal_total_amr.csv")
+
+    # Copy local data file to snapshots data folder, add file to DVC and upload to S3.
+    snap.create_snapshot(filename=path_to_file, upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.csv.dvc b/snapshots/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.csv.dvc
new file mode 100644
index 00000000000..bb61d4c9241
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.csv.dvc
@@ -0,0 +1,28 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Total deaths by syndrome
+    description: |-
+      The MICROBE (Measuring Infectious Causes and Resistance Outcomes for Burden Estimation) tool visualizes the fatal and nonfatal health outcomes of infections, pathogens, and antimicrobial resistance across different countries and regions. The tool shows a novel estimation method, [published in The Lancet](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)01867-1/fulltext), including the burden of infections and their underlying pathogens, as well as the burden of pathogens that are both susceptible and resistant to antibiotics. This tool is useful for understanding the burden of these outcomes, as well as illustrating how they nest together. The tabs explore different health outcomes by geography, age and sex. All tabs include a bar visualization for comparison, as well as a map view for a global perspective.
+    date_published: "2024-09-28"
+    # Citation
+    producer: Institute for Health Metrics and Evaluation (IHME); University of Oxford
+    citation_full: |-
+      Institute for Health Metrics and Evaluation (IHME), University of Oxford. MICROBE. Seattle, WA: IHME, University of Washington, 2024. Available from [https://vizhub.healthdata.org/microbe](https://vizhub.healthdata.org/microbe)
+    attribution_short: MICROBE
+    # Files
+    url_main: https://vizhub.healthdata.org/microbe/
+    date_accessed: 2024-12-05
+
+    # License
+    license:
+      name: IHME's Free-of-Charge Non-commercial User Agreement
+      url: https://www.healthdata.org/Data-tools-practices/data-practices/ihme-free-charge-non-commercial-user-agreement
+
+  is_public: false
+outs:
+  - md5: d494bc6e6c0a61cf05d3d9fb03a2f0b5
+    size: 3454
+    path: microbe_total_deaths_by_syndrome.csv
diff --git a/snapshots/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.py b/snapshots/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.py
new file mode 100644
index 00000000000..53098bc97eb
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome.py
@@ -0,0 +1,36 @@
+"""Script to create a snapshot of dataset.
+
+To download the data visit: https://vizhub.healthdata.org/microbe/
+
+- Select the 'Antimicrobial resistance' tab.
+- Category: 'Pathogens'
+- Location: 'Global'
+- Age: 'Neonatal'
+- Counterfactual: 'Attributable'
+- Measure: 'Deaths'
+- Metric: 'Number'
+
+"""
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+@click.option("--path-to-file", "-f", prompt=True, type=str, help="Path to local data file.")
+def main(path_to_file: str, upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/microbe_total_deaths_by_syndrome.csv")
+
+    # Copy local data file to snapshots data folder, add file to DVC and upload to S3.
+    snap.create_snapshot(filename=path_to_file, upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.csv.dvc b/snapshots/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.csv.dvc
new file mode 100644
index 00000000000..f845b6df7bd
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.csv.dvc
@@ -0,0 +1,28 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Total deaths by syndrome and antimicrobial resistance
+    description: |-
+      The MICROBE (Measuring Infectious Causes and Resistance Outcomes for Burden Estimation) tool visualizes the fatal and nonfatal health outcomes of infections, pathogens, and antimicrobial resistance across different countries and regions. The tool shows a novel estimation method, [published in The Lancet](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)01867-1/fulltext), including the burden of infections and their underlying pathogens, as well as the burden of pathogens that are both susceptible and resistant to antibiotics. This tool is useful for understanding the burden of these outcomes, as well as illustrating how they nest together. The tabs explore different health outcomes by geography, age and sex. All tabs include a bar visualization for comparison, as well as a map view for a global perspective.
+    date_published: "2024-09-28"
+    # Citation
+    producer: Institute for Health Metrics and Evaluation (IHME); University of Oxford
+    citation_full: |-
+      Institute for Health Metrics and Evaluation (IHME), University of Oxford. MICROBE. Seattle, WA: IHME, University of Washington, 2024. Available from [https://vizhub.healthdata.org/microbe](https://vizhub.healthdata.org/microbe)
+    attribution_short: MICROBE
+    # Files
+    url_main: https://vizhub.healthdata.org/microbe/
+    date_accessed: 2024-12-05
+
+    # License
+    license:
+      name: IHME's Free-of-Charge Non-commercial User Agreement
+      url: https://www.healthdata.org/Data-tools-practices/data-practices/ihme-free-charge-non-commercial-user-agreement
+
+  is_public: false
+outs:
+  - md5: fe3b1da5275f0a31e882fab841b32ee4
+    size: 1860
+    path: microbe_total_deaths_by_syndrome_amr.csv
diff --git a/snapshots/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.py b/snapshots/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.py
new file mode 100644
index 00000000000..689613aeba5
--- /dev/null
+++ b/snapshots/antibiotics/2024-12-05/microbe_total_deaths_by_syndrome_amr.py
@@ -0,0 +1,36 @@
+"""Script to create a snapshot of dataset.
+
+To download the data visit: https://vizhub.healthdata.org/microbe/
+
+- Select the 'Antimicrobial resistance' tab.
+- Category: 'Pathogens'
+- Location: 'Global'
+- Age: 'Neonatal'
+- Counterfactual: 'Attributable'
+- Measure: 'Deaths'
+- Metric: 'Number'
+
+"""
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+@click.option("--path-to-file", "-f", prompt=True, type=str, help="Path to local data file.")
+def main(path_to_file: str, upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"antibiotics/{SNAPSHOT_VERSION}/microbe_total_deaths_by_syndrome_amr.csv")
+
+    # Copy local data file to snapshots data folder, add file to DVC and upload to S3.
+    snap.create_snapshot(filename=path_to_file, upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/artificial_intelligence/2024-02-15/epoch_llms.csv.dvc b/snapshots/artificial_intelligence/2024-02-15/epoch_llms.csv.dvc
index 30d76738bb8..6c156a1470b 100644
--- a/snapshots/artificial_intelligence/2024-02-15/epoch_llms.csv.dvc
+++ b/snapshots/artificial_intelligence/2024-02-15/epoch_llms.csv.dvc
@@ -4,13 +4,13 @@ meta:
     description: Epoch dataset on how performance on a MMLU language benchmark scales with computational resources.
     producer: Epoch
     citation_full: |-
-      Owen, David. (2023). Large Language Model performance and compute, Epoch (2023) [Data set]. In Extrapolating performance in language modeling benchmarks. Published online at epochai.org. Retrieved from: 'https://epochai.org/blog/extrapolating-performance-in-language-modelling-benchmarks' [online resource].
-    url_main: https://epochai.org/blog/extrapolating-performance-in-language-modelling-benchmarks
+      Owen, David. (2023). Large Language Model performance and compute, Epoch (2023) [Data set]. In Extrapolating performance in language modeling benchmarks. Published online at epoch.ai. Retrieved from: 'https://epoch.ai/blog/extrapolating-performance-in-language-modelling-benchmarks' [online resource].
+ url_main: https://epoch.ai/blog/extrapolating-performance-in-language-modelling-benchmarks date_accessed: 2024-02-15 date_published: 2023-07-12 license: name: Creative Commons BY 4.0 - url: https://epochai.org/blog/extrapolating-performance-in-language-modelling-benchmarks + url: https://epoch.ai/blog/extrapolating-performance-in-language-modelling-benchmarks wdir: ../../../data/snapshots/artificial_intelligence/2024-02-15 outs: - md5: 134cb73f28be470ee6566d86a19812e4 diff --git a/snapshots/artificial_intelligence/2024-06-06/epoch_compute_cost.csv.dvc b/snapshots/artificial_intelligence/2024-06-06/epoch_compute_cost.csv.dvc index 97b31a496e2..fcd1e33cdec 100644 --- a/snapshots/artificial_intelligence/2024-06-06/epoch_compute_cost.csv.dvc +++ b/snapshots/artificial_intelligence/2024-06-06/epoch_compute_cost.csv.dvc @@ -17,7 +17,7 @@ meta: Ben Cottier, Robi Rahman, Loredana Fattorini, Nestor Maslej, and David Owen. ‘The rising costs of training frontier AI models’. ArXiv [cs.CY], 2024. arXiv. https://arxiv.org/abs/2405.21015. # Files - url_main: https://epochai.org/blog/how-much-does-it-cost-to-train-frontier-ai-models + url_main: https://epoch.ai/blog/how-much-does-it-cost-to-train-frontier-ai-models date_accessed: 2024-06-06 # License diff --git a/snapshots/artificial_intelligence/2024-09-09/epoch.csv.dvc b/snapshots/artificial_intelligence/2024-09-09/epoch.csv.dvc index 2aac9cfe012..824f09ceab4 100644 --- a/snapshots/artificial_intelligence/2024-09-09/epoch.csv.dvc +++ b/snapshots/artificial_intelligence/2024-09-09/epoch.csv.dvc @@ -22,10 +22,10 @@ meta: The authors note that: "For new models (from 2020 onward) it is harder to assess these criteria, so we fall back to a subjective selection. We refer to models meeting our selection criteria as 'milestone models." # Citation producer: Epoch - citation_full: "Epoch AI, ‘Parameter, Compute and Data Trends in Machine Learning’. Published online at epochai.org. Retrieved from: ‘https://epochai.org/data/epochdb/visualization’ [online resource]" + citation_full: "Epoch AI, ‘Parameter, Compute and Data Trends in Machine Learning’. Published online at epochai.org. Retrieved from: ‘https://epoch.ai/data/epochdb/visualization’ [online resource]" # Files - url_main: https://epochai.org/mlinputs/visualization - url_download: https://epochai.org/data/epochdb/notable_ai_models.csv + url_main: https://epoch.ai/mlinputs/visualization + url_download: https://epoch.ai/data/epochdb/notable_ai_models.csv date_accessed: 2024-09-09 # License license: diff --git a/snapshots/artificial_intelligence/2024-10-01/epoch.csv.dvc b/snapshots/artificial_intelligence/2024-10-01/epoch.csv.dvc index a1e3a3dd8bd..4c1a7391057 100644 --- a/snapshots/artificial_intelligence/2024-10-01/epoch.csv.dvc +++ b/snapshots/artificial_intelligence/2024-10-01/epoch.csv.dvc @@ -22,10 +22,10 @@ meta: The authors note that: "For new models (from 2020 onward) it is harder to assess these criteria, so we fall back to a subjective selection. We refer to models meeting our selection criteria as 'milestone models." # Citation producer: Epoch - citation_full: "Epoch AI, ‘Parameter, Compute and Data Trends in Machine Learning’. Published online at epochai.org. Retrieved from: ‘https://epochai.org/data/epochdb/visualization’ [online resource]" + citation_full: "Epoch AI, ‘Parameter, Compute and Data Trends in Machine Learning’. Published online at epoch.ai. 
Retrieved from: ‘https://epoch.ai/data/epochdb/visualization’ [online resource]"
   # Files
-  url_main: https://epochai.org/mlinputs/visualization
-  url_download: https://epochai.org/data/epochdb/notable_ai_models.csv
+  url_main: https://epoch.ai/mlinputs/visualization
+  url_download: https://epoch.ai/data/epochdb/notable_ai_models.csv
   date_accessed: 2024-10-01
   # License
   license:
diff --git a/snapshots/artificial_intelligence/2024-10-01/epoch_compute_intensive.csv.dvc b/snapshots/artificial_intelligence/2024-10-01/epoch_compute_intensive.csv.dvc
index bfff51b4985..96683f165d0 100644
--- a/snapshots/artificial_intelligence/2024-10-01/epoch_compute_intensive.csv.dvc
+++ b/snapshots/artificial_intelligence/2024-10-01/epoch_compute_intensive.csv.dvc
@@ -15,17 +15,17 @@ meta:
   # Citation
   producer: Epoch
   citation_full: |-
-    Robi Rahman, David Owen and Josh You (2024), "Tracking Compute-Intensive AI Models". Published online at epochai.org. Retrieved from: 'https://epochai.org/blog/tracking-compute-intensive-ai-models' [online resource]
+    Robi Rahman, David Owen and Josh You (2024), "Tracking Compute-Intensive AI Models". Published online at epoch.ai. Retrieved from: 'https://epoch.ai/blog/tracking-compute-intensive-ai-models' [online resource]

   # Files
-  url_main: https://epochai.org/blog/tracking-compute-intensive-ai-models
-  url_download: https://epochai.org/data/epochdb/large_scale_ai_models.csv
+  url_main: https://epoch.ai/blog/tracking-compute-intensive-ai-models
+  url_download: https://epoch.ai/data/epochdb/large_scale_ai_models.csv
   date_accessed: 2024-10-01

   # License
   license:
     name: CC BY 4.0
-    url: https://epochai.org/blog/how-much-does-it-cost-to-train-frontier-ai-models
+    url: https://epoch.ai/blog/how-much-does-it-cost-to-train-frontier-ai-models
 outs:
   - md5: fc0daab615a6057ff1c9a0df94d757c7
     size: 441995
diff --git a/snapshots/artificial_intelligence/2024-11-03/epoch.csv.dvc b/snapshots/artificial_intelligence/2024-11-03/epoch.csv.dvc
new file mode 100644
index 00000000000..83520f373f6
--- /dev/null
+++ b/snapshots/artificial_intelligence/2024-11-03/epoch.csv.dvc
@@ -0,0 +1,38 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Parameter, Compute and Data Trends in Machine Learning
+    date_published: 2024-07-19
+    description_snapshot: |
+      We update this chart with the latest available data from our source every month.
+
+      The authors selected the AI systems for inclusion based on the following necessary criteria:
+      — Have an explicit learning component
+      — Showcase experimental results
+      — Advance the state of the art
+
+      In addition, the systems had to meet at least one of the following notability criteria:
+      — Paper has more than 1000 citations
+      — Historical importance
+      — Important state-of-the-art advance
+      — Deployed in a notable context
+
+      The authors note that: "For new models (from 2020 onward) it is harder to assess these criteria, so we fall back to a subjective selection. We refer to models meeting our selection criteria as 'milestone models'."
+    # Citation
+    producer: Epoch
+    citation_full: "Epoch AI, ‘Parameter, Compute and Data Trends in Machine Learning’. Published online at epochai.org. 
Retrieved from: ‘https://epoch.ai/data/epochdb/visualization’ [online resource]" + # Files + url_main: https://epoch.ai/mlinputs/visualization + url_download: https://epoch.ai/data/epochdb/notable_ai_models.csv + date_accessed: 2024-11-03 + # License + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + +outs: + - md5: cf2671ca050fea8d7c990376221a2a5a + size: 1580212 + path: epoch.csv diff --git a/snapshots/artificial_intelligence/2024-11-03/epoch.py b/snapshots/artificial_intelligence/2024-11-03/epoch.py new file mode 100644 index 00000000000..daa355e267f --- /dev/null +++ b/snapshots/artificial_intelligence/2024-11-03/epoch.py @@ -0,0 +1,33 @@ +"""Script to create a snapshot of dataset 'Parameter, Compute and Data Trends in Machine Learning (Epoch, 2023)'.""" + + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"artificial_intelligence/{SNAPSHOT_VERSION}/epoch.csv") + + # Download data from source. + snap.download_from_source() + + # Add file to DVC and upload to S3. + snap.dvc_add(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/artificial_intelligence/2024-11-03/epoch_compute_intensive.csv.dvc b/snapshots/artificial_intelligence/2024-11-03/epoch_compute_intensive.csv.dvc new file mode 100644 index 00000000000..76ea21b5f6b --- /dev/null +++ b/snapshots/artificial_intelligence/2024-11-03/epoch_compute_intensive.csv.dvc @@ -0,0 +1,32 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Tracking Compute-Intensive AI Models + description: |- + A dataset that tracks compute-intensive AI models, with training compute over 10²³ floating point operations (FLOP). This corresponds to training costs of hundreds of thousands of dollars or more.  + + To identify compute-intensive AI models, the team at Epoch AI used various resources, estimating compute when not directly reported. They included benchmarks and repositories, such as Papers With Code and Hugging Face, to find models exceeding 10²³ FLOP. They also explored non-English media and specific leaderboards, particularly focusing on Chinese sources. + + Additionally, they examined blog posts, press releases from major labs, and scholarly literature to track new models. A separate table was created for models with unconfirmed but plausible compute levels. Despite thorough methods, proprietary and secretive models may have been missed. + date_published: "2024-06-19" + + # Citation + producer: Epoch + citation_full: |- + Robi Rahman, David Owen and Josh You (2024), "Tracking Compute-Intensive AI Models". Published online at epochai.org. 
Retrieved from: 'https://epoch.ai/blog/tracking-compute-intensive-ai-models' [online resource]
+
+    # Files
+    url_main: https://epoch.ai/blog/tracking-compute-intensive-ai-models
+    url_download: https://epoch.ai/data/epochdb/large_scale_ai_models.csv
+    date_accessed: 2024-11-03
+
+    # License
+    license:
+      name: CC BY 4.0
+      url: https://epoch.ai/blog/how-much-does-it-cost-to-train-frontier-ai-models
+outs:
+  - md5: 0f76cc6dafea7cce2443a1844343bb49
+    size: 449603
+    path: epoch_compute_intensive.csv
diff --git a/snapshots/artificial_intelligence/2024-11-03/epoch_compute_intensive.py b/snapshots/artificial_intelligence/2024-11-03/epoch_compute_intensive.py
new file mode 100644
index 00000000000..fdbd7822e4a
--- /dev/null
+++ b/snapshots/artificial_intelligence/2024-11-03/epoch_compute_intensive.py
@@ -0,0 +1,24 @@
+"""Script to create a snapshot of dataset."""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+def main(upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"artificial_intelligence/{SNAPSHOT_VERSION}/epoch_compute_intensive.csv")
+
+    # Download data from source, add file to DVC and upload to S3.
+    snap.create_snapshot(upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/artificial_intelligence/2024-12-05/epoch.csv.dvc b/snapshots/artificial_intelligence/2024-12-05/epoch.csv.dvc
new file mode 100644
index 00000000000..2ba13908db0
--- /dev/null
+++ b/snapshots/artificial_intelligence/2024-12-05/epoch.csv.dvc
@@ -0,0 +1,38 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Parameter, Compute and Data Trends in Machine Learning
+    date_published: 2024-07-19
+    description_snapshot: |
+      We update this chart with the latest available data from our source every month.
+
+      The authors selected the AI systems for inclusion based on the following necessary criteria:
+      — Have an explicit learning component
+      — Showcase experimental results
+      — Advance the state of the art
+
+      In addition, the systems had to meet at least one of the following notability criteria:
+      — Paper has more than 1000 citations
+      — Historical importance
+      — Important state-of-the-art advance
+      — Deployed in a notable context
+
+      The authors note that: "For new models (from 2020 onward) it is harder to assess these criteria, so we fall back to a subjective selection. We refer to models meeting our selection criteria as 'milestone models'."
+    # Citation
+    producer: Epoch
+    citation_full: "Epoch AI, ‘Parameter, Compute and Data Trends in Machine Learning’. Published online at epochai.org. 
Retrieved from: ‘https://epoch.ai/data/epochdb/visualization’ [online resource]" + # Files + url_main: https://epoch.ai/mlinputs/visualization + url_download: https://epoch.ai/data/epochdb/notable_ai_models.csv + date_accessed: 2024-12-05 + # License + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + +outs: + - md5: 98750b0d23c2f5e11b766e0849432fb3 + size: 1600590 + path: epoch.csv diff --git a/snapshots/artificial_intelligence/2024-12-05/epoch.py b/snapshots/artificial_intelligence/2024-12-05/epoch.py new file mode 100644 index 00000000000..daa355e267f --- /dev/null +++ b/snapshots/artificial_intelligence/2024-12-05/epoch.py @@ -0,0 +1,33 @@ +"""Script to create a snapshot of dataset 'Parameter, Compute and Data Trends in Machine Learning (Epoch, 2023)'.""" + + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"artificial_intelligence/{SNAPSHOT_VERSION}/epoch.csv") + + # Download data from source. + snap.download_from_source() + + # Add file to DVC and upload to S3. + snap.dvc_add(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/artificial_intelligence/2024-12-05/epoch_compute_intensive.csv.dvc b/snapshots/artificial_intelligence/2024-12-05/epoch_compute_intensive.csv.dvc new file mode 100644 index 00000000000..1850e7f75b3 --- /dev/null +++ b/snapshots/artificial_intelligence/2024-12-05/epoch_compute_intensive.csv.dvc @@ -0,0 +1,33 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Tracking Compute-Intensive AI Models + description: |- + A dataset that tracks compute-intensive AI models, with training compute over 10²³ floating point operations (FLOP). This corresponds to training costs of hundreds of thousands of dollars or more.  + + To identify compute-intensive AI models, the team at Epoch AI used various resources, estimating compute when not directly reported. They included benchmarks and repositories, such as Papers With Code and Hugging Face, to find models exceeding 10²³ FLOP. They also explored non-English media and specific leaderboards, particularly focusing on Chinese sources. + + Additionally, they examined blog posts, press releases from major labs, and scholarly literature to track new models. A separate table was created for models with unconfirmed but plausible compute levels. Despite thorough methods, proprietary and secretive models may have been missed. + date_published: "2024-06-19" + + # Citation + producer: Epoch + citation_full: |- + Robi Rahman, David Owen and Josh You (2024), "Tracking Compute-Intensive AI Models". Published online at epochai.org. 
Retrieved from: 'https://epoch.ai/blog/tracking-compute-intensive-ai-models' [online resource]
+
+    # Files
+    url_main: https://epoch.ai/blog/tracking-compute-intensive-ai-models
+    url_download: https://epoch.ai/data/epochdb/large_scale_ai_models.csv
+    date_accessed: 2024-12-05
+
+    # License
+    license:
+      name: CC BY 4.0
+      url: https://epoch.ai/blog/how-much-does-it-cost-to-train-frontier-ai-models
+
+outs:
+  - md5: c52df75e59048128dc8288a0467f3f4c
+    size: 484868
+    path: epoch_compute_intensive.csv
diff --git a/snapshots/artificial_intelligence/2024-12-05/epoch_compute_intensive.py b/snapshots/artificial_intelligence/2024-12-05/epoch_compute_intensive.py
new file mode 100644
index 00000000000..fdbd7822e4a
--- /dev/null
+++ b/snapshots/artificial_intelligence/2024-12-05/epoch_compute_intensive.py
@@ -0,0 +1,24 @@
+"""Script to create a snapshot of dataset."""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+def main(upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"artificial_intelligence/{SNAPSHOT_VERSION}/epoch_compute_intensive.csv")
+
+    # Download data from source, add file to DVC and upload to S3.
+    snap.create_snapshot(upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/climate/2024-11-05/surface_temperature.py b/snapshots/climate/2024-11-05/surface_temperature.py
new file mode 100644
index 00000000000..bdd2c980630
--- /dev/null
+++ b/snapshots/climate/2024-11-05/surface_temperature.py
@@ -0,0 +1,56 @@
+"""Script to create a snapshot of the monthly averaged surface temperature data from 1940 to present from the Copernicus Climate Change Service.
+
+The script assumes that the data is available on the CDS API.
+Instructions on how to access the API on a Mac are here: https://confluence.ecmwf.int/display/CKB/How+to+install+and+use+CDS+API+on+macOS
+
+More information on how to access the data is here: https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels-monthly-means?tab=overview
+
+The data is downloaded as a NetCDF file. Tutorials on using the Copernicus API and working with the NetCDF format are here: https://ecmwf-projects.github.io/copernicus-training-c3s/cds-tutorial.html
+"""
+
+import tempfile
+from pathlib import Path
+
+# CDS API
+import cdsapi
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+def main(upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"climate/{SNAPSHOT_VERSION}/surface_temperature.zip")
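+
+    # Note: cdsapi.Client() below reads credentials from ~/.cdsapirc by default; an API key can
+    # also be passed explicitly, e.g. cdsapi.Client(url="https://cds.climate.copernicus.eu/api", key="<API-KEY>").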
+    # Save data as a compressed temporary file.
+    with tempfile.TemporaryDirectory() as temp_dir:
+        output_file = Path(temp_dir) / "era5_monthly_t2m_eur.nc"
+
+        client = cdsapi.Client()
+
+        dataset = "reanalysis-era5-single-levels-monthly-means"
+        request = {
+            "product_type": ["monthly_averaged_reanalysis"],
+            "variable": ["2m_temperature"],
+            "year": [str(year) for year in range(1940, 2025)],
+            "month": ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"],
+            "time": "00:00",
+            "area": [90, -180, -90, 180],
+            "data_format": "netcdf",
+            "download_format": "zip",
+        }
+
+        client.retrieve(dataset, request, output_file)
+
+    # Upload snapshot.
+    snap.create_snapshot(filename=output_file, upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/climate/2024-11-05/surface_temperature.zip.dvc b/snapshots/climate/2024-11-05/surface_temperature.zip.dvc
new file mode 100644
index 00000000000..8c6b1acf9dc
--- /dev/null
+++ b/snapshots/climate/2024-11-05/surface_temperature.zip.dvc
@@ -0,0 +1,27 @@
+meta:
+  origin:
+    title_snapshot: ERA5 Monthly Averaged Data on Single Levels from 1940 to Present - Monthly Averages of 2m Surface Temperature
+    title: ERA5 monthly averaged data on single levels from 1940 to present
+    description: |-
+      ERA5 is the latest climate reanalysis produced by ECMWF, providing hourly data on many atmospheric, land-surface and sea-state parameters together with estimates of uncertainty.
+
+      ERA5 data are available in the Climate Data Store on regular latitude-longitude grids at 0.25° x 0.25° resolution, with atmospheric parameters on 37 pressure levels.
+
+      ERA5 is available from 1940 and continues to be extended forward in time, with daily updates being made available 5 days behind real time.
+
+      Initial release data, i.e., data no more than three months behind real time, are called ERA5T.
+    producer: Contains modified Copernicus Climate Change Service information
+    version_producer: 2
+    citation_full: |-
+      Hersbach, H., Bell, B., Berrisford, P., Biavati, G., Horányi, A., Muñoz Sabater, J., Nicolas, J., Peubey, C., Radu, R., Rozum, I., Schepers, D., Simmons, A., Soci, C., Dee, D., Thépaut, J-N. (2023): ERA5 monthly averaged data on single levels from 1940 to present. Copernicus Climate Change Service (C3S) Climate Data Store (CDS), DOI: 10.24381/cds.f17050d7 (Accessed on 13-October-2024)
+    url_main: https://cds.climate.copernicus.eu/datasets/reanalysis-era5-single-levels-monthly-means?tab=overview
+    date_accessed: 2024-10-13
+    date_published: 2019-04-18
+    license:
+      name: Copernicus License
+      url: https://cds.climate.copernicus.eu/datasets/reanalysis-era5-single-levels-monthly-means?tab=overview
+
+outs:
+  - md5: 99968ec229bc19cc78393358805a9e25
+    size: 1396487077
+    path: surface_temperature.zip
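The snapshot stores the raw CDS payload as a zip. A minimal sketch for inspecting it locally, assuming xarray and netCDF4 are installed (the inner file name varies between CDS retrievals and must be checked after extraction; 't2m' is the usual ERA5 short name for 2m temperature):

    import zipfile

    import xarray as xr

    # Unzip the downloaded snapshot payload (path is illustrative).
    with zipfile.ZipFile("surface_temperature.zip") as zf:
        print(zf.namelist())  # find the actual .nc file name inside
        zf.extractall("era5_data")

    # Open the extracted NetCDF file; replace the name with one from namelist() above.
    ds = xr.open_dataset("era5_data/data.nc")
    print(ds["t2m"])  # dimensions: time x latitude x longitude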
diff --git a/snapshots/climate/2024-11-18/ch4_concentration_monthly.csv.dvc b/snapshots/climate/2024-11-18/ch4_concentration_monthly.csv.dvc
new file mode 100644
index 00000000000..b113298a31a
--- /dev/null
+++ b/snapshots/climate/2024-11-18/ch4_concentration_monthly.csv.dvc
@@ -0,0 +1,23 @@
+meta:
+  origin:
+    producer: NOAA Global Monitoring Laboratory
+    title: Trends in Atmospheric Methane
+    description: |-
+      The Carbon Cycle Greenhouse Gases (CCGG) research area operates the Global Greenhouse Gas Reference Network, measuring the atmospheric distribution and trends of the three main long-term drivers of climate change, carbon dioxide (CO2), methane (CH4), and nitrous oxide (N2O), as well as carbon monoxide (CO), which is an important indicator of air pollution.
+    citation_full: |-
+      National Oceanic and Atmospheric Administration (NOAA) Global Monitoring Laboratory, Boulder, Colorado, USA (https://gml.noaa.gov) - Trends in Atmospheric Methane.
+
+      Lan, X., K.W. Thoning, and E.J. Dlugokencky: Trends in globally-averaged CH4, N2O, and SF6 determined from NOAA Global Monitoring Laboratory measurements. https://doi.org/10.15138/P8XG-AA10
+    attribution: NOAA Global Monitoring Laboratory - Trends in Atmospheric Methane (2024)
+    attribution_short: NOAA/GML
+    url_main: https://gml.noaa.gov/ccgg/trends_ch4/
+    url_download: https://gml.noaa.gov/webdata/ccgg/trends/ch4/ch4_mm_gl.csv
+    date_accessed: '2024-11-18'
+    date_published: '2024-11-05'
+    license:
+      name: CC BY 4.0
+      url: https://gml.noaa.gov/about/disclaimer.html
+outs:
+  - md5: ba9f906eedfb08c828b5cd797c96af40
+    size: 23035
+    path: ch4_concentration_monthly.csv
diff --git a/snapshots/climate/2024-11-18/climate_change_impacts.py b/snapshots/climate/2024-11-18/climate_change_impacts.py
new file mode 100644
index 00000000000..f3aa5e5f526
--- /dev/null
+++ b/snapshots/climate/2024-11-18/climate_change_impacts.py
@@ -0,0 +1,216 @@
+"""Script to create a snapshot for each of the climate change datasets that have regular updates.

+The publication date will be automatically extracted from the source website, if possible, and otherwise it will be
+assumed to be the same as the access date. These dates will be written to the metadata dvc files.
+
+NOTE: If any of the snapshots fails, first try to fix the issue. But, if that's not possible (e.g. because the data provider server is down, which happens relatively often), follow these steps:
+1. Remove the new .dvc file of that failing snapshot.
+2. Edit the climate.yml dag file, so that the new affected meadow steps use the latest working snapshot.
+3. Comment out the file names of the failing snapshots in the "FILES" list below.
+4. Execute this script.
+   * If another snapshot fails, go back to step 1.
+5. Uncomment the file names of the failing snapshots (so that on next update all snapshots will be executed).
+6. Commit the changes in the dag.
+
+If a certain snapshot has been failing multiple times (which you can see by looking at the date of the latest working snapshot), consider changing the data provider.
+
+"""
+
+import re
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+import click
+import requests
+from bs4 import BeautifulSoup
+from structlog import get_logger
+
+from etl.snapshot import Snapshot
+
+log = get_logger()
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+# Names of data files.
+FILES = [
+    # NASA Goddard Institute for Space Studies - GISS Surface Temperature Analysis.
+    # NOTE: Publication date cannot be automatically extracted.
+    "surface_temperature_analysis_world.csv",
+    "surface_temperature_analysis_northern_hemisphere.csv",
+    "surface_temperature_analysis_southern_hemisphere.csv",
+    # National Snow and Ice Data Center - Sea Ice Index.
+    "sea_ice_index.xlsx",
+    # Met Office Hadley Centre - HadSST.
+    "sea_surface_temperature_world.csv",
+    "sea_surface_temperature_northern_hemisphere.csv",
+    "sea_surface_temperature_southern_hemisphere.csv",
+    # NOAA National Centers for Environmental Information - Ocean Heat Content.
+    # NOTE: Publication date cannot be automatically extracted.
+ "ocean_heat_content_monthly_world_700m.csv", + "ocean_heat_content_monthly_world_2000m.csv", + "ocean_heat_content_annual_world_700m.csv", + "ocean_heat_content_annual_world_2000m.csv", + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series. + "hawaii_ocean_time_series.csv", + # Rutgers University Global Snow Lab - Snow Cover Extent. + # NOTE: Publication date cannot be automatically extracted. But they seem to have regular updates (even daily). + "snow_cover_extent_north_america.csv", + "snow_cover_extent_northern_hemisphere.csv", + # NOAA Global Monitoring Laboratory. + "co2_concentration_monthly.csv", + "ch4_concentration_monthly.csv", + "n2o_concentration_monthly.csv", +] + +######################################################################################################################## +# Other possible datasets to include: +# * Ocean heat content data from MRI/JMA. We have this data as part of the EPA ocean heat content compilation. +# But in the following link, they claim the data is updated every year, so it could be added to our yearly data. +# https://www.data.jma.go.jp/gmd/kaiyou/english/ohc/ohc_global_en.html +# * Rutgers University Global Snow Lab also includes snow cover extent for: +# * Eurasia: https://climate.rutgers.edu/snowcover/files/moncov.eurasia.txt +# * North America (excluding Greenland): https://climate.rutgers.edu/snowcover/files/moncov.nam.txt +# * Ice sheet mass balance from NASA EarthData. This is regularly updated, but to access it one has to manually log in. +# The data can be manually accessed from: +# https://climate.nasa.gov/vital-signs/ice-sheets/ +# By clicking on the HTTP link. This leads to a manual log in page. +# Once logged in, the data is accessible via the following link: +# https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/ANTARCTICA_MASS_TELLUS_MASCON_CRI_TIME_SERIES_RL06.1_V3/antarctica_mass_200204_202310.txt +# So, one could use this link, trying with different dates (e.g. ..._202401.txt, ..._202312.txt, ..._202311.txt), +# until the most recent file is downloaded. +# I contacted EarthData to ask if there is any way to access the latest data programmatically. +# * Global sea level from NASA. +# We could get more up-to-date data on sea levels from https://sealevel.jpl.nasa.gov/ +# but we would need to use a special library with credentials to fetch the data (and the baseline and format would +# probably be different). +######################################################################################################################## + + +def find_date_published(snap: Snapshot) -> Optional[str]: + # Extract publication date for each individual origin, if possible. + # Otherwise, assign the current access date as publication date. + if snap.path.name == "sea_ice_index.xlsx": + # * For sea_ice_index, the date_published can be found on: + # https://noaadata.apps.nsidc.org/NOAA/G02135/seaice_analysis/ + # Next to the file name (Sea_Ice_Index_Monthly_Data_by_Year_G02135_v3.0.xlsx). + + # Extract all the text in the web page. + url = "/".join(snap.metadata.origin.url_download.split("/")[:-1]) # type: ignore + response = requests.get(url) + # Parse HTML content. + soup = BeautifulSoup(response.text, "html.parser") + + # Fetch the date that is written next to the title. + for line in soup.text.split("\n"): + if "Sea_Ice_Index_Monthly_Data_by_Year" in line: + dates = re.findall(r"\d{2}-\w{3}-\d{4}", line) + if len(dates) == 1: + # Format date conveniently. 
+ date = datetime.strptime(dates[0], "%d-%b-%Y").strftime("%Y-%m-%d") + return date + else: + log.warn(f"Failed to extract date_published for: {snap.path.name}") + + elif snap.path.name.startswith("sea_surface_temperature_"): + # * For sea_surface_temperature_* the date_published can be found on: + # https://www.metoffice.gov.uk/hadobs/hadsst4/data/download.html + + # Extract all the text in the web page. + url = snap.metadata.origin.url_download.split("/data/")[0] + "/data/download.html" # type: ignore + response = requests.get(url) + # Parse HTML content. + soup = BeautifulSoup(response.text, "html.parser") + + for line in soup.text.split("\n"): + # At the bottom of the page, there is a line like "Last updated: 09/01/2024 Expires: 09/01/2025". + if "Last updated" in line: + dates = re.findall(r"\d{2}/\d{2}/\d{4}", line) + if len(dates) == 2: + # Format date conveniently. + date = datetime.strptime(dates[0], "%d/%m/%Y").strftime("%Y-%m-%d") + return date + else: + log.warn(f"Failed to extract date_published for: {snap.path.name}") + + elif snap.path.name == "hawaii_ocean_time_series.csv": + # * For the Hawaii Ocean Time-Series, the date_published can be found written on the header of the data itself: + # https://hahana.soest.hawaii.edu/hot/hotco2/HOT_surface_CO2.txt + + # Extract text from data file. + url = snap.metadata.origin.url_download # type: ignore + response = requests.get(url) # type: ignore[reportArgumentType] + for line in response.text.split("\n"): + # At the top of the file, there is a line like "Last updated 11 December 2023 by J.E. Dore". + if "Last updated" in line: + # Regular expression to extract the date + dates = re.findall(r"\d{1,2}\s+\w+\s+\d{4}", line) + if len(dates) == 1: + # Format date conveniently. + date = datetime.strptime(dates[0], "%d %B %Y").strftime("%Y-%m-%d") + return date + else: + log.warn(f"Failed to extract date_published for: {snap.path.name}") + + elif "_concentration" in snap.path.name: + # * For NOAA GML concentration data, the date_published can be found in the header of each data file. + # The date is in a line like "# File Creation: Fri Jan 5 03:55:24 2024". + + # Extract text from data file. + url = snap.metadata.origin.url_download # type: ignore + response = requests.get(url) # type: ignore[reportArgumentType] + for line in response.text.split("\n"): + # At the top of the file, there is a line like "# File Creation: Fri Jan 5 03:55:24 2024". + if "File Creation" in line: + # Regular expression to extract the date + dates = re.findall(r"\w{3}\s\w{3}\s+\d{1,2}\s\d{2}:\d{2}:\d{2}\s\d{4}", line) + if len(dates) == 1: + # Format date conveniently. + date = datetime.strptime(dates[0], "%a %b %d %H:%M:%S %Y").strftime("%Y-%m-%d") + return date + else: + log.warn(f"Failed to extract date_published for: {snap.path.name}") + + # In all other cases, assume date_published is the same as date_accessed. + return snap.metadata.origin.date_accessed # type: ignore + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create new snapshot metadata dvc files for each of the data files. + for file_name in FILES: + snap = Snapshot(f"climate/{SNAPSHOT_VERSION}/{file_name}") + + # To ease this recurrent update task, take the access date from the snapshot version and write it to the dvc files. + snap.metadata.origin.date_accessed = SNAPSHOT_VERSION # type: ignore + + # Extract publication date, if possible, and otherwise assume it is the same as the access date.
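+ # (See find_date_published above for the per-source extraction logic.)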
+ snap.metadata.origin.date_published = find_date_published(snap=snap) # type: ignore + + # Extract publication year from date_published (which will be used in the custom attribution). + year_published = snap.metadata.origin.date_published.split("-")[0] # type: ignore + + # Assign a custom attribution. + snap.metadata.origin.attribution = ( # type: ignore + f"{snap.metadata.origin.producer} - {snap.metadata.origin.title} ({year_published})" # type: ignore + ) + + # Rewrite metadata to dvc file. + snap.metadata_path.write_text(snap.metadata.to_yaml()) + + # Create the actual snapshots, download the data and upload them to S3. + # NOTE: This cannot be done as part of the previous loop because, if the folder of dvc files has been manually + # duplicated (without manually removing the "outs" section), `create_snapshot` will fail because there are multiple + # files with the same "outs". Therefore, we first clean the dvc files, and then run `create_snapshot`. + for file_name in FILES: + snap = Snapshot(f"climate/{SNAPSHOT_VERSION}/{file_name}") + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/climate/2024-11-18/co2_concentration_monthly.csv.dvc b/snapshots/climate/2024-11-18/co2_concentration_monthly.csv.dvc new file mode 100644 index 00000000000..eb1757aaac5 --- /dev/null +++ b/snapshots/climate/2024-11-18/co2_concentration_monthly.csv.dvc @@ -0,0 +1,23 @@ +meta: + origin: + producer: NOAA Global Monitoring Laboratory + title: Trends in Atmospheric Carbon Dioxide + description: |- + The Carbon Cycle Greenhouse Gases (CCGG) research area operates the Global Greenhouse Gas Reference Network, measuring the atmospheric distribution and trends of the three main long-term drivers of climate change, carbon dioxide (CO2), methane (CH4), and nitrous oxide (N2O), as well as carbon monoxide (CO) which is an important indicator of air pollution. + citation_full: |- + National Oceanic and Atmospheric Administration (NOAA) Global Monitoring Laboratory, Boulder, Colorado, USA (https://gml.noaa.gov) - Trends in Atmospheric Carbon Dioxide. + + Lan, X., Tans, P. and K.W. Thoning: Trends in globally-averaged CO2 determined from NOAA Global Monitoring Laboratory measurements. https://doi.org/10.15138/9N0H-ZH07 + attribution: NOAA Global Monitoring Laboratory - Trends in Atmospheric Carbon Dioxide (2024) + attribution_short: NOAA/GML + url_main: https://gml.noaa.gov/ccgg/trends/gl_data.html + url_download: https://gml.noaa.gov/webdata/ccgg/trends/co2/co2_mm_gl.csv + date_accessed: '2024-11-18' + date_published: '2024-11-05' + license: + name: CC BY 4.0 + url: https://gml.noaa.gov/about/disclaimer.html +outs: + - md5: 1198b5768dfac528928e738a14244cdf + size: 23837 + path: co2_concentration_monthly.csv diff --git a/snapshots/climate/2024-11-18/hawaii_ocean_time_series.csv.dvc b/snapshots/climate/2024-11-18/hawaii_ocean_time_series.csv.dvc new file mode 100644 index 00000000000..c5737be8e7e --- /dev/null +++ b/snapshots/climate/2024-11-18/hawaii_ocean_time_series.csv.dvc @@ -0,0 +1,25 @@ +meta: + origin: + producer: School of Ocean & Earth Science & Technology + title: Hawaii Ocean Time-series + citation_full: |- + School of Ocean and Earth Science and Technology at the University of Hawai'i at Manoa - Hawaii Ocean Time-series (HOT). + + Dore, J.E., R. Lukas, D.W. Sadler, M.J. Church, and D.M. Karl. 2009. Physical and biogeochemical modulation of ocean acidification in the central North Pacific. Proc Natl Acad Sci USA 106:12235-12240. 
+ + HOT observations are supported by the U.S. National Science Foundation under Award #1756517. + + More details can be found at [the HOT Carbon Dioxide page](https://hahana.soest.hawaii.edu/hot/hotco2/hotco2.html), specifically in [this technical document](https://hahana.soest.hawaii.edu/hot/hotco2/HOT_surface_CO2_readme.pdf). + attribution: School of Ocean & Earth Science & Technology - Hawaii Ocean Time-series (2023) + attribution_short: SOEST/Hawaii + url_main: https://hahana.soest.hawaii.edu/hot/ + url_download: https://hahana.soest.hawaii.edu/hot/hotco2/HOT_surface_CO2.txt + date_accessed: '2024-11-18' + date_published: '2023-12-11' + license: + name: Public domain + url: https://hahana.soest.hawaii.edu/hot/dataaccess.html +outs: + - md5: fd502d28aa85a6f241e9507d85b8ca8b + size: 44820 + path: hawaii_ocean_time_series.csv diff --git a/snapshots/climate/2024-11-18/n2o_concentration_monthly.csv.dvc b/snapshots/climate/2024-11-18/n2o_concentration_monthly.csv.dvc new file mode 100644 index 00000000000..9534f0370d7 --- /dev/null +++ b/snapshots/climate/2024-11-18/n2o_concentration_monthly.csv.dvc @@ -0,0 +1,23 @@ +meta: + origin: + producer: NOAA Global Monitoring Laboratory + title: Trends in Atmospheric Nitrous Oxide + description: |- + The Carbon Cycle Greenhouse Gases (CCGG) research area operates the Global Greenhouse Gas Reference Network, measuring the atmospheric distribution and trends of the three main long-term drivers of climate change, carbon dioxide (CO2), methane (CH4), and nitrous oxide (N2O), as well as carbon monoxide (CO) which is an important indicator of air pollution. + citation_full: |- + National Oceanic and Atmospheric Administration (NOAA) Global Monitoring Laboratory, Boulder, Colorado, USA (https://gml.noaa.gov) - Trends in Atmospheric Nitrous Oxide. + + Lan, X., K.W. Thoning, and E.J. Dlugokencky: Trends in globally-averaged CH4, N2O, and SF6 determined from NOAA Global Monitoring Laboratory measurements. https://doi.org/10.15138/P8XG-AA10 + attribution: NOAA Global Monitoring Laboratory - Trends in Atmospheric Nitrous Oxide (2024) + attribution_short: NOAA/GML + url_main: https://gml.noaa.gov/ccgg/trends_n2o/ + url_download: https://gml.noaa.gov/webdata/ccgg/trends/n2o/n2o_mm_gl.csv + date_accessed: '2024-11-18' + date_published: '2024-11-05' + license: + name: CC BY 4.0 + url: https://gml.noaa.gov/about/disclaimer.html +outs: + - md5: af100a1b1b8b0016d6cefb2ed12eb395 + size: 13595 + path: n2o_concentration_monthly.csv diff --git a/snapshots/climate/2024-11-18/ocean_heat_content_annual_world_2000m.csv.dvc b/snapshots/climate/2024-11-18/ocean_heat_content_annual_world_2000m.csv.dvc new file mode 100644 index 00000000000..5df36426300 --- /dev/null +++ b/snapshots/climate/2024-11-18/ocean_heat_content_annual_world_2000m.csv.dvc @@ -0,0 +1,27 @@ +meta: + origin: + producer: NOAA National Centers for Environmental Information + title: Heat Content Basin Time Series + description: |- + The time series of yearly heat content are presented for the 0-700 and 0-2000 meters layers. + + The yearly data for each of four major oceanic basins, namely the World Ocean, the Pacific Ocean, the Atlantic Ocean (which includes the entire Arctic Ocean), and the Indian Ocean, can be accessed on [this page](https://www.ncei.noaa.gov/access/global-ocean-heat-content/basin_heat_data.html). 
+ title_snapshot: Heat Content Basin Time Series - World 0 to 2000 meters + citation_full: |- + National Oceanic and Atmospheric Administration (NOAA) National Centers for Environmental Information (NCEI) - Heat Content Basin Time Series. + + Levitus, Sydney; Antonov, John I.; Boyer, Tim P.; Baranova, Olga K.; García, Hernán E.; Locarnini, Ricardo A.; Mishonov, Alexey V.; Reagan, James R.; Seidov, Dan; Yarosh, Evgeney; Zweng, Melissa M. (2017). NCEI ocean heat content, temperature anomalies, salinity anomalies, thermosteric sea level anomalies, halosteric sea level anomalies, and total steric sea level anomalies from 1955 to present calculated from in situ oceanographic subsurface profile data (NCEI Accession 0164586). https://doi.org/10.7289/v53f4mvp. + attribution: NOAA National Centers for Environmental Information - Heat Content Basin Time Series (2024) + attribution_short: NOAA/NCEI + url_main: https://www.ncei.noaa.gov/products/ocean-heat-salt-sea-level + url_download: https://www.ncei.noaa.gov/data/oceans/woa/DATA_ANALYSIS/3M_HEAT_CONTENT/DATA/basin/yearly/h22-w0-2000m.dat + date_accessed: '2024-11-18' + date_published: '2024-11-18' + license: + name: CC BY 4.0 + url: |- + https://www.ncei.noaa.gov/sites/g/files/anmtlf171/files/2023-12/NCEI%20PD-10-2-02%20-%20Open%20Data%20Policy%20Signed.pdf +outs: + - md5: af13e73414f3cde4a2326156cf385d35 + size: 1140 + path: ocean_heat_content_annual_world_2000m.csv diff --git a/snapshots/climate/2024-11-18/ocean_heat_content_annual_world_700m.csv.dvc b/snapshots/climate/2024-11-18/ocean_heat_content_annual_world_700m.csv.dvc new file mode 100644 index 00000000000..fbb16b89ece --- /dev/null +++ b/snapshots/climate/2024-11-18/ocean_heat_content_annual_world_700m.csv.dvc @@ -0,0 +1,27 @@ +meta: + origin: + producer: NOAA National Centers for Environmental Information + title: Heat Content Basin Time Series + description: |- + The time series of yearly heat content are presented for the 0-700 and 0-2000 meters layers. + + The yearly data for each of four major oceanic basins, namely the World Ocean, the Pacific Ocean, the Atlantic Ocean (which includes the entire Arctic Ocean), and the Indian Ocean, can be accessed on [this page](https://www.ncei.noaa.gov/access/global-ocean-heat-content/basin_heat_data.html). + title_snapshot: Heat Content Basin Time Series - World 0 to 700 meters + citation_full: |- + National Oceanic and Atmospheric Administration (NOAA) National Centers for Environmental Information (NCEI) - Heat Content Basin Time Series. + + Levitus, Sydney; Antonov, John I.; Boyer, Tim P.; Baranova, Olga K.; García, Hernán E.; Locarnini, Ricardo A.; Mishonov, Alexey V.; Reagan, James R.; Seidov, Dan; Yarosh, Evgeney; Zweng, Melissa M. (2017). NCEI ocean heat content, temperature anomalies, salinity anomalies, thermosteric sea level anomalies, halosteric sea level anomalies, and total steric sea level anomalies from 1955 to present calculated from in situ oceanographic subsurface profile data (NCEI Accession 0164586). https://doi.org/10.7289/v53f4mvp.
+ attribution: NOAA National Centers for Environmental Information - Heat Content Basin Time Series (2024) + attribution_short: NOAA/NCEI + url_main: https://www.ncei.noaa.gov/products/ocean-heat-salt-sea-level + url_download: https://www.ncei.noaa.gov/data/oceans/woa/DATA_ANALYSIS/3M_HEAT_CONTENT/DATA/basin/yearly/h22-w0-700m.dat + date_accessed: '2024-11-18' + date_published: '2024-11-18' + license: + name: CC BY 4.0 + url: |- + https://www.ncei.noaa.gov/sites/g/files/anmtlf171/files/2023-12/NCEI%20PD-10-2-02%20-%20Open%20Data%20Policy%20Signed.pdf +outs: + - md5: ef1fff5b0e82b86383acb1e455ea00e5 + size: 3990 + path: ocean_heat_content_annual_world_700m.csv diff --git a/snapshots/climate/2024-11-18/ocean_heat_content_monthly_world_2000m.csv.dvc b/snapshots/climate/2024-11-18/ocean_heat_content_monthly_world_2000m.csv.dvc new file mode 100644 index 00000000000..835518a0a9c --- /dev/null +++ b/snapshots/climate/2024-11-18/ocean_heat_content_monthly_world_2000m.csv.dvc @@ -0,0 +1,28 @@ +meta: + origin: + producer: NOAA National Centers for Environmental Information + title: Heat Content Monthly Basin Time Series + description: |- + The time series of monthly heat content are presented for the 0-700 and 0-2000 meters layers. + + The monthly data for each of the four major oceanic basins, namely the World Ocean, the Pacific Ocean, the Atlantic Ocean (which includes the entire Arctic Ocean), and the Indian Ocean, can be accessed on [this page](https://www.ncei.noaa.gov/access/global-ocean-heat-content/basin_heat_data_monthly.html). + title_snapshot: Heat Content Monthly Basin Time Series - World 0 to 2000 meters + citation_full: |- + National Oceanic and Atmospheric Administration (NOAA) National Centers for Environmental Information (NCEI) - Heat Content Monthly Basin Time Series. + + Levitus, Sydney; Antonov, John I.; Boyer, Tim P.; Baranova, Olga K.; García, Hernán E.; Locarnini, Ricardo A.; Mishonov, Alexey V.; Reagan, James R.; Seidov, Dan; Yarosh, Evgeney; Zweng, Melissa M. (2017). NCEI ocean heat content, temperature anomalies, salinity anomalies, thermosteric sea level anomalies, halosteric sea level anomalies, and total steric sea level anomalies from 1955 to present calculated from in situ oceanographic subsurface profile data (NCEI Accession 0164586). https://doi.org/10.7289/v53f4mvp. 
+ attribution: NOAA National Centers for Environmental Information - Heat Content Monthly Basin Time Series (2024) + attribution_short: NOAA/NCEI + url_main: https://www.ncei.noaa.gov/products/ocean-heat-salt-sea-level + url_download: |- + https://www.ncei.noaa.gov/data/oceans/woa/DATA_ANALYSIS/3M_HEAT_CONTENT/DATA/basin/onemonth/ohc2000m_levitus_climdash_monthly.csv + date_accessed: '2024-11-18' + date_published: '2024-11-18' + license: + name: CC BY 4.0 + url: |- + https://www.ncei.noaa.gov/sites/g/files/anmtlf171/files/2023-12/NCEI%20PD-10-2-02%20-%20Open%20Data%20Policy%20Signed.pdf +outs: + - md5: 75f29151644e6a15a5a488ecd0692d47 + size: 4031 + path: ocean_heat_content_monthly_world_2000m.csv diff --git a/snapshots/climate/2024-11-18/ocean_heat_content_monthly_world_700m.csv.dvc b/snapshots/climate/2024-11-18/ocean_heat_content_monthly_world_700m.csv.dvc new file mode 100644 index 00000000000..87392a39dfd --- /dev/null +++ b/snapshots/climate/2024-11-18/ocean_heat_content_monthly_world_700m.csv.dvc @@ -0,0 +1,28 @@ +meta: + origin: + producer: NOAA National Centers for Environmental Information + title: Heat Content Monthly Basin Time Series + description: |- + The time series of monthly heat content are presented for the 0-700 and 0-2000 meters layers. + + The monthly data for each of the four major oceanic basins, namely the World Ocean, the Pacific Ocean, the Atlantic Ocean (which includes the entire Arctic Ocean), and the Indian Ocean, can be accessed on [this page](https://www.ncei.noaa.gov/access/global-ocean-heat-content/basin_heat_data_monthly.html). + title_snapshot: Heat Content Monthly Basin Time Series - World 0 to 700 meters + citation_full: |- + National Oceanic and Atmospheric Administration (NOAA) National Centers for Environmental Information (NCEI) - Heat Content Monthly Basin Time Series. + + Levitus, Sydney; Antonov, John I.; Boyer, Tim P.; Baranova, Olga K.; García, Hernán E.; Locarnini, Ricardo A.; Mishonov, Alexey V.; Reagan, James R.; Seidov, Dan; Yarosh, Evgeney; Zweng, Melissa M. (2017). NCEI ocean heat content, temperature anomalies, salinity anomalies, thermosteric sea level anomalies, halosteric sea level anomalies, and total steric sea level anomalies from 1955 to present calculated from in situ oceanographic subsurface profile data (NCEI Accession 0164586). https://doi.org/10.7289/v53f4mvp. + attribution: NOAA National Centers for Environmental Information - Heat Content Monthly Basin Time Series (2024) + attribution_short: NOAA/NCEI + url_main: https://www.ncei.noaa.gov/products/ocean-heat-salt-sea-level + url_download: |- + https://www.ncei.noaa.gov/data/oceans/woa/DATA_ANALYSIS/3M_HEAT_CONTENT/DATA/basin/onemonth/ohc_levitus_climdash_monthly.csv + date_accessed: '2024-11-18' + date_published: '2024-11-18' + license: + name: CC BY 4.0 + url: |- + https://www.ncei.noaa.gov/sites/g/files/anmtlf171/files/2023-12/NCEI%20PD-10-2-02%20-%20Open%20Data%20Policy%20Signed.pdf +outs: + - md5: eeb3e921ca2515d6209f33433e3c15f6 + size: 3984 + path: ocean_heat_content_monthly_world_700m.csv diff --git a/snapshots/climate/2024-11-18/sea_ice_index.xlsx.dvc b/snapshots/climate/2024-11-18/sea_ice_index.xlsx.dvc new file mode 100644 index 00000000000..dce2df76d58 --- /dev/null +++ b/snapshots/climate/2024-11-18/sea_ice_index.xlsx.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: National Snow and Ice Data Center + title: Sea Ice Index + citation_full: |- + Fetterer, F., K. Knowles, W. N. Meier, M. Savoie, and A. K. Windnagel. (2017). Sea Ice Index, Version 3 [Data Set]. 
Boulder, Colorado USA. National Snow and Ice Data Center. https://doi.org/10.7265/N5K072F8. + attribution: National Snow and Ice Data Center - Sea Ice Index (2024) + attribution_short: NSIDC + version_producer: Version 3 + url_main: https://nsidc.org/data/g02135/ + url_download: https://noaadata.apps.nsidc.org/NOAA/G02135/seaice_analysis/Sea_Ice_Index_Monthly_Data_by_Year_G02135_v3.0.xlsx + date_accessed: '2024-11-18' + date_published: '2024-11-17' + license: + name: CC BY 4.0 +outs: + - md5: f0a22168d7f60ba44fad42044d210de8 + size: 25352 + path: sea_ice_index.xlsx diff --git a/snapshots/climate/2024-11-18/sea_surface_temperature_northern_hemisphere.csv.dvc b/snapshots/climate/2024-11-18/sea_surface_temperature_northern_hemisphere.csv.dvc new file mode 100644 index 00000000000..b31d1569bdf --- /dev/null +++ b/snapshots/climate/2024-11-18/sea_surface_temperature_northern_hemisphere.csv.dvc @@ -0,0 +1,26 @@ +meta: + origin: + producer: Met Office Hadley Centre + title: Hadley Centre's Sea Surface Temperature (HadSST) + title_snapshot: Hadley Centre's Sea Surface Temperature (HadSST) - Northern hemisphere + citation_full: |- + Met Office Hadley Centre - Hadley Centre's Sea Surface Temperature (HadSST). + + Kennedy, J. J., Rayner, N. A., Atkinson, C. P., & Killick, R. + E. (2019). An ensemble data set of sea-surface temperature change from 1850: + the Met Office Hadley Centre HadSST.4.0.0.0 data set. Journal of Geophysical + Research: Atmospheres, 124. https://doi.org/10.1029/2018JD029867 + attribution: Met Office Hadley Centre - Hadley Centre's Sea Surface Temperature (HadSST) (2024) + attribution_short: Met Office + version_producer: 4.0.1.0 + url_main: https://www.metoffice.gov.uk/hadobs/hadsst4/ + url_download: https://www.metoffice.gov.uk/hadobs/hadsst4/data/csv/HadSST.4.0.1.0_monthly_NHEM.csv + date_accessed: '2024-11-18' + date_published: '2024-11-13' + license: + name: Open Government Licence v3 + url: https://www.metoffice.gov.uk/hadobs/hadsst4/data/download.html +outs: + - md5: f65fc75f86209d317a80d36d274d1fd3 + size: 153317 + path: sea_surface_temperature_northern_hemisphere.csv diff --git a/snapshots/climate/2024-11-18/sea_surface_temperature_southern_hemisphere.csv.dvc b/snapshots/climate/2024-11-18/sea_surface_temperature_southern_hemisphere.csv.dvc new file mode 100644 index 00000000000..cf5613753b2 --- /dev/null +++ b/snapshots/climate/2024-11-18/sea_surface_temperature_southern_hemisphere.csv.dvc @@ -0,0 +1,26 @@ +meta: + origin: + producer: Met Office Hadley Centre + title: Hadley Centre's Sea Surface Temperature (HadSST) + title_snapshot: Hadley Centre's Sea Surface Temperature (HadSST) - Southern hemisphere + citation_full: |- + Met Office Hadley Centre - Hadley Centre's Sea Surface Temperature (HadSST). + + Kennedy, J. J., Rayner, N. A., Atkinson, C. P., & Killick, R. + E. (2019). An ensemble data set of sea-surface temperature change from 1850: + the Met Office Hadley Centre HadSST.4.0.0.0 data set. Journal of Geophysical + Research: Atmospheres, 124. 
https://doi.org/10.1029/2018JD029867 + attribution: Met Office Hadley Centre - Hadley Centre's Sea Surface Temperature (HadSST) (2024) + attribution_short: Met Office + version_producer: 4.0.1.0 + url_main: https://www.metoffice.gov.uk/hadobs/hadsst4/ + url_download: https://www.metoffice.gov.uk/hadobs/hadsst4/data/csv/HadSST.4.0.1.0_monthly_SHEM.csv + date_accessed: '2024-11-18' + date_published: '2024-11-13' + license: + name: Open Government Licence v3 + url: https://www.metoffice.gov.uk/hadobs/hadsst4/data/download.html +outs: + - md5: 0532779d0b546590a3a05eff7cc83bc7 + size: 153913 + path: sea_surface_temperature_southern_hemisphere.csv diff --git a/snapshots/climate/2024-11-18/sea_surface_temperature_world.csv.dvc b/snapshots/climate/2024-11-18/sea_surface_temperature_world.csv.dvc new file mode 100644 index 00000000000..f0d7df032ed --- /dev/null +++ b/snapshots/climate/2024-11-18/sea_surface_temperature_world.csv.dvc @@ -0,0 +1,26 @@ +meta: + origin: + producer: Met Office Hadley Centre + title: Hadley Centre's Sea Surface Temperature (HadSST) + title_snapshot: Hadley Centre's Sea Surface Temperature (HadSST) - World + citation_full: |- + Met Office Hadley Centre - Hadley Centre's Sea Surface Temperature (HadSST). + + Kennedy, J. J., Rayner, N. A., Atkinson, C. P., & Killick, R. + E. (2019). An ensemble data set of sea-surface temperature change from 1850: + the Met Office Hadley Centre HadSST.4.0.0.0 data set. Journal of Geophysical + Research: Atmospheres, 124. https://doi.org/10.1029/2018JD029867 + attribution: Met Office Hadley Centre - Hadley Centre's Sea Surface Temperature (HadSST) (2024) + attribution_short: Met Office + version_producer: 4.0.1.0 + url_main: https://www.metoffice.gov.uk/hadobs/hadsst4/ + url_download: https://www.metoffice.gov.uk/hadobs/hadsst4/data/csv/HadSST.4.0.1.0_monthly_GLOBE.csv + date_accessed: '2024-11-18' + date_published: '2024-11-13' + license: + name: Open Government Licence v3 + url: https://www.metoffice.gov.uk/hadobs/hadsst4/data/download.html +outs: + - md5: 5d35e517c8f90f515b7df119196c0601 + size: 153806 + path: sea_surface_temperature_world.csv diff --git a/snapshots/climate/2024-11-18/snow_cover_extent_north_america.csv.dvc b/snapshots/climate/2024-11-18/snow_cover_extent_north_america.csv.dvc new file mode 100644 index 00000000000..486f69392d2 --- /dev/null +++ b/snapshots/climate/2024-11-18/snow_cover_extent_north_america.csv.dvc @@ -0,0 +1,22 @@ +meta: + origin: + producer: Rutgers University Global Snow Lab + title: Snow Cover Extent + title_snapshot: Area of Snow Extent - North America (including Greenland) + citation_full: |- + Rutgers University Global Snow Lab - Area of Snow Extent. + + Robinson, David A., Estilow, Thomas W., and NOAA CDR Program (2012): NOAA Climate Data Record (CDR) of Northern Hemisphere (NH) Snow Cover Extent (SCE), Version 1. NOAA National Centers for Environmental Information. 
doi: 10.7289/V5N014G9 + attribution: Rutgers University Global Snow Lab - Snow Cover Extent (2024) + attribution_short: Rutgers + version_producer: Version 1 + url_main: https://climate.rutgers.edu/snowcover/table_area.php?ui_set=1&ui_sort=0 + url_download: https://climate.rutgers.edu/snowcover/files/moncov.namgnld.txt + date_accessed: '2024-11-18' + date_published: '2024-11-18' + license: + name: CC BY 4.0 +outs: + - md5: 47ef1f63e1a2de21e16c862bf9e13919 + size: 12726 + path: snow_cover_extent_north_america.csv diff --git a/snapshots/climate/2024-11-18/snow_cover_extent_northern_hemisphere.csv.dvc b/snapshots/climate/2024-11-18/snow_cover_extent_northern_hemisphere.csv.dvc new file mode 100644 index 00000000000..1777cdeefbb --- /dev/null +++ b/snapshots/climate/2024-11-18/snow_cover_extent_northern_hemisphere.csv.dvc @@ -0,0 +1,22 @@ +meta: + origin: + producer: Rutgers University Global Snow Lab + title: Snow Cover Extent + title_snapshot: Area of Snow Extent - Northern Hemisphere + citation_full: |- + Rutgers University Global Snow Lab - Area of Snow Extent. + + Robinson, David A., Estilow, Thomas W., and NOAA CDR Program (2012): NOAA Climate Data Record (CDR) of Northern Hemisphere (NH) Snow Cover Extent (SCE), Version 1. NOAA National Centers for Environmental Information. doi: 10.7289/V5N014G9 + attribution: Rutgers University Global Snow Lab - Snow Cover Extent (2024) + attribution_short: Rutgers + version_producer: Version 1 + url_main: https://climate.rutgers.edu/snowcover/table_area.php?ui_set=1&ui_sort=0 + url_download: https://climate.rutgers.edu/snowcover/files/moncov.nhland.txt + date_accessed: '2024-11-18' + date_published: '2024-11-18' + license: + name: CC BY 4.0 +outs: + - md5: 9ff3e1e0c33fa1ac2099ee3ec6a6f503 + size: 12850 + path: snow_cover_extent_northern_hemisphere.csv diff --git a/snapshots/climate/2024-11-18/surface_temperature_analysis_northern_hemisphere.csv.dvc b/snapshots/climate/2024-11-18/surface_temperature_analysis_northern_hemisphere.csv.dvc new file mode 100644 index 00000000000..32cc0dc0be2 --- /dev/null +++ b/snapshots/climate/2024-11-18/surface_temperature_analysis_northern_hemisphere.csv.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: NASA Goddard Institute for Space Studies + title: GISS Surface Temperature Analysis + title_snapshot: GISS Surface Temperature Analysis - Northern hemisphere + citation_full: NASA. GISS Surface Temperature Analysis (GISTEMP v4) + attribution: NASA Goddard Institute for Space Studies - GISS Surface Temperature Analysis (2024) + attribution_short: NASA + version_producer: v4 + url_main: https://data.giss.nasa.gov/gistemp/ + url_download: https://data.giss.nasa.gov/gistemp/tabledata_v4/NH.Ts+dSST.csv + date_accessed: '2024-11-18' + date_published: '2024-11-18' + license: + name: CC BY 4.0 +outs: + - md5: 688cdaeb087ec6ba3e5128a8c38681ff + size: 12743 + path: surface_temperature_analysis_northern_hemisphere.csv diff --git a/snapshots/climate/2024-11-18/surface_temperature_analysis_southern_hemisphere.csv.dvc b/snapshots/climate/2024-11-18/surface_temperature_analysis_southern_hemisphere.csv.dvc new file mode 100644 index 00000000000..2faf344c758 --- /dev/null +++ b/snapshots/climate/2024-11-18/surface_temperature_analysis_southern_hemisphere.csv.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: NASA Goddard Institute for Space Studies + title: GISS Surface Temperature Analysis + title_snapshot: GISS Surface Temperature Analysis - Southern hemisphere + citation_full: NASA. 
GISS Surface Temperature Analysis (GISTEMP v4) + attribution: NASA Goddard Institute for Space Studies - GISS Surface Temperature Analysis (2024) + attribution_short: NASA + version_producer: v4 + url_main: https://data.giss.nasa.gov/gistemp/ + url_download: https://data.giss.nasa.gov/gistemp/tabledata_v4/SH.Ts+dSST.csv + date_accessed: '2024-11-18' + date_published: '2024-11-18' + license: + name: CC BY 4.0 +outs: + - md5: 89a66566d12cd9c1f905c82dbfb1477c + size: 12716 + path: surface_temperature_analysis_southern_hemisphere.csv diff --git a/snapshots/climate/2024-11-18/surface_temperature_analysis_world.csv.dvc b/snapshots/climate/2024-11-18/surface_temperature_analysis_world.csv.dvc new file mode 100644 index 00000000000..f1c51217b6c --- /dev/null +++ b/snapshots/climate/2024-11-18/surface_temperature_analysis_world.csv.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: NASA Goddard Institute for Space Studies + title: GISS Surface Temperature Analysis + title_snapshot: GISS Surface Temperature Analysis - World + citation_full: NASA. GISS Surface Temperature Analysis (GISTEMP v4) + attribution: NASA Goddard Institute for Space Studies - GISS Surface Temperature Analysis (2024) + attribution_short: NASA + version_producer: v4 + url_main: https://data.giss.nasa.gov/gistemp/ + url_download: https://data.giss.nasa.gov/gistemp/tabledata_v4/GLB.Ts+dSST.csv + date_accessed: '2024-11-18' + date_published: '2024-11-18' + license: + name: CC BY 4.0 +outs: + - md5: b7af98cebc53cf403b2a0df2749beece + size: 12696 + path: surface_temperature_analysis_world.csv diff --git a/snapshots/climate/2024-11-19/total_precipitation.py b/snapshots/climate/2024-11-19/total_precipitation.py new file mode 100644 index 00000000000..261639c7b02 --- /dev/null +++ b/snapshots/climate/2024-11-19/total_precipitation.py @@ -0,0 +1,55 @@ +"""Script to create a snapshot of the monthly averaged total precipitation data from 1940 to present from the Copernicus Climate Change Service. + +The script assumes that the data is available on the CDS API. +Instructions on how to access the API on a Mac are here: https://confluence.ecmwf.int/display/CKB/How+to+install+and+use+CDS+API+on+macOS + +More information on how to access the data is here: https://cds.climate.copernicus.eu/datasets/reanalysis-era5-single-levels-monthly-means?tab=overview + +The data is downloaded as a NetCDF file. Tutorials for using the Copernicus API and working with the NetCDF format are here: https://ecmwf-projects.github.io/copernicus-training-c3s/cds-tutorial.html +""" + +import tempfile +from pathlib import Path + +# CDS API +import cdsapi +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"climate/{SNAPSHOT_VERSION}/total_precipitation.zip") + + # Save data as a compressed temporary file.
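+ # NOTE: The request below covers the full global grid for 1940-2024, so the zipped download is large (roughly 1.4 GB) and can take a while.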
+ with tempfile.TemporaryDirectory() as temp_dir: + output_file = Path(temp_dir) / "era5_monthly_tp.nc" + + client = cdsapi.Client() + + dataset = "reanalysis-era5-single-levels-monthly-means" + request = { + "product_type": ["monthly_averaged_reanalysis"], + "variable": ["total_precipitation"], + "year": [str(year) for year in range(1940, 2025)], + "month": ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"], + "time": "00:00", + "area": [90, -180, -90, 180], + "data_format": "netcdf", + "download_format": "zip", + } + + client.retrieve(dataset, request, output_file) + + # Upload snapshot. + snap.create_snapshot(filename=output_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/climate/2024-11-19/total_precipitation.zip.dvc b/snapshots/climate/2024-11-19/total_precipitation.zip.dvc new file mode 100644 index 00000000000..b337fb39531 --- /dev/null +++ b/snapshots/climate/2024-11-19/total_precipitation.zip.dvc @@ -0,0 +1,22 @@ +meta: + origin: + title_snapshot: ERA5 Monthly Averaged Data on Single Levels from 1940 to Present - Monthly Averages of Total Precipitation + title: ERA5 monthly averaged data on single levels from 1940 to present + description: |- + Monthly averages of total precipitation from the ERA5 reanalysis. The data is on single levels and covers the period from 1940 to present. The data is available at a spatial resolution of 0.25 degrees. The data is provided by the Copernicus Climate Change Service (C3S) Climate Data Store (CDS). + producer: Contains modified Copernicus Climate Change Service information + version_producer: 2 + citation_full: |- + Hersbach, H., Bell, B., Berrisford, P., Biavati, G., Horányi, A., Muñoz Sabater, J., Nicolas, J., Peubey, C., Radu, R., Rozum, I., Schepers, D., Simmons, A., Soci, C., Dee, D., Thépaut, J-N. (2023): ERA5 monthly averaged data on single levels from 1940 to present.
Copernicus Climate Change Service (C3S) Climate Data Store (CDS), DOI: 10.24381/cds.f17050d7 (Accessed on 19-Nov-2024) + url_main: https://cds.climate.copernicus.eu/datasets/reanalysis-era5-single-levels-monthly-means?tab=overview + date_accessed: 2024-11-19 + date_published: 2024-11-06 + license: + name: Copernicus License + url: https://cds.climate.copernicus.eu/datasets/reanalysis-era5-single-levels-monthly-means?tab=overview + + +outs: + - md5: b8c02aa4a3552ccaf2920398114bb6a2 + size: 1392608692 + path: total_precipitation.zip diff --git a/snapshots/climate/latest/weekly_wildfires.csv.dvc b/snapshots/climate/latest/weekly_wildfires.csv.dvc index 9e649833d39..d103b37e950 100644 --- a/snapshots/climate/latest/weekly_wildfires.csv.dvc +++ b/snapshots/climate/latest/weekly_wildfires.csv.dvc @@ -9,12 +9,12 @@ meta: citation_full: Global Wildfire Information System attribution_short: GWIS url_main: https://gwis.jrc.ec.europa.eu/apps/gwis.statistics/seasonaltrend - date_accessed: 2024-11-04 - date_published: 2024-11-04 + date_accessed: 2024-12-11 + date_published: 2024-12-11 license: name: CC BY 4.0 url: https://gwis.jrc.ec.europa.eu/about-gwis/data-license outs: - - md5: 9ef710b46cdd8b08f56940c703d02019 - size: 12629979 + - md5: fc6f8b908a2988b2d8048707526c460a + size: 12799310 path: weekly_wildfires.csv diff --git a/snapshots/climate/latest/weekly_wildfires.py b/snapshots/climate/latest/weekly_wildfires.py index 3ee9c51ba59..002a5f1303f 100644 --- a/snapshots/climate/latest/weekly_wildfires.py +++ b/snapshots/climate/latest/weekly_wildfires.py @@ -51,6 +51,9 @@ def main(upload: bool) -> None: # Initialize a new snapshot object for storing data, using a predefined file path structure. snap = Snapshot(f"climate/{SNAPSHOT_VERSION}/weekly_wildfires.csv") + # Load existing snapshot for comparison at the end of the script. + orig_snapshot_df = snap.read() + # Initialize an empty list to hold DataFrames for wildfire data. dfs_fires = [] @@ -62,7 +65,9 @@ def main(upload: bool) -> None: base_url = ( "https://api2.effis.emergency.copernicus.eu/statistics/v2/gwis/weekly?country={country_code}&year={year}" ) + url = base_url.format(country_code=country, year=YEAR) + # Timeout after 30s; the API has occasional outages. response = requests.get(url, timeout=30) if response.status_code == 200: @@ -72,7 +77,6 @@ def main(upload: bool) -> None: banfweekly = data["banfweekly"] # Convert the weekly data into a pandas DataFrame. df = pd.DataFrame(banfweekly) - # Select and rename relevant columns, and calculate the 'month_day' column. df = df[["mddate", "events", "area_ha"]] df["month_day"] = [date[4:6] + "-" + date[6:] for date in df["mddate"]] @@ -165,6 +169,12 @@ def main(upload: bool) -> None: # Combine both fires and emissions data into a final DataFrame. df_final = pd.concat([dfs_fires, dfs_emissions]) + + if len(df_final) < len(orig_snapshot_df): + raise ValueError( + f"New snapshot has fewer rows ({len(df_final)}) than the original snapshot ({len(orig_snapshot_df)}). The API could be down, or data could be missing." + ) + # Save the final DataFrame to the specified file path in the snapshot.
df_to_file(df_final, file_path=snap.path) # type: ignore[reportArgumentType] diff --git a/snapshots/climate_watch/2024-11-21/emissions_by_sector.gz.dvc b/snapshots/climate_watch/2024-11-21/emissions_by_sector.gz.dvc new file mode 100644 index 00000000000..e2dd795191f --- /dev/null +++ b/snapshots/climate_watch/2024-11-21/emissions_by_sector.gz.dvc @@ -0,0 +1,25 @@ +meta: + origin: + producer: Climate Watch + title: Greenhouse gas emissions by sector + description: |- + Climate Watch Historical Emissions data contains sector-level greenhouse gas (GHG) emissions, including emissions of the six major GHGs from most major sources and sinks. Non-CO₂ emissions are expressed in CO₂ equivalents using 100-year global warming potential values from the Intergovernmental Panel on Climate Change (IPCC) Fourth Assessment Report. + + More information about their data sources and methodology can be found in their [FAQ page](https://www.climatewatchdata.org/about/faq/ghg). Specifically, the definitions of all Climate Watch data sectors and their methodology are explained in [this document](https://wri-sites.s3.us-east-1.amazonaws.com/climatewatch.org/www.climatewatch.org/climate-watch/wri_metadata/CW_GHG_Method_Note.pdf). + citation_full: |- + Climate Watch. 2024. Washington, DC: World Resources Institute (WRI). Available online at: https://www.climatewatchdata.org + + Climate Watch data are derived from several sources. + - Data on land-use change and forestry, and agriculture, are sourced from the Food and Agriculture Organization of the United Nations, FAOSTAT Emissions Database. + - Data on greenhouse gas emissions from fuel combustion are sourced from the OECD/IEA. + attribution_short: Climate Watch + url_main: https://www.climatewatchdata.org/data-explorer/historical-emissions + date_accessed: '2024-11-21' + date_published: '2024-11-13' + license: + name: CC BY 4.0 + url: https://www.climatewatchdata.org/about/permissions +outs: + - md5: 72912534be0bcba4aafc2027b286a64c + size: 3834428 + path: emissions_by_sector.gz diff --git a/snapshots/climate_watch/2024-11-21/emissions_by_sector.py b/snapshots/climate_watch/2024-11-21/emissions_by_sector.py new file mode 100644 index 00000000000..67cdadfc19c --- /dev/null +++ b/snapshots/climate_watch/2024-11-21/emissions_by_sector.py @@ -0,0 +1,103 @@ +"""Script to create a snapshot of dataset. + +NOTE: The publication date can be found by going to +https://www.climatewatchdata.org/data-explorer/historical-emissions +and checking the notifications (clicking on the bell icon in the top right corner). +They have many notifications, and it's not clear which one corresponds to a major update, +so I will use the latest one, which signals the last time they revisited (at least some of) their data. + +""" + +import gzip +import json +import tempfile +from pathlib import Path +from time import sleep + +import click +import requests +from tqdm.auto import tqdm + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + +# Fixed inputs. + +# Climate Watch API URL. +API_URL = "https://www.climatewatchdata.org/api/v1/data/historical_emissions/" +# Number of records to fetch per API request. +API_RECORDS_PER_REQUEST = 500 +# Time to wait between consecutive API requests.
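+# (In seconds; a short pause between requests to avoid overloading the API.)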
+TIME_BETWEEN_REQUESTS = 0.1 + + +def fetch_all_data_from_api( + api_url=API_URL, + api_records_per_request=API_RECORDS_PER_REQUEST, + time_between_requests=TIME_BETWEEN_REQUESTS, +): + """Fetch all data from the Climate Watch Data API. + + Parameters + ---------- + api_url : str + API URL. + api_records_per_request : int + Maximum number of records to fetch per API request. + time_between_requests : float + Time to wait between consecutive API requests. + + Returns + ------- + data_all : list + Raw data (list with one dictionary per record). + + """ + # Start requests session. + session = requests.Session() + # The total number of records in the database is returned in the headers of each response. + # Send a simple request to get that number. + response = session.get(url=api_url) + total_records = int(response.headers["total"]) + print(f"Total number of records to fetch from API: {total_records}") + + # Number of requests to ensure all pages are requested. + total_requests = round(total_records / api_records_per_request) + 1 + # Collect all data from consecutive API requests. This could be sped up by parallelizing requests. + data_all = [] + for page in tqdm(range(1, total_requests + 1)): + response = session.get(url=api_url, json={"page": page, "per_page": api_records_per_request}) + new_data = json.loads(response.content)["data"] + if len(new_data) == 0: + print("No more data to fetch.") + break + data_all.extend(new_data) + sleep(time_between_requests) + + return data_all + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"climate_watch/{SNAPSHOT_VERSION}/emissions_by_sector.gz") + + # Fetch Climate Watch data from API. + data = fetch_all_data_from_api() + + # Save data as a compressed temporary file. + with tempfile.TemporaryDirectory() as temp_dir: + output_file = Path(temp_dir) / "data.json.gz" + + with gzip.open(output_file, "wt", encoding="UTF-8") as _output_file: + json.dump(data, _output_file) + + # Add file to DVC and upload to S3. + snap.create_snapshot(filename=output_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/covid/2024-11-05/get_stats.py b/snapshots/covid/2024-11-05/get_stats.py index 74efe7be8e9..91cb17920a2 100644 --- a/snapshots/covid/2024-11-05/get_stats.py +++ b/snapshots/covid/2024-11-05/get_stats.py @@ -1,20 +1,10 @@ -# issues - -# issue_id, author_name, author_login, date_created -# issue.id, issue.user.name, issue.user.login, issue.created_at - - -# comments - -# comment_id, date_created, date_updated, user_id, issue_id -# comment.id, comment.created_at, comment.updated_at, user_id, issue.id - - -# users - -# user_id, user_login, user_name, user_location -# user.id, user.login, user.name, user.location +""" +issues: list of all issues, including PRs. +pr: list of PRs (redundant with `issues`). +issues_comments: list of comments on issues. +pr_comments: list of comments on PRs. These are not regular comments, but comments on code (e.g. review comments).
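+users: list of GitHub users appearing in the issues, PRs, and commits. +commits: list of commits, with per-commit stats on files and lines changed.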
+""" from datetime import datetime from typing import Optional @@ -30,30 +20,45 @@ # FLAGS EXECUTE_ISSUES = False EXECUTE_PRS = False -EXECUTE_COMMIT = True +EXECUTE_COMMIT = False +SKIP_COMMITS = 15_400 # 10_700 -def get_repo(repo_name: str, access_token: Optional[str] = None) -> github.Repository.Repository: +def get_repo( + repo_name: str, access_token: Optional[str] = None, per_page: Optional[int] = None +) -> github.Repository.Repository: """Get repository.""" if not access_token: assert config.OWIDBOT_ACCESS_TOKEN, "OWIDBOT_ACCESS_TOKEN is not set" access_token = config.OWIDBOT_ACCESS_TOKEN auth = Auth.Token(access_token) - g = Github(auth=auth) + if per_page: + g = Github(auth=auth, per_page=per_page) + else: + g = Github(auth=auth) return g.get_repo(f"owid/{repo_name}") def process_issue(issue_or_pr, users): """Function to process each issue and its comments.""" + is_pr = "pull/" in issue_or_pr.html_url + user = issue_or_pr.user issue_or_pr_data = { "issue_id": issue_or_pr.number, - "author_name": issue_or_pr.user.name, - "author_login": issue_or_pr.user.login, + "author_name": user.name, + "author_login": user.login, "date_created": issue_or_pr.created_at.strftime("%Y-%m-%d %H:%M:%S"), - "is_pr": "pull/" in issue_or_pr.html_url, + "is_pr": is_pr, } issue_or_pr_comments = [] + if user.id not in users: + users[user.id] = { + "user_login": user.login, + "user_name": user.name, + "user_location": user.location, + } + for comment in issue_or_pr.get_comments(): user = comment.user issue_or_pr_comments.append( @@ -63,6 +68,7 @@ def process_issue(issue_or_pr, users): "date_updated": comment.updated_at.strftime("%Y-%m-%d %H:%M:%S"), "user_id": user.id, "issue_id": issue_or_pr.number, + "is_pr": is_pr, } ) @@ -178,3 +184,90 @@ def process_issue(issue_or_pr, users): rand = str(datetime.now().strftime("%Y%m%d%H%M%S")) pd.DataFrame(commits).to_csv(f"gh_stats/commits/total-commits-{rand}.csv", index=False) pd.DataFrame(users).T.reset_index().to_csv(f"gh_stats/commits/total-users-commits-{rand}.csv", index=False) + + +if SKIP_COMMITS != 0: + PER_PAGE = 100 + repo = get_repo("covid-19-data", access_token=config.GITHUB_TOKEN, per_page=PER_PAGE) + + # Initialize lists (we will store output data here) + commits = [] + users = {} + + # Get commits + commits_raw = repo.get_commits() + total_commits = commits_raw.totalCount # Total number of commits for progress tracking + + # Calculate the starting page + start_page = (SKIP_COMMITS // PER_PAGE) + 1 + end_page = (total_commits // PER_PAGE) + 1 + + # Initialize a list to store commits from the 101st onward + commits = [] + + # Fetch commits from the 101st onward + for page in range(start_page, end_page): # Adjust the range as needed + print(f"> Progress: {page}/{end_page} commit pages processed ({PER_PAGE * page} commits)") + commit_page = repo.get_commits().get_page(page) + if not commit_page: + break # Stop if there are no more commits + # Retrieve relevant data (several API calls) + for i, c in enumerate(commit_page): + if i % 10 == 0: + print(f">> Progress: {i}/{PER_PAGE} commits processed") + + user = c.committer + stats = c.stats + + commit_raw = { + "sha": c.sha, + "date": c.commit.author.date.strftime("%Y-%m-%d %H:%M:%S"), + "files_changed": len(c.files), + "lines_changed": stats.total, + "lines_deleted": stats.deletions, + "lines_added": stats.additions, + } + + if user is None: + commit_raw["user_id"] = c.commit.author.email + else: + commit_raw["user_id"] = user.id + + commits.append(commit_raw) + # Add user + if user is None: + if 
c.commit.author.email not in users: + users[c.commit.author.email] = { + "user_login": None, + "user_name": c.commit.author.name, + "user_location": None, + } + else: + # print(user) + if user.id not in users: + try: + users[user.id] = { + "user_login": user.login, + "user_name": user.name, + "user_location": user.location, + } + except Exception: + users[user.id] = { + "user_login": user.login, + "user_name": None, + "user_location": None, + } + + if (i != 0) and (i % 50 == 0): + # Export + print(f"Exporting {i}...") + rand = str(datetime.now().strftime("%Y%m%d%H%M%S")) + pd.DataFrame(commits).to_csv(f"gh_stats/commits/{PER_PAGE * page}-{i}-commits-{rand}.csv", index=False) + pd.DataFrame(users).T.reset_index().to_csv( + f"gh_stats/commits/{PER_PAGE * page}-{i}-users-commits-{rand}.csv", index=False + ) + + # Export + rand = str(datetime.now().strftime("%Y%m%d%H%M%S")) + pd.DataFrame(commits).to_csv(f"gh_stats/commits/total-commits-{rand}.csv", index=False) + pd.DataFrame(users).T.reset_index().to_csv(f"gh_stats/commits/total-users-commits-{rand}.csv", index=False) diff --git a/snapshots/covid/2024-11-05/get_vax_reporting.py b/snapshots/covid/2024-11-05/get_vax_reporting.py new file mode 100644 index 00000000000..c06c2cdf624 --- /dev/null +++ b/snapshots/covid/2024-11-05/get_vax_reporting.py @@ -0,0 +1,149 @@ +""" +Get dates of first reporting for each country. + +python snapshots/covid/2024-11-05/get_vax_reporting.py +""" + +import os +import re +from io import StringIO +from typing import Optional + +import github +import github.PullRequest +import github.Repository +import pandas as pd +from github import Auth, Github + +from etl import config + + +def get_repo( + repo_name: str, access_token: Optional[str] = None, per_page: Optional[int] = None +) -> github.Repository.Repository: + """Get repository.""" + if not access_token: + assert config.OWIDBOT_ACCESS_TOKEN, "OWIDBOT_ACCESS_TOKEN is not set" + access_token = config.OWIDBOT_ACCESS_TOKEN + auth = Auth.Token(access_token) + if per_page: + g = Github(auth=auth, per_page=per_page) + else: + g = Github(auth=auth) + return g.get_repo(f"owid/{repo_name}") + + +def get_country_file_paths(repo, folder_path): + files = [] + contents = repo.get_contents(folder_path) + while contents: + file_content = contents.pop(0) + if file_content.type == "dir": + contents.extend(repo.get_contents(file_content.path)) + else: + files.append(file_content.path) + return files + + +def get_country_file_paths_old(repo, folder_path): + country_files = set() + commits = repo.get_commits(path=folder_path) + + def is_country_file(path): + base_path = "scripts/scripts/vaccinations/output" + return re.match(rf"^{base_path}/[^/]+\.csv$", path) + + for commit in commits: + # Get the details of the commit + commit_details = repo.get_commit(commit.sha) + for file in commit_details.files: + if file.status == "removed" and file.filename.startswith(folder_path): + country_files.add(file.filename) + break + + country_files = {c for c in country_files if is_country_file(c)} + return country_files + + +def get_initial_version_of_file(repo, file_path): + num_retries = 10 + commits = [] + for i in range(num_retries): + commits = repo.get_commits(path=file_path) + if commits.totalCount == 0: + print(">> No commits found, retrying...") + continue + # return None + else: + break + # The last commit in the list is the initial commit + initial_commit = list(commits)[-1] + # Retrieve the file content at the initial commit + try: + csv_content = repo.get_contents(file_path,
ref=initial_commit.sha) + csv_content = csv_content.decoded_content.decode("utf-8") + df = pd.read_csv(StringIO(csv_content)) + try: + date_reported = initial_commit.commit.author.date + date_reported = date_reported.strftime("%Y-%m-%d") + except Exception: + date_reported = None + + return { + "commit": initial_commit.sha, + "date_first_value": df["date"].min(), + "date_first_reported": date_reported, + } + except Exception as e: + print(f"Error retrieving {file_path} at commit {initial_commit.sha}: {e}") + return None + + +def combine_files_now_and_old(files, files_old): + """Combine list of countries (keep old if available, otherwise new).""" + file_dix = {} + for file in files_old: + key = os.path.basename(file) + file_dix[key] = file + for file in files: + key = os.path.basename(file) + if key not in file_dix: + file_dix[key] = file + files = list(file_dix.values()) + return files + + +# Get repository +repo = get_repo("covid-19-data", access_token=config.GITHUB_TOKEN) + +# Get country file paths +# path_vax = "scripts/output/vaccinations/main_data" +# path_vax_old = "scripts/scripts/vaccinations/output" +path_vax = "public/data/vaccinations/country_data" + +files = get_country_file_paths(repo, path_vax) +# files_old = get_country_file_paths_old(repo, path_vax_old) + +# Get files +# files = combine_files_now_and_old(files, files_old) + +###################################################### +# GET FIRST FILE VERSIONS +###################################################### +data = [] +for i, file in enumerate(files): + print(f"> {file}") + data_ = get_initial_version_of_file(repo, file) + if data_ is not None: + data_["country"] = file + data.append(data_) + + if i % 10 == 0: + print(f">> {i} files processed") + df = pd.DataFrame(data) + df.to_csv(f"first_reporting_dates-{i}.csv", index=False) + + +# Create DataFrame +df = pd.DataFrame(data) +df.to_csv("first_reporting_dates.csv", index=False) diff --git a/snapshots/covid/2024-11-05/github_stats.py b/snapshots/covid/2024-11-05/github_stats.py index d12eba07cac..083bb3fc779 100644 --- a/snapshots/covid/2024-11-05/github_stats.py +++ b/snapshots/covid/2024-11-05/github_stats.py @@ -8,16 +8,20 @@ If you want to retrieve this data again, please look at the script `get_stats.py` in the same folder. You can simply execute it. To run different parts of it, set the variables EXECUTE_ISSUES, EXECUTE_PRS, and EXECUTE_COMMIT at the top of the script.
- python snapshots/covid/2024-11-05/github_stats.py \ - --issues gh_stats/issues-20241104000000.csv \ - --issues-comments gh_stats/comments-issues-20241104000000.csv \ - --issues-users gh_stats/users-issues-20241104000000.csv \ - --pr gh_stats/prs-20241105104652.csv \ - --pr-comments gh_stats/comments-prs-20241105104652.csv \ - --pr-users gh_stats/users-prs-20241105104652.csv \ - --commits gh_stats/commits/8800-commits-20241105165504.csv \ - --commits-users gh_stats/commits/8800-users-commits-20241105165504.csv +Run this snapshot script as: + python snapshots/covid/2024-11-05/github_stats.py \ + --issues gh_stats/issues-20241106211832.csv \ + --issues-comments gh_stats/comments-issues-20241106211832.csv \ + --issues-users gh_stats/users-issues-20241106211832.csv \ + --pr gh_stats/prs-20241106220603.csv \ + --pr-comments gh_stats/comments-prs-20241106220603.csv \ + --pr-users gh_stats/users-prs-20241106220603.csv \ + --commits gh_stats/commits/10800-commits-20241105182054.csv \ + --commits-users gh_stats/commits/10800-users-commits-20241105182054.csv \ + --vax-reporting first_reporting_dates.csv + +NOTE: To get data on when countries first reported vaccination data, please refer to the get_vax_reporting.py script. """ from pathlib import Path @@ -44,6 +48,7 @@ @click.option("--pr-comments", type=str, help="File with data on PR comments.") @click.option("--pr-users", type=str, help="File with data on users that commented in PRs.") @click.option("--commits", type=str, help="File with data on commits.") +@click.option("--vax-reporting", type=str, help="File with data on reporting of vaccination data.") @click.option("--commits-users", type=str, help="File with data on commit users.") def main( upload: bool, @@ -55,6 +60,7 @@ def main( pr_users: Optional[str] = None, commits: Optional[str] = None, commits_users: Optional[str] = None, + vax_reporting: Optional[str] = None, ) -> None: snapshot_paths = [ (issues, "github_stats_issues.csv"), @@ -65,6 +71,7 @@ def main( (pr_users, "github_stats_pr_users.csv"), (commits, "github_stats_commits.csv"), (commits_users, "github_stats_commits_users.csv"), + (vax_reporting, "github_stats_vax_reporting.csv"), ] for paths in snapshot_paths: diff --git a/snapshots/covid/2024-11-05/github_stats_commits.csv.dvc b/snapshots/covid/2024-11-05/github_stats_commits.csv.dvc index 303806e4300..eb8beb1c240 100644 --- a/snapshots/covid/2024-11-05/github_stats_commits.csv.dvc +++ b/snapshots/covid/2024-11-05/github_stats_commits.csv.dvc @@ -29,6 +29,6 @@ meta: url: https://creativecommons.org/licenses/by/4.0/ outs: - - md5: 45236ca93183af7d057bb47afe37d715 - size: 725503 + - md5: ce91ff3e587e41f605ec41df1f6f8b15 + size: 2607241 path: github_stats_commits.csv diff --git a/snapshots/covid/2024-11-05/github_stats_commits_users.csv.dvc b/snapshots/covid/2024-11-05/github_stats_commits_users.csv.dvc index d9020f310d7..3cd2c576ee9 100644 --- a/snapshots/covid/2024-11-05/github_stats_commits_users.csv.dvc +++ b/snapshots/covid/2024-11-05/github_stats_commits_users.csv.dvc @@ -29,6 +29,6 @@ meta: url: https://creativecommons.org/licenses/by/4.0/ outs: - - md5: ada1edd0b7fb873af437894bd7d13779 - size: 301 + - md5: 96c9a52959b8edaf984b317fecf0a35f + size: 1741 path: github_stats_commits_users.csv diff --git a/snapshots/covid/2024-11-05/github_stats_issues.csv.dvc b/snapshots/covid/2024-11-05/github_stats_issues.csv.dvc index d89eec2ce40..821e8c5a268 100644 --- a/snapshots/covid/2024-11-05/github_stats_issues.csv.dvc +++ b/snapshots/covid/2024-11-05/github_stats_issues.csv.dvc @@ -29,6
+29,6 @@ meta: url: https://creativecommons.org/licenses/by/4.0/ outs: - - md5: 8d8a60f7da5dfadfc22e9f76e79ef29f - size: 109219 + - md5: 438798050b69796d0f73271cdab1837e + size: 123116 path: github_stats_issues.csv diff --git a/snapshots/covid/2024-11-05/github_stats_issues_users.csv.dvc b/snapshots/covid/2024-11-05/github_stats_issues_users.csv.dvc index 79055164066..5942ee64158 100644 --- a/snapshots/covid/2024-11-05/github_stats_issues_users.csv.dvc +++ b/snapshots/covid/2024-11-05/github_stats_issues_users.csv.dvc @@ -29,6 +29,6 @@ meta: url: https://creativecommons.org/licenses/by/4.0/ outs: - - md5: 305bd6080fdf3a0ea6ac89d1e4fdf681 - size: 14719 + - md5: cd763a16ae80aa391f2f82b4572ba272 + size: 26019 path: github_stats_issues_users.csv diff --git a/snapshots/covid/2024-11-05/github_stats_pr_users.csv.dvc b/snapshots/covid/2024-11-05/github_stats_pr_users.csv.dvc index 1d1da07e658..1d02c491a20 100644 --- a/snapshots/covid/2024-11-05/github_stats_pr_users.csv.dvc +++ b/snapshots/covid/2024-11-05/github_stats_pr_users.csv.dvc @@ -29,6 +29,6 @@ meta: url: https://creativecommons.org/licenses/by/4.0/ outs: - - md5: 6592cf7306251d5a094d478ab7df58b6 - size: 803 + - md5: 93433b55f115e7ca67f749725a88672a + size: 7226 path: github_stats_pr_users.csv diff --git a/snapshots/covid/2024-11-05/github_stats_vax_reporting.csv.dvc b/snapshots/covid/2024-11-05/github_stats_vax_reporting.csv.dvc new file mode 100644 index 00000000000..cac801c4285 --- /dev/null +++ b/snapshots/covid/2024-11-05/github_stats_vax_reporting.csv.dvc @@ -0,0 +1,33 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: GitHub stats on owid/covid-19-data repository + description: |- + During the global COVID-19 pandemic, Our World in Data had a key role in collecting, disseminating, and publishing various indicators. These included figures on vaccinations, testing, confirmed deaths and cases, and more. + + This work was done in a GitHub repository, in the public eye. Our World in Data received numerous contributions from other actors who voluntarily helped collect all the data. Without their contribution, creating the dataset wouldn't have been possible. + title_snapshot: "GitHub stats on owid/covid-19-data repository: Vaccination reporting dates" + description_snapshot: |- + This snapshot contains the list of dates when OWID first obtained data for each country in the owid/covid-19-data GitHub repository. + + date_published: 2024-11-13 + + # Citation + producer: Our World in Data + citation_full: |- + Our World in Data. (2024). COVID-19 Data. GitHub repository. https://github.com/owid/covid-19-data. + + # Files + url_main: https://github.com/owid/covid-19-data + date_accessed: 2024-11-13 + + # License + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ +outs: + - md5: c5db7b864c4df639b6f66d595b0a0c6d + size: 25667 + path: github_stats_vax_reporting.csv diff --git a/snapshots/covid/latest/cases_deaths.csv.dvc b/snapshots/covid/latest/cases_deaths.csv.dvc index de4dbd4aa93..247507293dc 100644 --- a/snapshots/covid/latest/cases_deaths.csv.dvc +++ b/snapshots/covid/latest/cases_deaths.csv.dvc @@ -12,22 +12,22 @@ meta: All data represent date of reporting as opposed to date of symptom onset. All data are subject to continuous verification and may change based on retrospective updates to accurately reflect trends, changes in country case definitions and/or reporting practices.
Significant data errors detected or reported to WHO may be corrected at more frequent intervals. - New case and death counts from the Region of the Americas + **New case and death counts from the Region of the Americas** Starting from the week commencing on 11 September 2023, the source of the data from the Region of the Americas was switched to the aggregated national surveillances, received through the COVID-19, Influenza, RSV and Other Respiratory Viruses program in the Americas. Data have been included retrospectively since 31 July 2023. - Rates + **Rates** <0.001 per 100,000 population may be rounded to 0. citation_full: 'WHO COVID-19 Dashboard. Geneva: World Health Organization, 2020. Available online: https://covid19.who.int/' attribution_short: WHO version_producer: WHO COVID-19 Dashboard - Daily cases and deaths url_main: https://covid19.who.int/ - url_download: https://covid19.who.int/WHO-COVID-19-global-data.csv - date_accessed: 2024-11-04 + url_download: https://srhdpeuwpubsa.blob.core.windows.net/whdh/COVID/WHO-COVID-19-global-daily-data.csv + date_accessed: 2024-12-11 date_published: '2024-07-07' license: name: CC BY 4.0 url: https://data.who.int/dashboards/covid19/ outs: - - md5: b7dacf0c7f6240e37a0404724cac65c2 - size: 2827211 + - md5: 16914ffd0a8531ef26e28bc0578eb491 + size: 19539571 path: cases_deaths.csv diff --git a/snapshots/covid/latest/cases_deaths.py b/snapshots/covid/latest/cases_deaths.py index 43b5bb47803..ba96124a716 100644 --- a/snapshots/covid/latest/cases_deaths.py +++ b/snapshots/covid/latest/cases_deaths.py @@ -1,4 +1,13 @@ -"""Script to create a snapshot of dataset.""" +"""Script to create a snapshot of dataset. + +As of 2024-11-29, the WHO reports three files for cases & deaths: + +- [NEW] Daily frequency reporting of new COVID-19 cases and deaths by date reported to WHO: Mostly weekly data, but occasionally daily data (especially past data). +- Weekly COVID-19 cases and deaths by date reported to WHO: Reports weekly values. This is what we have been using since we switched from JHU to WHO. +- Latest reported counts of COVID-19 cases and deaths: Reports latest values (only latest date is available) + + +""" from datetime import date from pathlib import Path diff --git a/snapshots/demography/2024-11-26/multiple_births.7z.dvc b/snapshots/demography/2024-11-26/multiple_births.7z.dvc new file mode 100644 index 00000000000..aec8d447b87 --- /dev/null +++ b/snapshots/demography/2024-11-26/multiple_births.7z.dvc @@ -0,0 +1,63 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Human Multiple Births Database + description: |- + The frequency of twin births has significantly increased in developed countries, doubling since the 1970s (see Figure 1). Two main factors have contributed to this development, namely the rise in the use of medically assisted reproduction techniques, as well as a substantial increase in the mean age at childbearing (Pison et al., 2015). This 'boom' in the birth of twins constitutes a public health challenge, in that twins tend to have frailer health than singletons, at least during their early years. Compared to singletons, twins have lower birth weight, they tend to be born prematurely, and the deliveries are more complicated, all of which can potentially lead to long-term health problems. It is therefore important to understand better the causes of the increase in the twinning rate, as well as the variations across countries. 
+ + The Human Multiple Births Database (HMBD) gathers the number of twin births and the twinning rates for countries with reliable statistics. The database also provides statistics on other multiple births (i.e., triplets, quadruplets, etc.) whenever possible. Although their frequency has increased even more than that of twins, they still constitute a minority, as most multiple deliveries involve twins. A detailed description of the HMBD is available in this article (DOI: 10.4054/DemRes.2023.48.4). + date_published: "2024-09-30" + version_producer: v.1 + + # Citation + producer: Human Multiple Births Database + citation_full: |- + Human Multiple Births Database (2024). French Institute for Demographic Studies - INED (distributor). Extracted from: https://www.twinbirths.org (26/11/2024). + + A detailed description of the HMBD is available in: + Torres, C., Caporali, A., & Pison, G. (2023). The Human Multiple Births Database (HMBD). Demographic Research, 48, 89–106. https://doi.org/10.4054/demres.2023.48.4 + + Country-level sources: + - Australia: Australian Bureau of Statistics (https://www.abs.gov.au/) + - Austria: Statistics Austria (https://www.statistik.at/web_en/statistics/index.html) + - Canada: Statistics Canada (https://www.statcan.gc.ca/) + - Chile: Instituto Nacional de Estadísticas (http://www.ine.cl/) + - Czech Republic: Czech Statistical Office (https://www.czso.cz/csu/czso/home) + - Denmark: Statistics Denmark (https://www.dst.dk/en) + - Finland: Statistics Finland (https://www.stat.fi/index_en.html) + - France: INSEE (https://www.insee.fr/fr/accueil) + - Germany: Statistisches Bundesamt (https://www.destatis.de/EN/Home/_node.html) + - Greece: Hellenic Statistical Authority (https://www.statistics.gr/en/home/) + - Iceland: Statistics Iceland (https://www.statice.is/) + - Italy: ISTAT and Ministero della Salute (https://www.istat.it/en/, https://www.salute.gov.it/portale/home.html) + - Japan: Ministry of Health, Labour and Welfare (https://www.e-stat.go.jp/en) + - Lithuania: Statistics Lithuania (https://www.stat.gov.lt/en/) + - New Zealand: Statistics New Zealand (https://www.stats.govt.nz/) + - Netherlands: Statistics Netherlands (https://www.cbs.nl/en-gb) + - Norway: Statistics Norway (https://www.ssb.no/en) + - South Korea: Statistics Korea (http://kostat.go.kr/portal/eng/index.action) + - Spain: Instituto Nacional de Estadística (https://www.ine.es/en/index.htm) + - Sweden: Statistics Sweden (https://www.scb.se/en/) + - Switzerland: Federal Statistical Office (https://www.bfs.admin.ch/bfs/en/home.html) + - United States: Centers for Disease Control and Prevention - National Center for Health Statistics (https://www.cdc.gov/nchs/index.htm) + - UK - England and Wales: Office for National Statistics (https://www.ons.gov.uk/) + - UK - Scotland: National Records of Scotland (https://www.nrscotland.gov.uk/) + - Uruguay: Instituto Nacional de Estadística (https://www.gub.uy/ministerio-salud-publica/home) + + attribution_short: HMBD + # Files + url_main: https://www.twinbirths.org/en/data-metadata/ + url_download: https://www.twinbirths.org/fichier/s_rubrique/30699/hmbd_pooled_data_30.09.2024.7z + date_accessed: 2024-11-26 + + # License + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + +outs: + - md5: 14a9122b39cb033b646c44a915644d63 + size: 69023 + path: multiple_births.7z diff --git a/snapshots/demography/2024-11-26/multiple_births.py b/snapshots/demography/2024-11-26/multiple_births.py new file mode 100644 index 00000000000..51fa8c688d2 ---
/dev/null +++ b/snapshots/demography/2024-11-26/multiple_births.py @@ -0,0 +1,24 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"demography/{SNAPSHOT_VERSION}/multiple_births.7z") + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/ember/2024-05-08/yearly_electricity.csv.dvc b/snapshots/ember/2024-05-08/yearly_electricity.csv.dvc index a043c574ba7..6b3e81662af 100644 --- a/snapshots/ember/2024-05-08/yearly_electricity.csv.dvc +++ b/snapshots/ember/2024-05-08/yearly_electricity.csv.dvc @@ -7,7 +7,7 @@ meta: This dataset contains yearly electricity generation, capacity, emissions, import and demand data for over 200 geographies. - You can find more about Ember's methodology in [this document](https://ember-climate.org/app/uploads/2022/07/Ember-Electricity-Data-Methodology.pdf). + You can find more about Ember's methodology in [this document](https://storage.googleapis.com/emb-prod-bkt-publicdata/public-downloads/ember_electricity_data_methodology.pdf). # Citation producer: Ember @@ -19,7 +19,7 @@ meta: # Files url_main: https://ember-climate.org/data-catalogue/yearly-electricity-data/ - # url_download: https://ember-climate.org/app/uploads/2022/07/yearly_full_release_long_format.csv + # url_download: https://storage.googleapis.com/emb-prod-bkt-publicdata/public-downloads/yearly_full_release_long_format.csv date_accessed: 2024-05-08 # License diff --git a/snapshots/ember/2024-05-08/yearly_electricity.py b/snapshots/ember/2024-05-08/yearly_electricity.py index a7f5261ecba..79695afd76b 100644 --- a/snapshots/ember/2024-05-08/yearly_electricity.py +++ b/snapshots/ember/2024-05-08/yearly_electricity.py @@ -18,6 +18,8 @@ # TODO: Temporarily using a local file. Fetch data directly using the yearly electricity data url after next update. # The download url should still be the same: # https://ember-climate.org/app/uploads/2022/07/yearly_full_release_long_format.csv +# NOTE: This link seems to have changed now to: +# https://storage.googleapis.com/emb-prod-bkt-publicdata/public-downloads/yearly_full_release_long_format.csv ######################################################################################################################## diff --git a/snapshots/ember/2024-11-20/european_wholesale_electricity_prices.csv.dvc b/snapshots/ember/2024-11-20/european_wholesale_electricity_prices.csv.dvc new file mode 100644 index 00000000000..ba348bf3808 --- /dev/null +++ b/snapshots/ember/2024-11-20/european_wholesale_electricity_prices.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: European Wholesale Electricity Price Data + description: |- + Wholesale day-ahead electricity price data for European countries. + date_published: "2024-11-20" + title_snapshot: European Wholesale Electricity Price Data - Monthly + + # Citation + producer: Ember + citation_full: |- + Ember - European Wholesale Electricity Price Data. Based on data from European Network of Transmission System Operators (ENTSO-E). 
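The multiple_births.py script added above is the standard snapshot boilerplate seen throughout this PR: a click entry point exposing an --upload/--skip-upload flag, a Snapshot keyed by the directory-derived SNAPSHOT_VERSION, and a single create_snapshot() call that downloads the file, registers it with DVC, and uploads it. A typical local invocation, assuming the usual development workflow of skipping the upload (the flag is defined in the script; the workflow itself is an assumption):

    python snapshots/demography/2024-11-26/multiple_births.py --skip-upload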
+ + # Files + url_main: https://ember-energy.org/data/european-wholesale-electricity-price-data/ + url_download: https://storage.googleapis.com/emb-prod-bkt-publicdata/public-downloads/european_wholesale_electricity_price_data_monthly.csv + date_accessed: 2024-11-20 + + # License + license: + name: CC BY 4.0 + url: https://ember-energy.org/creative-commons/ + +outs: + - md5: ce52ff862b464953c93b1f04531d0db5 + size: 94675 + path: european_wholesale_electricity_prices.csv diff --git a/snapshots/ember/2024-11-20/european_wholesale_electricity_prices.py b/snapshots/ember/2024-11-20/european_wholesale_electricity_prices.py new file mode 100644 index 00000000000..2f52db1b75d --- /dev/null +++ b/snapshots/ember/2024-11-20/european_wholesale_electricity_prices.py @@ -0,0 +1,24 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"ember/{SNAPSHOT_VERSION}/european_wholesale_electricity_prices.csv") + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/emissions/2023-10-24/emission_factors.xlsx.dvc b/snapshots/emissions/2023-10-24/emission_factors.xlsx.dvc index cd242dc46f2..b45df33dc35 100644 --- a/snapshots/emissions/2023-10-24/emission_factors.xlsx.dvc +++ b/snapshots/emissions/2023-10-24/emission_factors.xlsx.dvc @@ -5,7 +5,7 @@ meta: # Data product / Snapshot title: Emission Factor Database description: |- - The Intergovernmental Panel on Climate Change (IPCC) Emission Factor Database (EFDB) is a library of emission factors and parameters that can be used for estimation of national greenhouse gas emissions/removals. For more details, see [the User Manual](https://www.ipccnggip.iges.or.jp/EFDB/documents/EFDB_User_Manual.pdf). + The Intergovernmental Panel on Climate Change (IPCC) Emission Factor Database (EFDB) is a library of emission factors and parameters that can be used for estimation of national greenhouse gas emissions/removals. For more details, see [the User Manual](https://www.ipcc-nggip.iges.or.jp/EFDB/documents/EFDB_User_Manual.pdf). date_published: 2023-10-24 # Citation diff --git a/snapshots/emissions/2024-11-21/national_contributions.py b/snapshots/emissions/2024-11-21/national_contributions.py new file mode 100644 index 00000000000..5519cd0edb7 --- /dev/null +++ b/snapshots/emissions/2024-11-21/national_contributions.py @@ -0,0 +1,109 @@ +"""Script to create a snapshot of dataset National contributions to climate change (Jones et al.). + +NOTE: All metadata fields are automatically updated by this script. However, the dataset description may change a bit +(for example they may cite more recent papers). Visually inspect the dataset description and manually make small +modifications, if needed. + +""" + +from datetime import datetime +from pathlib import Path +from typing import Dict + +import click +import requests +from bs4 import BeautifulSoup + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + +# Names of data files to snapshot. 
+DATA_FILES = [ + "annual_emissions.csv", + "cumulative_emissions.csv", + "temperature_response.csv", +] + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + for data_file in DATA_FILES: + # Create a new snapshot. + snap = Snapshot(f"emissions/{SNAPSHOT_VERSION}/national_contributions_{data_file}") + + # Extract the updated metadata fields from the record's main page. + extracted_fields = extract_metadata_from_main_page(snap) + + for field in extracted_fields: + # Replace metadata fields with the new extracted fields. + setattr(snap.metadata.origin, field, extracted_fields[field]) + + # Rewrite metadata to dvc file. + snap.metadata_path.write_text(snap.metadata.to_yaml()) + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +def extract_metadata_from_main_page(snap: Snapshot) -> Dict[str, str]: + """Extract the updated metadata fields (publication date, producer version, download URL and full citation) from the main page.""" + # Get the full HTML content of the main page. + response = requests.get(snap.metadata.origin.url_main) # type: ignore + + # The "latest" url redirects to the newest record, whose final URL is needed to build other fields. + response_final = response.url + + # Parse the HTML content of the main page. + soup = BeautifulSoup(response.content, "html.parser") + + # Extract the publication date, which is given in one of the first sentences as in, e.g. "Published March 19, 2024". + date_published_str = [line.split("Published")[1].strip() for line in soup.text.split("\n") if "Published" in line][ + 0 + ] + + # Convert to ISO format. + date_published = datetime.strptime(date_published_str, "%B %d, %Y").strftime("%Y-%m-%d") + + # Extract the version of the data producer. + version_producer = [line.split("| Version ")[1].strip() for line in soup.text.split("\n") if "| Version " in line][ + 0 + ] + + # The download links have the years hardcoded in the url, so we need to update them. + file_name = snap.metadata.origin.url_download.split("/")[-1] # type: ignore + # Assume that the latest year reported in the data is one year before the current version. + # NOTE: This is tricky, it may not work on the next update. + file_name_new = file_name.split("-")[0] + "-" + str(int(version_producer.split(".")[0]) - 1) + ".csv" + # Create the new download url (using the final URL of the latest record, and the updated year in the file name). + url_download = response_final + "/files/" + file_name_new + + # The full citation is not included in the HTML and is fetched from an API. + response_citation = requests.get( + response_final.replace("records/", "api/records/") + "?style=chicago-fullnote-bibliography", + headers={"Accept": "text/x-bibliography"}, + ) + + # Extract the full citation. + citation_full = response_citation.text + + # Gather all extracted fields. + extracted_fields = { + "date_published": date_published, + "version_producer": version_producer, + "url_download": url_download, + "citation_full": citation_full, + } + + return extracted_fields + + +if __name__ == "__main__": + main() diff --git a/snapshots/emissions/2024-11-21/national_contributions_annual_emissions.csv.dvc b/snapshots/emissions/2024-11-21/national_contributions_annual_emissions.csv.dvc new file mode 100644 index 00000000000..3546665332f --- /dev/null +++ b/snapshots/emissions/2024-11-21/national_contributions_annual_emissions.csv.dvc @@ -0,0 +1,35 @@ +meta: + origin: + producer: Jones et al.
+ title: National contributions to climate change + description: |- + A dataset describing the global warming response to national emissions CO₂, CH₄ and N₂O from fossil and land use sources. + + National contributions to climate change due to historical emissions of carbon dioxide, methane and nitrous oxide. + + This dataset describes the global warming response to national emissions CO₂, CH₄ and N₂O from fossil and land use sources since 1851. + + National CO₂ emissions data are collated from the Global Carbon Project (Andrew and Peters, 2023; Friedlingstein et al., 2023). + + National CH₄ and N₂O emissions data are collated from PRIMAP-hist (HISTTP) (Gütschow et al., 2023). + + A time series of cumulative CO₂-equivalent emissions is constructed for each country, gas, and emissions source (fossil or land use). Emissions of CH₄ and N₂O are related to cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach, with best-estimates of the coefficients taken from the IPCC AR6 (Forster et al., 2021). + + Warming in response to cumulative CO₂-equivalent emissions is estimated using the transient climate response to cumulative carbon emissions (TCRE) approach, with best-estimate value of TCRE taken from the IPCC AR6 (Forster et al., 2021, Canadell et al., 2021). 'Warming' is specifically the change in global mean surface temperature (GMST). + + The data files provide emissions, cumulative emissions and the GMST response by country, gas (CO₂, CH₄, N₂O or 3-GHG total) and source (fossil emissions, land use emissions or the total). + title_snapshot: National contributions to climate change - Annual emissions + citation_full: |- + Jones, Matthew W., Glen P. Peters, Thomas Gasser, Robbie M. Andrew, Clemens Schwingshackl, Johannes Gütschow, Richard A. Houghton, Pierre Friedlingstein, Julia Pongratz, and Corinne Le Quéré. “National Contributions to Climate Change Due to Historical Emissions of Carbon Dioxide, Methane and Nitrous Oxide”. Scientific Data. Zenodo, November 13, 2024. https://doi.org/10.5281/zenodo.14054503. + version_producer: '2024.2' + url_main: https://zenodo.org/records/7636699/latest + url_download: https://zenodo.org/records/14054503/files/EMISSIONS_ANNUAL_1830-2023.csv + date_accessed: '2024-11-21' + date_published: '2024-11-13' + license: + name: CC BY 4.0 + url: https://zenodo.org/records/7636699/latest +outs: + - md5: 89896fd652e79bfc6c1ffc8cc506077f + size: 26055255 + path: national_contributions_annual_emissions.csv diff --git a/snapshots/emissions/2024-11-21/national_contributions_cumulative_emissions.csv.dvc b/snapshots/emissions/2024-11-21/national_contributions_cumulative_emissions.csv.dvc new file mode 100644 index 00000000000..f2a79b74d3c --- /dev/null +++ b/snapshots/emissions/2024-11-21/national_contributions_cumulative_emissions.csv.dvc @@ -0,0 +1,35 @@ +meta: + origin: + producer: Jones et al. + title: National contributions to climate change + description: |- + A dataset describing the global warming response to national emissions CO₂, CH₄ and N₂O from fossil and land use sources. + + National contributions to climate change due to historical emissions of carbon dioxide, methane and nitrous oxide. + + This dataset describes the global warming response to national emissions CO₂, CH₄ and N₂O from fossil and land use sources since 1851. + + National CO₂ emissions data are collated from the Global Carbon Project (Andrew and Peters, 2023; Friedlingstein et al., 2023).
+ + National CH₄ and N₂O emissions data are collated from PRIMAP-hist (HISTTP) (Gütschow et al., 2023). + + A time series of cumulative CO₂-equivalent emissions is constructed for each country, gas, and emissions source (fossil or land use). Emissions of CH₄ and N₂O are related to cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach, with best-estimates of the coefficients taken from the IPCC AR6 (Forster et al., 2021). + + Warming in response to cumulative CO₂-equivalent emissions is estimated using the transient climate response to cumulative carbon emissions (TCRE) approach, with best-estimate value of TCRE taken from the IPCC AR6 (Forster et al., 2021, Canadell et al., 2021). 'Warming' is specifically the change in global mean surface temperature (GMST). + + The data files provide emissions, cumulative emissions and the GMST response by country, gas (CO₂, CH₄, N₂O or 3-GHG total) and source (fossil emissions, land use emissions or the total). + title_snapshot: National contributions to climate change - Cumulative emissions + citation_full: |- + Jones, Matthew W., Glen P. Peters, Thomas Gasser, Robbie M. Andrew, Clemens Schwingshackl, Johannes Gütschow, Richard A. Houghton, Pierre Friedlingstein, Julia Pongratz, and Corinne Le Quéré. “National Contributions to Climate Change Due to Historical Emissions of Carbon Dioxide, Methane and Nitrous Oxide”. Scientific Data. Zenodo, November 13, 2024. https://doi.org/10.5281/zenodo.14054503. + version_producer: '2024.2' + url_main: https://zenodo.org/records/7636699/latest + url_download: https://zenodo.org/records/14054503/files/EMISSIONS_CUMULATIVE_CO2e100_1851-2023.csv + date_accessed: '2024-11-21' + date_published: '2024-11-13' + license: + name: CC BY 4.0 + url: https://zenodo.org/records/7636699/latest +outs: + - md5: 0234dd69545cdda94dd3df41bafbcc00 + size: 33316386 + path: national_contributions_cumulative_emissions.csv diff --git a/snapshots/emissions/2024-11-21/national_contributions_temperature_response.csv.dvc b/snapshots/emissions/2024-11-21/national_contributions_temperature_response.csv.dvc new file mode 100644 index 00000000000..e784b3a375c --- /dev/null +++ b/snapshots/emissions/2024-11-21/national_contributions_temperature_response.csv.dvc @@ -0,0 +1,35 @@ +meta: + origin: + producer: Jones et al. + title: National contributions to climate change + description: |- + A dataset describing the global warming response to national emissions CO₂, CH₄ and N₂O from fossil and land use sources. + + National contributions to climate change due to historical emissions of carbon dioxide, methane and nitrous oxide. + + This dataset describes the global warming response to national emissions CO₂, CH₄ and N₂O from fossil and land use sources since 1851. + + National CO₂ emissions data are collated from the Global Carbon Project (Andrew and Peters, 2023; Friedlingstein et al., 2023). + + National CH₄ and N₂O emissions data are collated from PRIMAP-hist (HISTTP) (Gütschow et al., 2023). + + A time series of cumulative CO₂-equivalent emissions is constructed for each country, gas, and emissions source (fossil or land use). Emissions of CH₄ and N₂O are related to cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach, with best-estimates of the coefficients taken from the IPCC AR6 (Forster et al., 2021).
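The GWP*-plus-TCRE methodology repeated in these three descriptions boils down to two linear steps: convert CH₄ and N₂O emissions into cumulative CO₂-equivalents, then scale cumulative CO₂e by the TCRE to obtain a GMST change. A minimal sketch of the second step, with an assumed AR6-style best estimate for TCRE (roughly 0.45°C per 1000 Gt CO₂; the exact coefficients live in Jones et al.'s own code, not in these metadata files):

# Illustrative only: the TCRE warming response described above.
# The TCRE value is an assumption based on the IPCC AR6 best estimate.
TCRE_C_PER_GT_CO2 = 0.45 / 1000  # °C of GMST change per Gt of cumulative CO2e


def gmst_response(cumulative_co2e_gt: float) -> float:
    # Warming scales linearly with cumulative CO2-equivalent emissions.
    return TCRE_C_PER_GT_CO2 * cumulative_co2e_gt


print(gmst_response(100))  # 100 Gt CO2e -> ~0.045 °C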
+ + Warming in response to cumulative CO₂-equivalent emissions is estimated using the transient climate response to cumulative carbon emissions (TCRE) approach, with best-estimate value of TCRE taken from the IPCC AR6 (Forster et al., 2021, Canadell et al., 2021). 'Warming' is specifically the change in global mean surface temperature (GMST). + + The data files provide emissions, cumulative emissions and the GMST response by country, gas (CO₂, CH₄, N₂O or 3-GHG total) and source (fossil emissions, land use emissions or the total). + title_snapshot: National contributions to climate change - Temperature response + citation_full: |- + Jones, Matthew W., Glen P. Peters, Thomas Gasser, Robbie M. Andrew, Clemens Schwingshackl, Johannes Gütschow, Richard A. Houghton, Pierre Friedlingstein, Julia Pongratz, and Corinne Le Quéré. “National Contributions to Climate Change Due to Historical Emissions of Carbon Dioxide, Methane and Nitrous Oxide”. Scientific Data. Zenodo, November 13, 2024. https://doi.org/10.5281/zenodo.14054503. + version_producer: '2024.2' + url_main: https://zenodo.org/records/7636699/latest + url_download: https://zenodo.org/records/14054503/files/GMST_response_1851-2023.csv + date_accessed: '2024-11-21' + date_published: '2024-11-13' + license: + name: CC BY 4.0 + url: https://zenodo.org/records/7636699/latest +outs: + - md5: 654dd70760d0892a4f3d2588f820ca88 + size: 28813693 + path: national_contributions_temperature_response.csv diff --git a/snapshots/eurostat/2024-11-05/gas_and_electricity_prices.py b/snapshots/eurostat/2024-11-05/gas_and_electricity_prices.py new file mode 100644 index 00000000000..b0bf9e6d6af --- /dev/null +++ b/snapshots/eurostat/2024-11-05/gas_and_electricity_prices.py @@ -0,0 +1,90 @@ +"""Script to create a snapshot of dataset.""" + +import zipfile +from pathlib import Path + +import click +import requests +from tqdm.auto import tqdm + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + +# Base URL for Eurostat API energy data. +BASE_URL = "https://ec.europa.eu/eurostat/api/dissemination/sdmx/2.1/data/" + +# List of dataset codes to download. 
+URL_DOWNLOADS = [ + #################################################################################################################### + # Energy statistics - natural gas and electricity prices (from 2007 onwards) (nrg_pc) + # Gas prices for household consumers - bi-annual data (from 2007 onwards) (nrg_pc_202) + "nrg_pc_202", + # Gas prices for non-household consumers - bi-annual data (from 2007 onwards) (nrg_pc_203) + "nrg_pc_203", + # Electricity prices for household consumers - bi-annual data (from 2007 onwards) (nrg_pc_204) + "nrg_pc_204", + # Electricity prices for non-household consumers - bi-annual data (from 2007 onwards) (nrg_pc_205) + "nrg_pc_205", + # Household consumption volumes of gas by consumption bands (nrg_pc_202_v) + "nrg_pc_202_v", + # Non-household consumption volumes of gas by consumption bands (nrg_pc_203_v) + "nrg_pc_203_v", + # Household consumption volumes of electricity by consumption bands (nrg_pc_204_v) + "nrg_pc_204_v", + # Non-household consumption volumes of electricity by consumption bands (nrg_pc_205_v) + "nrg_pc_205_v", + # Gas prices components for household consumers - annual data (nrg_pc_202_c) + "nrg_pc_202_c", + # Gas prices components for non-household consumers - annual data (nrg_pc_203_c) + "nrg_pc_203_c", + # Electricity prices components for household consumers - annual data (from 2007 onwards) (nrg_pc_204_c) + "nrg_pc_204_c", + # Electricity prices components for non-household consumers - annual data (from 2007 onwards) (nrg_pc_205_c) + "nrg_pc_205_c", + # Share for transmission and distribution in the network cost for gas and electricity - annual data (nrg_pc_206) + "nrg_pc_206", + #################################################################################################################### + # Energy statistics - natural gas and electricity prices (until 2007) (nrg_pc_h) + # Gas prices for domestic consumers - bi-annual data (until 2007) (nrg_pc_202_h) + "nrg_pc_202_h", + # Gas prices for industrial consumers - bi-annual data (until 2007) (nrg_pc_203_h) + "nrg_pc_203_h", + # Electricity prices for domestic consumers - bi-annual data (until 2007) (nrg_pc_204_h) + "nrg_pc_204_h", + # Electricity prices for industrial consumers - bi-annual data (until 2007) (nrg_pc_205_h) + "nrg_pc_205_h", + # Electricity - marker prices - bi-annual data (until 2007) (nrg_pc_206_h) + "nrg_pc_206_h", + #################################################################################################################### +] +# Further API parameters to download each file. +URL_SUFFIX = "?format=TSV&compressed=false" + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"eurostat/{SNAPSHOT_VERSION}/gas_and_electricity_prices.zip") + + # Ensure output snapshot folder exists, otherwise create it. + snap.path.parent.mkdir(exist_ok=True, parents=True) + + # Create the ZIP file at the snapshot path. + with zipfile.ZipFile(snap.path, "w") as zip_file: + # Fetch all relevant datasets from the Eurostat API. + for code in tqdm(URL_DOWNLOADS): + # Request the data file for the current dataset. + response = requests.get(f"{BASE_URL}{code}{URL_SUFFIX}") + # Save each file inside the ZIP file. + file_name = f"{code}.tsv" + zip_file.writestr(file_name, response.text) + + # Create snapshot and upload to R2.
+ snap.create_snapshot(upload=upload, filename=snap.path) + + +if __name__ == "__main__": + main() diff --git a/snapshots/eurostat/2024-11-05/gas_and_electricity_prices.zip.dvc b/snapshots/eurostat/2024-11-05/gas_and_electricity_prices.zip.dvc new file mode 100644 index 00000000000..96d2eb2963a --- /dev/null +++ b/snapshots/eurostat/2024-11-05/gas_and_electricity_prices.zip.dvc @@ -0,0 +1,26 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Energy statistics, prices of natural gas and electricity + date_published: "2024-11-04" + + # Citation + producer: Eurostat + citation_full: |- + Eurostat - Energy statistics, prices of natural gas and electricity (2024). + + # Files + url_main: https://ec.europa.eu/eurostat/web/energy/database + date_accessed: 2024-11-05 + + # License + license: + name: CC BY 4.0 + url: https://ec.europa.eu/eurostat/web/main/help/copyright-notice + +outs: + - md5: 285787159fb3f598db8c0fb0ba67eb2c + size: 8286547 + path: gas_and_electricity_prices.zip diff --git a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc index 6c6fbad82e8..29102377237 100644 --- a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc +++ b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc @@ -13,8 +13,8 @@ meta: HMD provides an online STMF visualization toolkit (https://mpidr.shinyapps.io/stmortality). url: https://www.mortality.org/Data/STMF source_data_url: https://www.mortality.org/File/GetDocument/Public/STMF/Outputs/stmf.csv - date_accessed: 2024-11-04 - publication_date: 2024-09-30 + date_accessed: 2024-12-11 + publication_date: 2024-11-11 publication_year: 2024 published_by: |- HMD. Human Mortality Database. Max Planck Institute for Demographic Research (Germany), University of California, Berkeley (USA), and French Institute for Demographic Studies (France). Available at www.mortality.org. @@ -33,6 +33,6 @@ meta: name: Creative Commons BY 4.0 url: https://www.mortality.org/Data/UserAgreement outs: - - md5: 63069a522999934ab7a3285bfba1c59e - size: 21148259 + - md5: bd5792a20fbb75f8ae5f55a6abafcca5 + size: 21258242 path: hmd_stmf.csv diff --git a/snapshots/excess_mortality/latest/wmd.csv.dvc b/snapshots/excess_mortality/latest/wmd.csv.dvc index 214eb08ace0..d7b4d86e4fd 100644 --- a/snapshots/excess_mortality/latest/wmd.csv.dvc +++ b/snapshots/excess_mortality/latest/wmd.csv.dvc @@ -13,7 +13,7 @@ meta: Published paper available at https://elifesciences.org/articles/69336. url: https://github.com/akarlinsky/world_mortality/ source_data_url: https://raw.githubusercontent.com/akarlinsky/world_mortality/main/world_mortality.csv - date_accessed: 2024-11-04 + date_accessed: 2024-12-11 publication_date: '2021-06-30' publication_year: 2021 published_by: |- @@ -33,6 +33,6 @@ meta: name: MIT License url: https://github.com/akarlinsky/world_mortality/blob/main/LICENSE outs: - - md5: e5b41b01bccad6ac31ac3629ee089077 - size: 1078035 + - md5: cab03dff0de45a45aae54fe9772c4666 + size: 1087717 path: wmd.csv diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc index 3e557e2c823..91a48ea6e6d 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc @@ -7,7 +7,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. 
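For context on the Eurostat snapshot above: each dataset code in URL_DOWNLOADS ends up as a <code>.tsv member inside the single ZIP written at snap.path. A minimal sketch of reading one member back out (the dataset code nrg_pc_204 is taken from the script; the local file path is an assumption, and Eurostat's TSV layout typically packs the dimension codes into a comma-separated first column that needs further splitting):

import zipfile

import pandas as pd

# Open the snapshot ZIP produced by the script and load one TSV member.
with zipfile.ZipFile("gas_and_electricity_prices.zip") as zf:
    with zf.open("nrg_pc_204.tsv") as f:  # household electricity prices
        df = pd.read_csv(f, sep="\t")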
url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-per-year.csv - date_accessed: 2024-11-04 + date_accessed: 2024-12-11 publication_date: '2021-06-30' publication_year: 2021 published_by: |- @@ -22,6 +22,6 @@ meta: url: https://github.com/dkobak/excess-mortality/blob/main/LICENSE access_notes: Contains data by age. outs: - - md5: a23769cd99a7f951d07cbe486355976e + - md5: 4f3aceb8263897e9a0aa899eb83a1982 size: 381804 path: xm_karlinsky_kobak.csv diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc index 821c68fb9e1..4571161e0bd 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc @@ -6,7 +6,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-stmf.csv - date_accessed: 2024-11-04 + date_accessed: 2024-12-11 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/fasttrack/2022-11-01/lighting_efficiency_uk.csv.dvc b/snapshots/fasttrack/2022-11-01/lighting_efficiency_uk.csv.dvc index a8571c72af8..40805443ffa 100644 --- a/snapshots/fasttrack/2022-11-01/lighting_efficiency_uk.csv.dvc +++ b/snapshots/fasttrack/2022-11-01/lighting_efficiency_uk.csv.dvc @@ -1,18 +1,18 @@ meta: - namespace: fasttrack - short_name: lighting_efficiency_uk - file_extension: csv - date_accessed: 2023-01-20 + source: + name: |- + Fouquet & Pearson (2006). Seven centuries of energy services: The price and use of light in the United Kingdom (1300-2000). + url: https://www.jstor.org/stable/23296980 + source_data_url: |- + gAAAAABnRYZlXxepMAB9sIKn_qv_qmZn_zxaLMC7r6W0Pm4yDGaffkjMBQa1ovpzqTVnmQFcclmueGVS_TPk9_ethfFaKeAVFo7yRlvM-T5Vn56IJP1ZDGZzGF1pZQtvwvsWCYq7O_ZzRq9mGI439QHZ5s-0oewSvJFcpvI_Nc1EqVdsO-Hyb3hKhvCdXuR5Sg_E0gtagMAeh0KT8kAYTIxfAVtj29fY5yVUXXQU76CEkSRtenAVfSYjMD7hwik8ZFaG-fS-XRrP + date_accessed: '2024-11-26' + published_by: |- + Fouquet, R., & Pearson, P. J. (2006). Seven centuries of energy services: The price and use of light in the United Kingdom (1300-2000). The energy journal, 27(1). 
name: Lighting efficiency and shares in the UK description: '' - source_name: Google Sheet - url: https://www.jstor.org/stable/23296980 - source_published_by: Google Sheet - source_data_url: gAAAAABjykepDM9qSPmcia30Y6XfRfCAr-IVVU4MKQ-5uronVcInRAbQENcbYoFR7vrbbSMRsNqulOO1uRMVdAZ3lQIdTl_fQNWcRFmxC1V8nujs-6pa5aXPNDAmm4mOtKD_d--xITDJEdrwIFi-xCKtZmx7oLm8b-sbG2b1WEvUHKbJAIARIgw1PEA_7KQEhiKmbnpU-SjalCi-9PAeMKUDtsQhxAYSBtkOUwXt7_HCq5UBYPvqA_N8fUw5pb7MDILs0bScwGzY + license: {} is_public: false - version: '2022-11-01' -wdir: ../../../data/snapshots/fasttrack/2022-11-01 outs: -- md5: 7e2d4e77b9c23946c89c518df591a009 - size: 747 - path: lighting_efficiency_uk.csv + - md5: 8de6c6871078ece444a6c927db29ad48 + size: 679 + path: lighting_efficiency_uk.csv diff --git a/snapshots/fasttrack/2023-01-19/food_expenditures_by_country.csv.dvc b/snapshots/fasttrack/2023-01-19/food_expenditures_by_country.csv.dvc deleted file mode 100644 index e0179559e5b..00000000000 --- a/snapshots/fasttrack/2023-01-19/food_expenditures_by_country.csv.dvc +++ /dev/null @@ -1,18 +0,0 @@ -meta: - namespace: fasttrack - short_name: food_expenditures_by_country - file_extension: csv - date_accessed: 2023-01-20 - name: Food expenditures by country (USDA, 2023) - description: '' - source_name: Google Sheet - url: https://www.ers.usda.gov/topics/international-markets-u-s-trade/international-consumer-and-food-industry-trends/#data - source_published_by: Google Sheet - source_data_url: gAAAAABjyoIXcZhrrY4oOnHOyLS8RohUA72-X3Z1ZHFU7A_Vmg60iZNiXhYuAi99P34Qs5wXDznB_BK-KyzpkEgsV9mGZ5miuZGV0RPrTrIrHdqBDozjzbbYdUE5N_yzDQFtHTcsD_2cIiqrqby4ZJrM2xRnegMQ5kQvMD8tdajOp9438_qAgKpz5ejRVYabGGuL88EXxtOds8UsvHhaPqwMb9_xe7GXVdEGRNIsEzzSJqZHFkCkj_R-Vzg2lHP1tgSLVeULyb2i - is_public: false - version: '2023-01-19' -wdir: ../../../data/snapshots/fasttrack/2023-01-19 -outs: -- md5: b03947c40e7c56fa98f08b06fea563e4 - size: 6497 - path: food_expenditures_by_country.csv diff --git a/snapshots/fasttrack/2023-03-27/global_warming_contributions.csv.dvc b/snapshots/fasttrack/2023-03-27/global_warming_contributions.csv.dvc index 596a41eeb39..574f274fa94 100644 --- a/snapshots/fasttrack/2023-03-27/global_warming_contributions.csv.dvc +++ b/snapshots/fasttrack/2023-03-27/global_warming_contributions.csv.dvc @@ -1,8 +1,13 @@ meta: - namespace: fasttrack - short_name: global_warming_contributions - file_extension: csv - date_accessed: 2023-03-29 + source: + name: Matthew Jones et al. (2023) + url: https://doi.org/10.5281/zenodo.7636699 + source_data_url: |- + gAAAAABnRYaj4DTXeJP0ir8ExcuaJRXjM2guld5knTOnakD5MVRDT9gg0A5PS-3i584pjGjV-RpP-30PBWfwi7DcqsY2qcDyjtdELFAz_0YeiJPZBPjb6SgpO5xGPJq5HBzJugTrPm15GuJ0ntAZGz3mh13Ev2eNs4AdpXcM5R2V-z9Ru3PdHs-ZFIpHVXUOD3zw5cGmm5mlCNybYfiqvFEPuhSgCP5-aWk2wZdj9Nv6rFUK6g2F1IKygoH1DUS5Gk4TciIBKcaT + date_accessed: '2024-11-26' + publication_year: 2023 + published_by: |- + Jones, Matthew W., Peters, Glen P., Gasser, Thomas, Andrew, Robbie M., Schwingshackl, Clemens, Gütschow, Johannes, Houghton, Richard A., Friedlingstein, Pierre, Pongratz, Julia, & Le Quéré, Corinne. (2023). National contributions to climate change due to historical emissions of carbon dioxide, methane and nitrous oxide [Data set]. In Scientific Data (2023.1). name: Global warming contributions description: |- Jones et al. (2023) quantify national and regional contributions to the increase of global mean surface temperature over the last few centuries.
As they detail: the "dataset describing the global warming response to national emissions CO2, CH4 and N2O from fossil and land use sources during 1851-2021. @@ -14,15 +19,9 @@ meta: We construct a time series of cumulative CO2-equivalent emissions for each country, gas, and emissions source (fossil or land use). Emissions of CH4 and N2O emissions are related to cumulative CO2-equivalent emissions using the Global Warming Potential (GWP*) approach, with best-estimates of the coefficients taken from the IPCC AR6 (Forster et al., 2021). Warming in response to cumulative CO2-equivalent emissions is estimated using the transient climate response to cumulative carbon emissions (TCRE) approach, with best-estimate value of TCRE taken from the IPCC AR6 (Forster et al., 2021, Canadell et al., 2021). 'Warming' is specifically the change in global mean surface temperature (GMST)." - source_name: Google Sheet - url: https://doi.org/10.5281/zenodo.7636699 - source_published_by: Google Sheet - source_data_url: gAAAAABkJA7WvCo-f3u_JMGUF9cid_2fGu1DIt-m-5KWm8yczckrjMqCtatgoUgl0ndutC89gy_T0x7nP0SLoOLFwT9QPPBnHhk1q4NVdXRQmQGqtMUZ9fpI3ihqHUGi536Kh2baRyUgXWQFBG_3QPLq0gLEMs2o9zYNx_e0ZxsXrNrIsTCjtopQEeV4Wwc_h2dk_5l4i-MYFbNSbqs6hXMMmUAKRMIKmwxQG8pKnjgAoYSQZclY581EqJrFjRKnmFGSXbLnc6i4 + license: {} is_public: false - version: '2023-03-27' - publication_year: 2023 -wdir: ../../../data/snapshots/fasttrack/2023-03-27 outs: -- md5: 48f8ff1455c82c193d7e5ec1cbc58fb7 - size: 27352636 - path: global_warming_contributions.csv + - md5: 237c2f9f1ec5ffc19978b1f40d36b028 + size: 19377131 + path: global_warming_contributions.csv diff --git a/snapshots/fasttrack/2023-04-30/paratz.csv.dvc b/snapshots/fasttrack/2023-04-30/paratz.csv.dvc index d6cef55f5b9..cdab99d5e09 100644 --- a/snapshots/fasttrack/2023-04-30/paratz.csv.dvc +++ b/snapshots/fasttrack/2023-04-30/paratz.csv.dvc @@ -1,34 +1,19 @@ meta: - name: A systematic review of global autopsy rates in all-cause mortality and young - sudden death, Paratz et al (2023) - description: >- - The data for this indicator is taken from: Paratz ED, Rowe SJ, Stub D, Pflaumer - A, La Gerche A. A systematic review of - global autopsy rates in all-cause mortality and young sudden death. Heart Rhythm. - 2023 Apr;20(4):607-613. doi: 10.1016/j.hrthm.2023.01.008. - - - The data is collated from a number of published papers and databases. The year - shown reflects the date given in the database - or the year of the publication. For Spain and Australia the data is only representative - of a region of each country, Catalonia - and Victoria, respectively. source: - name: Google Sheet - description: + name: Paratz et al., (2023) url: https://www.heartrhythmjournal.com/article/S1547-5271(23)00027-9/fulltext - source_data_url: + source_data_url: |- https://docs.google.com/spreadsheets/d/e/2PACX-1vROXfyuCq1_aNHxIkLGOhbi9xZmVHFvPltm44VU__skPYh4Gn9ES8oLcgI8okIF7D4Sts_gjD9568_O/pub?output=csv - owid_data_url: - date_accessed: '2023-07-13' - publication_date: + date_accessed: '2024-11-26' publication_year: 2023 - published_by: Google Sheet - license: - name: - url: -wdir: ../../../data/snapshots/fasttrack/2023-04-30 + published_by: Heart Rhythm Journal + name: A systematic review of global autopsy rates in all-cause mortality and young sudden death, Paratz et al (2023) + description: |- + The data for this indicator is taken from: Paratz ED, Rowe SJ, Stub D, Pflaumer A, La Gerche A. A systematic review of global autopsy rates in all-cause mortality and young sudden death. Heart Rhythm. 
2023 Apr;20(4):607-613. doi: 10.1016/j.hrthm.2023.01.008. + + The data is collated from a number of published papers and databases. The year shown reflects the date given in the database or the year of the publication. For Spain and Australia the data is only representative of a region of each country, Catalonia and Victoria, respectively. + license: {} outs: -- md5: 14f4576ea63fda9388db9f2c3f596990 - size: 1110 - path: paratz.csv + - md5: 14f4576ea63fda9388db9f2c3f596990 + size: 1110 + path: paratz.csv diff --git a/snapshots/fasttrack/2023-05-03/apms_2014.csv.dvc b/snapshots/fasttrack/2023-05-03/apms_2014.csv.dvc index f73e679bdc0..e9c64b5d942 100644 --- a/snapshots/fasttrack/2023-05-03/apms_2014.csv.dvc +++ b/snapshots/fasttrack/2023-05-03/apms_2014.csv.dvc @@ -1,26 +1,21 @@ meta: - namespace: fasttrack - short_name: apms_2014 - file_extension: csv - date_accessed: 2023-05-03 + source: + name: Adult Psychiatric Morbidity Survey 2014, England (2016) + url: https://www.gov.uk/government/statistics/adult-psychiatric-morbidity-survey-mental-health-and-wellbeing-england-2014 + source_data_url: |- + gAAAAABnRYjm-BZi3Nqs7In2gpZhuG03kWo5SiR_xJvXwncMzCmm--o4Z8FRFP55gIjTYun0AkKER23bHIUepBVraLszkW61lTi6JF3H7H0letw5MoLXJDe4S8UEwvkcq5h37pnxrcSJnzImSFrTQOuHF8l2kjJAF1QscQkYd_LGafvd9rWKkokdPPVLBf7iopl1gdm1gu7njbwKkauTdZyUbskGamH5fblFVs2LZv_GOGMQTv9yc4C-7dcXaRivHu-c-oU0j61D + date_accessed: '2024-11-26' + publication_year: 2016 + published_by: |- + "McManus S, Bebbington P, Jenkins R, Brugha T. (eds.) (2016) Mental health and wellbeing in England: Adult Psychiatric Morbidity Survey 2014. Leeds: NHS Digital" name: Current depression in England by age and gender (APMS, 2014) - description: This is a dataset of the prevalence of current depression in the general - population in England, living in private households. Households were sampled randomly - and individuals were interviewed using the revised Clinical Interview Schedule - (CIS-R), which is a diagnostic structured interview format to determine whether - people had common mental disorders in the past week. In this dataset, presence - of a current episode of major depression was determined. - source_name: Google Sheet - url: https://www.gov.uk/government/statistics/adult-psychiatric-morbidity-survey-mental-health-and-wellbeing-england-2014 - source_published_by: Google Sheet - source_data_url: https://docs.google.com/spreadsheets/d/e/2PACX-1vR8UNjna5w3mHijBNhbtk2vWPh_d2tYa4CrOZc2WFKeYP-ChcOCxNttPyv5izWcjnsZZv3CY2f98sd3/pub?output=csv - license_url: https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/ - license_name: Open Government Licence v3.0 + description: |- + This is a dataset of the prevalence of current depression in the general population in England, living in private households. Households were sampled randomly and individuals were interviewed using the revised Clinical Interview Schedule (CIS-R), which is a diagnostic structured interview format to determine whether people had common mental disorders in the past week. In this dataset, presence of a current episode of major depression was determined. 
+ license: + name: Open Government Licence v3.0 + url: https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/ is_public: false - version: '2023-05-03' - publication_year: 2016 -wdir: ../../../data/snapshots/fasttrack/2023-05-03 outs: -- md5: 7a42c7cee959d16ae9d981f4b8b7ade1 - size: 288 - path: apms_2014.csv + - md5: 7a42c7cee959d16ae9d981f4b8b7ade1 + size: 288 + path: apms_2014.csv diff --git a/snapshots/fasttrack/2023-05-31/cholera.csv.dvc b/snapshots/fasttrack/2023-05-31/cholera.csv.dvc index f7eaa4c7aa4..c35236fed09 100644 --- a/snapshots/fasttrack/2023-05-31/cholera.csv.dvc +++ b/snapshots/fasttrack/2023-05-31/cholera.csv.dvc @@ -1,8 +1,11 @@ meta: - namespace: fasttrack - short_name: cholera - file_extension: csv - date_accessed: 2023-06-01 + source: + name: World Health Organization (2023) + url: nan + source_data_url: |- + https://docs.google.com/spreadsheets/d/e/2PACX-1vSlmIK_QFOjnb3DSPq6AoEeVFCPhsvKBE4yiUKrOogGhyu98SqoJutDR_3CEI43nvslvYIcJjICn-qI/pub?output=csv + date_accessed: '2024-11-26' + published_by: World Health Organization name: Cholera reported cases, deaths and case fatality rate (WHO, 2023) description: |- The data is created by combining multiple WHO Weekly Epidemiological Reports for cholera reported cases, deaths and case fatality rate. @@ -18,15 +21,8 @@ meta: 2020: https://web.archive.org/web/20230326231135/http://apps.who.int/iris/bitstream/handle/10665/345271/WER9637-445-454-eng-fre.pdf?sequence=1&isAllowed=y 2021: https://web.archive.org/web/20230526223955/https://apps.who.int/iris/bitstream/handle/10665/362858/WER9737-453-464-eng-fre.pdf?sequence=1&isAllowed=y - source_name: Google Sheet - url: '' - source_published_by: Google Sheet - source_data_url: - https://docs.google.com/spreadsheets/d/e/2PACX-1vSlmIK_QFOjnb3DSPq6AoEeVFCPhsvKBE4yiUKrOogGhyu98SqoJutDR_3CEI43nvslvYIcJjICn-qI/pub?output=csv - is_public: true - version: '2023-05-31' -wdir: ../../../data/snapshots/fasttrack/2023-05-31 + license: {} outs: -- md5: 035a7f6ada3274928cffda95091de105 - size: 4449 - path: cholera.csv + - md5: 39d2f683da32e151413ab8d2dbfc7582 + size: 4141 + path: cholera.csv diff --git a/snapshots/fasttrack/2023-06-16/guinea_worm.csv.dvc b/snapshots/fasttrack/2023-06-16/guinea_worm.csv.dvc index 37627bc762e..cb7263f0844 100644 --- a/snapshots/fasttrack/2023-06-16/guinea_worm.csv.dvc +++ b/snapshots/fasttrack/2023-06-16/guinea_worm.csv.dvc @@ -1,21 +1,16 @@ meta: - namespace: fasttrack - short_name: guinea_worm - file_extension: csv - date_accessed: 2023-06-16 + source: + name: The Carter Center (2023) + url: https://www.cartercenter.org/resources/pdfs/news/health_publications/guinea_worm/guinea-worm-cases-by-year-from-1989.pdf + source_data_url: |- + https://docs.google.com/spreadsheets/d/e/2PACX-1vQRR-nQzFpJXWz6cuB9aqrt2yt2Z8BtOtu3eNkF4WvI3xuNEdinb0vCN3wIagy2hEOHiSj3xUAffHeX/pub?output=csv + date_accessed: '2024-11-26' + publication_year: 2023 + published_by: The Carter Center (2023) name: Guinea Worm Cases - Carter Center (2023) description: The number of cases of guinea worm disease worldwide since 1989 - source_name: Google Sheet - url: - https://www.cartercenter.org/resources/pdfs/news/health_publications/guinea_worm/guinea-worm-cases-by-year-from-1989.pdf - source_published_by: Google Sheet - source_data_url: - https://docs.google.com/spreadsheets/d/e/2PACX-1vQRR-nQzFpJXWz6cuB9aqrt2yt2Z8BtOtu3eNkF4WvI3xuNEdinb0vCN3wIagy2hEOHiSj3xUAffHeX/pub?output=csv - is_public: true - version: '2023-06-16' - publication_year: 2023 -wdir: 
../../../data/snapshots/fasttrack/2023-06-16 + license: {} outs: -- md5: 44af3d83549167d5edde31c323526c19 - size: 582 - path: guinea_worm.csv + - md5: 44af3d83549167d5edde31c323526c19 + size: 582 + path: guinea_worm.csv diff --git a/snapshots/fasttrack/2023-06-19/world_population_comparison.csv.dvc b/snapshots/fasttrack/2023-06-19/world_population_comparison.csv.dvc index 9e53e1e1f8a..d111694cd95 100644 --- a/snapshots/fasttrack/2023-06-19/world_population_comparison.csv.dvc +++ b/snapshots/fasttrack/2023-06-19/world_population_comparison.csv.dvc @@ -1,55 +1,31 @@ meta: + source: + name: Multiple sources compiled by Our World in Data (2019) + url: nan + source_data_url: |- + https://docs.google.com/spreadsheets/d/e/2PACX-1vQjyo0SCpkP7gW490fsxx2x0nkCPqW3elr5LfI-zbFWkb1rzOAumgJrEDO0eFpoEtPsZyHjIM58iXDe/pub?output=csv + date_accessed: '2024-12-02' + published_by: Multiple sources compiled by Our World in Data (2019) name: Historical world population comparison (various sources) - description: >- + description: |- Among others these are the original source: + McEvedy, Colin and Richard Jones, 1978, “Atlas of World Population History,” Facts on File, New York, pp. 342-351. - McEvedy, Colin and Richard Jones, 1978, “Atlas of World Population History,” Facts - on File, New York, pp. 342-351. - - - Biraben, Jean-Noel, 1980, An Essay Concerning Mankind’s Evolution, Population, - Selected Papers, December, table 2. - - - Durand, John D., 1974, “Historical Estimates of World Population: An Evaluation,” - University of Pennsylvania, Population - Center, Analytical and Technical Reports, Number 10, table 2. + Biraben, Jean-Noel, 1980, An Essay Concerning Mankind’s Evolution, Population, Selected Papers, December, table 2. + Durand, John D., 1974, “Historical Estimates of World Population: An Evaluation,” University of Pennsylvania, Population Center, Analytical and Technical Reports, Number 10, table 2. - Haub, Carl, 1995, “How Many People Have Ever Lived on Earth?” Population Today, - February, p. 5. + Haub, Carl, 1995, “How Many People Have Ever Lived on Earth?” Population Today, February, p. 5. + Thomlinson, Ralph, 1975, “Demographic Problems, Controversy Over Population Control,” Second Edition, Table 1. - Thomlinson, Ralph, 1975, “Demographic Problems, Controversy Over Population Control,” - Second Edition, Table 1. - - - United Nations, 1999, The World at Six Billion, Table 1, “World Population From” - Year 0 to Stabilization, p. 5, - + United Nations, 1999, The World at Six Billion, Table 1, “World Population From” Year 0 to Stabilization, p. 5, U.S. Census Bureau (USCB), 2012, Total Midyear Population for the World: 1950-2050. - - Michael Kremer (1993) “Population Growth and Technological Change: One Million - B.C. to 1990”, Quarterly Journal of Economics., - August 1993, pp.681-716. - source: - name: Google Sheet - description: - url: '' - source_data_url: - https://docs.google.com/spreadsheets/d/e/2PACX-1vQjyo0SCpkP7gW490fsxx2x0nkCPqW3elr5LfI-zbFWkb1rzOAumgJrEDO0eFpoEtPsZyHjIM58iXDe/pub?output=csv - owid_data_url: - date_accessed: '2023-07-03' - publication_date: - publication_year: - published_by: Google Sheet - license: - name: - url: -wdir: ../../../data/snapshots/fasttrack/2023-06-19 + Michael Kremer (1993) “Population Growth and Technological Change: One Million B.C. to 1990”, Quarterly Journal of Economics., August 1993, pp.681-716. 
+ license: {} outs: -- md5: fa998c4590ac0bdada993b0bcaf0f2a8 - size: 14087 - path: world_population_comparison.csv + - md5: f7ff2acb39ad85d3a9b2599a63175abc + size: 13424 + path: world_population_comparison.csv diff --git a/snapshots/fasttrack/2023-08-07/pain_hours_days_hen_systems.csv.dvc b/snapshots/fasttrack/2023-08-07/pain_hours_days_hen_systems.csv.dvc index 6593646b1ed..4693ae861db 100644 --- a/snapshots/fasttrack/2023-08-07/pain_hours_days_hen_systems.csv.dvc +++ b/snapshots/fasttrack/2023-08-07/pain_hours_days_hen_systems.csv.dvc @@ -1,17 +1,16 @@ meta: source: - name: Google Sheet + name: Welfare Footprint based on Schuck-Paim and Alonso (2021) url: https://welfarefootprint.org/research-projects/laying-hens/ - source_data_url: + source_data_url: |- https://docs.google.com/spreadsheets/d/e/2PACX-1vRioMTHFwUUiCe2JWDOAZD7BDur4MSx3o576xfvTROzUY75sxIYqOZuIWQw_lopGkrLQ_lgXxMRgAk2/pub?output=csv - date_accessed: '2023-08-07' + date_accessed: '2024-11-26' publication_year: 2021 - published_by: Google Sheet + published_by: Welfare Footprint name: Pain hours and days of hen systems (Welfare Footprint) description: '' license: {} -wdir: ../../../data/snapshots/fasttrack/2023-08-07 outs: -- md5: 2053ef477157266c201a9c39c46a8df8 - size: 459 - path: pain_hours_days_hen_systems.csv + - md5: 2053ef477157266c201a9c39c46a8df8 + size: 459 + path: pain_hours_days_hen_systems.csv diff --git a/snapshots/fasttrack/2023-08-21/survey_livestock_oklahoma.csv.dvc b/snapshots/fasttrack/2023-08-21/survey_livestock_oklahoma.csv.dvc index 6a0b552ce7e..4e73c3ea3ea 100644 --- a/snapshots/fasttrack/2023-08-21/survey_livestock_oklahoma.csv.dvc +++ b/snapshots/fasttrack/2023-08-21/survey_livestock_oklahoma.csv.dvc @@ -1,18 +1,16 @@ meta: source: - name: Google Sheet - url: - https://web.archive.org/web/20190806000018/http://agecon.okstate.edu/files/january%202018.pdf - source_data_url: + name: Food Demand Survey, Oklahoma State University + url: https://web.archive.org/web/20190806000018/http://agecon.okstate.edu/files/january%202018.pdf + source_data_url: |- https://docs.google.com/spreadsheets/d/e/2PACX-1vTO7prhB6yU1jGOYi4ZvkBUV3wsfGi-ua3nXOE0fB5OpNH9YTClBw0H166ZbQ4VKhvv1ee3PhuUDeaB/pub?output=csv - date_accessed: '2023-08-22' - publication_year: '2018' - published_by: Google Sheet + date_accessed: '2024-11-26' + publication_year: 2018 + published_by: Oklahoma State University, Department of Agricultural Economics name: Survey attitudes to livestock farming (Oklahoma University) description: '' license: {} -wdir: ../../../data/snapshots/fasttrack/2023-08-21 outs: -- md5: b5d54c9c99616a0f61523365746d0e81 - size: 1195 - path: survey_livestock_oklahoma.csv + - md5: b5d54c9c99616a0f61523365746d0e81 + size: 1195 + path: survey_livestock_oklahoma.csv diff --git a/snapshots/fasttrack/2023-09-29/un_space_objects.csv.dvc b/snapshots/fasttrack/2023-09-29/un_space_objects.csv.dvc index babcf82a93d..c2033a5be3b 100644 --- a/snapshots/fasttrack/2023-09-29/un_space_objects.csv.dvc +++ b/snapshots/fasttrack/2023-09-29/un_space_objects.csv.dvc @@ -1,45 +1,25 @@ meta: source: - name: Google Sheet + name: United Nations Office for Outer Space Affairs, Online Index of Objects Launched into Outer Space (2023) url: https://www.unoosa.org/oosa/osoindex/search-ng.jspx - source_data_url: + source_data_url: |- https://docs.google.com/spreadsheets/d/e/2PACX-1vS0rsoIS37xrLx_JRHO2mdsqHmVreN255IRUSP1YLEP26NnIOz3bHLz5fDmR2bnTwwEY5gttb17fZgU/pub?output=csv - date_accessed: '2023-09-29' - publication_year: '2023' - published_by: Google Sheet + 
date_accessed: '2024-11-26' + publication_year: 2023 + published_by: Online Index of Objects Launched into Outer Space (2023) name: Online Index of Objects Launched into Outer Space (UN, 2023-09-29) - description: >- - This data is compiled from the Online Index of Objects Launched into Outer Space, - maintained by the United Nations Office - for Outer Space Affairs. + description: |- + This data is compiled from the Online Index of Objects Launched into Outer Space, maintained by the United Nations Office for Outer Space Affairs. + Since 1962, the United Nations has maintained a Register of Objects Launched into Outer Space. Originally established as a mechanism to aid the United Nations Committee on the Peaceful Uses of Outer Space in its discussions on the political, legal, and technical issues concerning outer space, the evolution of international space law resulted in space object registration becoming a means of identifying which States' bear international responsibility and liability for space objects. - Since 1962, the United Nations has maintained a Register of Objects Launched into - Outer Space. Originally established - as a mechanism to aid the United Nations Committee on the Peaceful Uses of Outer - Space in its discussions on the political, - legal, and technical issues concerning outer space, the evolution of international - space law resulted in space object - registration becoming a means of identifying which States' bear international - responsibility and liability for space objects. - - - The source indicates that around 87% of all satellites, probes, landers, crewed - spacecraft, and space station flight elements - launched into Earth orbit or beyond have been registered with the Secretary-General. - + The source indicates that around 87% of all satellites, probes, landers, crewed spacecraft, and space station flight elements launched into Earth orbit or beyond have been registered with the Secretary-General. In the data shown on our charts: - - - when an object is launched by a country on behalf of another one, it is attributed - to the latter; - - - when a launch is made jointly by several countries, it is recorded in each of - these countries' time series, but only - once in the 'World' series. + - when an object is launched by a country on behalf of another one, it is attributed to the latter; + - when a launch is made jointly by several countries, it is recorded in each of these countries' time series, but only once in the 'World' series. license: {} -wdir: ../../../data/snapshots/fasttrack/2023-09-29 outs: -- md5: 44879ab7f57a44915bcd36b4b7833fd5 - size: 22610 - path: un_space_objects.csv + - md5: 44879ab7f57a44915bcd36b4b7833fd5 + size: 22610 + path: un_space_objects.csv diff --git a/snapshots/fasttrack/2023-10-05/great_pacific_garbage_lebreton.csv.dvc b/snapshots/fasttrack/2023-10-05/great_pacific_garbage_lebreton.csv.dvc index 5030249d9f8..96ec46a4fe7 100644 --- a/snapshots/fasttrack/2023-10-05/great_pacific_garbage_lebreton.csv.dvc +++ b/snapshots/fasttrack/2023-10-05/great_pacific_garbage_lebreton.csv.dvc @@ -1,17 +1,17 @@ meta: source: - name: Google Sheet + name: Plastics in Great Pacific Garbage Patch (Lebreton et al. 
2022) url: https://www.nature.com/articles/s41598-022-16529-0 - source_data_url: + source_data_url: |- https://docs.google.com/spreadsheets/d/e/2PACX-1vShK8-lh5FUIl954ziCnKzkt9N3B5IdxAxk9mh-QhufRD-SuovXXLKjbQtq8g40yzUbHIPWRVLuVzKZ/pub?output=csv - date_accessed: '2023-10-05' - publication_year: '2022' - published_by: Google Sheet + date_accessed: '2024-11-26' + publication_year: 2022 + published_by: |- + Lebreton et al. (2022). Industrialised fishing nations largely contribute to floating plastic pollution in the North Pacific subtropical gyre. Nature Scientific Reports. name: Plastics in Great Pacific Garbage Patch (Lebreton) description: '' license: {} -wdir: ../../../data/snapshots/fasttrack/2023-10-05 outs: -- md5: 96e4d3017bd50b72a8bd267198ab1971 - size: 338 - path: great_pacific_garbage_lebreton.csv + - md5: 96e4d3017bd50b72a8bd267198ab1971 + size: 338 + path: great_pacific_garbage_lebreton.csv diff --git a/snapshots/fasttrack/latest/antimicrobial_usage_livestock.csv.dvc b/snapshots/fasttrack/latest/antimicrobial_usage_livestock.csv.dvc index 2ee12981302..e904f870cd9 100644 --- a/snapshots/fasttrack/latest/antimicrobial_usage_livestock.csv.dvc +++ b/snapshots/fasttrack/latest/antimicrobial_usage_livestock.csv.dvc @@ -1,33 +1,23 @@ meta: + origin: + producer: Mulchandani et al. + title: 'Global trends in antimicrobial use in food-producing animals: 2020 to 2030' + description: |- + Data on usage of antimicrobials in food animals were collected from 42 countries. Multivariate regression models were used in combination with projections of animal counts for cattle, sheep, chicken, and pigs from the Food and Agriculture Organization to estimate global antimicrobial usage of veterinary antimicrobials in 2020 and 2030. Maps of animal densities were used to identify geographic hotspots of antimicrobial use. In each country, estimates of antimicrobial use (tonnes) were calibrated to match continental-level reports of antimicrobial use intensity (milligrams per kilogram of animal) from the World Organization for Animal Health, as well as country-level reports of antimicrobial use from countries that made this information publicly available. + citation_full: |- + Mulchandani, R., Wang, Y., Gilbert, M., & Van Boeckel, T. P. (2023). Global trends in antimicrobial use in food-producing animals: 2020 to 2030. PLOS Global Public Health, 3(2), e0001305. https://doi.org/10.1371/journal.pgph.0001305 + url_main: https://journals.plos.org/globalpublichealth/article?id=10.1371/journal.pgph.0001305 + url_download: |- + https://docs.google.com/spreadsheets/d/e/2PACX-1vT1GgT43B-J5fD0kqup2QeajeMNLtjo10An4N3OkugtbOn-Q4OIaoI5pC2hsnYroRn8UmVhcczZADHw/pub?output=csv + date_accessed: '2024-12-06' + date_published: '2023-02-01' + license: + name: Open access name: Antimicrobial usage in livestock - description: This dataset estimates the usage of antimicrobials in livestock (cattle, - sheep, chicken, and pigs) by country. Data on antimicrobials comes from government - reports, surveillance systems and national surveys. In addition, the authors estimate - the biomass of livestock in the country, to adjust for differences in antimicrobial - usage by animal size. Biomass data comes from the Food and Agriculture Organization - (FAO). 'The PCU represents the total number of animals in a country (alive or - slaughtered), multiplied by the average weight of the animal at the time of treatment. 
- Therefore, the PCU is a standardization metric that accounts for differences in - animal weight, and number of production cycles per year between countries.' Therefore, - mg/PCU refers to the usage of antimicrobials per animal population-corrected unit. - source: - name: Google Sheet - description: - url: - https://journals.plos.org/globalpublichealth/article?id=10.1371/journal.pgph.0001305 - source_data_url: - gAAAAABkx3_ZrECpU1RI6tcJPiqDvqB2RKAZEsofB7GToYg6Vsw-LANlBdEgn1IxkrXt-La91vsraylCA313XJ_3HlggZ9aLs42krkNJwtDpi8I31JdQoiZg5lPB_m9w4Kx0dab4AgKjPhsnX8Y1-YoOw0-uSgkyigp2-gdc0mBZ6XknkcRp-53G1AZamnOn-9p8kKzkuzmqYv6ISc3tPI12oPqWZSBSM1UzFW1QRmM8VwTqtE1WqAGly3iX3yIPQUzV_E5tu2vI - owid_data_url: - date_accessed: '2023-07-31' - publication_date: - publication_year: 2023 - published_by: Google Sheet - license: - name: - url: - is_public: false -wdir: ../../../data/snapshots/fasttrack/latest + description: |- + This dataset estimates the usage of antimicrobials in livestock (cattle, sheep, chicken, and pigs) by country. Data on antimicrobials comes from government reports, surveillance systems and national surveys. In addition, the authors estimate the biomass of livestock in the country, to adjust for differences in antimicrobial usage by animal size. Biomass data comes from the Food and Agriculture Organization (FAO). 'The PCU represents the total number of animals in a country (alive or slaughtered), multiplied by the average weight of the animal at the time of treatment. Therefore, the PCU is a standardization metric that accounts for differences in animal weight, and number of production cycles per year between countries.' Therefore, mg/PCU refers to the usage of antimicrobials per animal population-corrected unit. + license: {} outs: -- md5: 2e79d5ae8ead88349c544d2e7b4b5ca2 - size: 9186 - path: antimicrobial_usage_livestock.csv + - md5: e0c44fec35851446ebb61784ce6528e3 + size: 8682 + path: antimicrobial_usage_livestock.csv diff --git a/snapshots/fasttrack/latest/cumulative_lives_saved_vaccination_shattock.csv.dvc b/snapshots/fasttrack/latest/cumulative_lives_saved_vaccination_shattock.csv.dvc index 8c9c6de7258..6fee0afab30 100644 --- a/snapshots/fasttrack/latest/cumulative_lives_saved_vaccination_shattock.csv.dvc +++ b/snapshots/fasttrack/latest/cumulative_lives_saved_vaccination_shattock.csv.dvc @@ -1,14 +1,14 @@ meta: origin: producer: |- - Shattock et al. (2024). Contribution of vaccination to improved child survival: modelling 50 years of the Expanded Programme on Immunization. + Shattock et al. (2024). Contribution of vaccination to improved survival and health: modelling 50 years of the Expanded Programme on Immunization. title: cumulative_lives_saved_vaccination_shattock citation_full: |- - Shattock et al. (2024). Contribution of vaccination to improved child survival: modelling 50 years of the Expanded Programme on Immunization. The Lancet. + Shattock et al. (2024). Contribution of vaccination to improved survival and health: modelling 50 years of the Expanded Programme on Immunization. The Lancet. url_main: https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)00850-X/fulltext url_download: |- https://docs.google.com/spreadsheets/d/e/2PACX-1vQHxzyufogWkSCuwEmHd6jF2c3JfqTdUY9ngwgpdwZfR5FA7JGQRzKNKhJ6hmAfGXReeAmgpHRMj8iM/pub?output=csv - date_accessed: '2024-05-03' + date_accessed: '2024-11-28' name: Cumulative lives saved from vaccinations since 1974 (Shattock et al. 2024) description: The cumulative number of lives saved thanks to vaccinations, from 1974 onwards. 
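The population-corrected unit (PCU) arithmetic in the antimicrobial-usage description above can be made concrete with a minimal Python sketch; every number below is hypothetical, chosen only to illustrate the calculation, and none of them comes from the snapshot itself.

    # PCU = number of animals (alive or slaughtered) x average weight (kg) at treatment.
    # All counts and weights below are hypothetical illustration values.
    animals = {
        "cattle": {"count": 1_000_000, "avg_weight_kg": 425},
        "pigs": {"count": 2_500_000, "avg_weight_kg": 65},
        "chicken": {"count": 40_000_000, "avg_weight_kg": 1},
    }
    pcu_kg = sum(a["count"] * a["avg_weight_kg"] for a in animals.values())

    # Hypothetical national antimicrobial use, converted from tonnes to mg.
    usage_mg = 100 * 1e9  # 100 tonnes; 1 tonne = 1e9 mg

    # mg/PCU: usage per kilogram of weight-standardized animal biomass.
    print(f"{usage_mg / pcu_kg:.1f} mg/PCU")  # ~159.4 mg/PCU for these numbers
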
license: {} diff --git a/snapshots/fasttrack/latest/democracy_freedom_house.csv.dvc b/snapshots/fasttrack/latest/democracy_freedom_house.csv.dvc index 54fb121dd16..578a16d565f 100644 --- a/snapshots/fasttrack/latest/democracy_freedom_house.csv.dvc +++ b/snapshots/fasttrack/latest/democracy_freedom_house.csv.dvc @@ -1,25 +1,20 @@ meta: source: - name: Google Sheet + name: Freedom House (2023) url: https://freedomhouse.org/report/freedom-world - source_data_url: + source_data_url: |- https://docs.google.com/spreadsheets/d/e/2PACX-1vRXOAxs_t6f-Y3kMGmKc0trQiH2PMWjOxxlHZ1uKd88g_sFMq1zvgaeGO3G15lJdSixh9dCIIqUcBHI/pub?output=csv - date_accessed: '2023-09-05' - published_by: Google Sheet + date_accessed: '2024-11-26' + published_by: Freedom House (2023). Freedom in the World. name: Democracy - Freedom House (2023) - description: >- - This dataset provides information on political regimes, using data from Freedom - House's Freedom in the World (2023). - + description: |- + This dataset provides information on political regimes, using data from Freedom House's Freedom in the World (2023). You can read a description of the data in this post: https://ourworldindata.org/democracies-measurement - - You can download the code and complete dataset, including supplementary variables, - from GitHub: https://github.com/owid/notebooks/tree/main/BastianHerre/democracy + You can download the code and complete dataset, including supplementary variables, from GitHub: https://github.com/owid/notebooks/tree/main/BastianHerre/democracy license: {} -wdir: ../../../data/snapshots/fasttrack/latest outs: -- md5: 5407777c1c32eae16bdc68a7dab2dcef - size: 629063 - path: democracy_freedom_house.csv + - md5: 8c5d63ec87bb5475b75de261d7e32a73 + size: 629913 + path: democracy_freedom_house.csv diff --git a/snapshots/fasttrack/latest/gbd_2019_mental_health_country_coverage.csv.dvc b/snapshots/fasttrack/latest/gbd_2019_mental_health_country_coverage.csv.dvc index eaf8778537e..723cdb77c01 100644 --- a/snapshots/fasttrack/latest/gbd_2019_mental_health_country_coverage.csv.dvc +++ b/snapshots/fasttrack/latest/gbd_2019_mental_health_country_coverage.csv.dvc @@ -1,26 +1,22 @@ meta: - namespace: fasttrack - short_name: gbd_2019_mental_health_country_coverage - file_extension: csv - date_accessed: 2023-05-29 + source: + name: IHME GBD (2019) + url: https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(20)30925-9/fulltext + source_data_url: |- + gAAAAABnRYlazzL_oEfuzBXSmjRVB2Y_aOmXSCAXVYIW_72BvzIYsW9GUt8OjdfLueLZUKFP-ZGq7e8B19xEzO3nPErc9S9x8w5MaFQeUEEwgZHOHjHGCjfOxrcUDXEQ-RyBwZuKmo7Ngy5dTDkh_sza4x9FOhmx5yUB25NI0TKhPhkPxrC_AIads-3sWMviypayPf_OebJnlMZUgRBEnRYqUH5sGcQTZvpi2ixly61NvwF_oU1_8FZL8iyj9mDiUvBP-kpERU3L + date_accessed: '2024-11-26' + publication_year: 2020 + published_by: |- + Vos, T., Lim, S. S., Abbafati, C., Abbas, K. M., Abbasi, M., Abbasifard, M., Abbasi-Kangevari, M., Abbastabar, H., Abd-Allah, F., Abdelalim, A., Abdollahi, M., Abdollahpour, I., Abolhassani, H., Aboyans, V., Abrams, E. M., Abreu, L. G., Abrigo, M. R. M., Abu-Raddad, L. J., Abushouk, A. I., … Murray, C. J. L. (2020). Global burden of 369 diseases and injuries in 204 countries and territories, 1990–2019: A systematic analysis for the Global Burden of Disease Study 2019. The Lancet, 396(10258), 1204–1222. name: Countries with mental health data in GBD 2019 description: |- Dataset showing the number of countries with primary data on the prevalence of mental illnesses. 
These were found after a systematic review, grey literature search and expert consultation, to identify studies with data on the prevalence of each mental illness. 'The GBD inclusion criteria stipulated that: (1) the diagnostic criteria must be from 1980 onward; (2) “caseness” must be based on clinical threshold as established by the DSM, ICD, Chinese Classification of Mental Disorders (CCMD), or diagnosed by a clinician using established tools; (3) sufficient information must be provided on study method and sample characteristics to assess the quality of the study; and (4) study samples must be representative of the general population (i.e., case studies, veterans, or refugee samples were excluded). No limitation was set on the language of publication.' - source_name: Google Sheet - url: - https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(20)30925-9/fulltext - source_published_by: Google Sheet - source_data_url: - gAAAAABkdGlgpHgXXTDAwptybyG87JbjFVjs02Q-Rzc9mvCEIbiBXzn73TAFhhJhTIRWevZe9DtC6nMtJ1Cq1aUvY-ZA7cYHUb3afdxTpm9C-vyTkjYLqSy7VlBPzYJODjHtY-78465q_2SAHLZ1LJC2UQWtGtzHzWszyyTlsspAbhEpata6QVf2_DDdwJpHdWnJBVdGqnddeqpSKtwYXmoIG-A8BX4FAp5eVmbAzuhwR2b5vavEHmTnIJPyMOVBCu4vp9sGudYb - license_url: - https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(20)30925-9/fulltext + license: + url: https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(20)30925-9/fulltext is_public: false - version: latest - publication_year: 2020 -wdir: ../../../data/snapshots/fasttrack/latest outs: -- md5: f972f0ff8f16acaf3af8c63d4e880e96 - size: 486 - path: gbd_2019_mental_health_country_coverage.csv + - md5: f972f0ff8f16acaf3af8c63d4e880e96 + size: 486 + path: gbd_2019_mental_health_country_coverage.csv diff --git a/snapshots/fasttrack/latest/global_maternal_offspring_loss.csv.dvc b/snapshots/fasttrack/latest/global_maternal_offspring_loss.csv.dvc index ef84d64a187..0aa1753a193 100644 --- a/snapshots/fasttrack/latest/global_maternal_offspring_loss.csv.dvc +++ b/snapshots/fasttrack/latest/global_maternal_offspring_loss.csv.dvc @@ -1,24 +1,18 @@ meta: source: - name: Google Sheet + name: Smith-Greenaway et al. (2021) url: https://gh.bmj.com/content/6/4/e004837.abstract - source_data_url: + source_data_url: |- https://docs.google.com/spreadsheets/d/e/2PACX-1vSiCgGTJ8kXoVPpaaskyRz79os2jbWtLbcqk18ybEXI_tsE6WPJy6DH-lXggK_VbHir456mI3D98Jbd/pub?output=csv - date_accessed: '2023-09-05' - publication_year: '2021' - published_by: Google Sheet + date_accessed: '2024-11-26' + publication_year: 2021 + published_by: |- + Global burden of maternal bereavement: indicators of the cumulative prevalence of child loss. (2021) Emily Smith-Greenaway, Diego Alburez-Gutierrez, Jenny Trinitapoli, Emilio Zagheni. name: Global maternal offspring loss - Smith-Greenaway et al. 2021 - description: This dataset shows survey data and estimates of maternal offspring - loss across countries. This includes mothers who have lost an infant, child under - 5 years old, or offspring. These are given as a rate per 1000 women in the age - group. Underlying data comes from large-scale surveys (such as the Demographic - and Health Surveys and Multiple Indicator Cluster Surveys) conducted in many low- - and middle-income countries. For countries lacking data, these are estimated using - an indirect approach that combines formal kinship models and life-table methods - in an additional 81 countries. + description: |- + This dataset shows survey data and estimates of maternal offspring loss across countries. 
This includes mothers who have lost an infant, child under 5 years old, or offspring. These are given as a rate per 1000 women in the age group. Underlying data comes from large-scale surveys (such as the Demographic and Health Surveys and Multiple Indicator Cluster Surveys) conducted in many low- and middle-income countries. For countries lacking data, these are estimated using an indirect approach that combines formal kinship models and life-table methods in an additional 81 countries. Citation: Smith-Greenaway, E., Alburez-Gutierrez, D., Trinitapoli, J., & Zagheni, E. (2021). Global burden of maternal bereavement: Indicators of the cumulative prevalence of child loss. BMJ Global Health, 6(4), e004837. https://doi.org/10.1136/bmjgh-2020-004837 license: {} -wdir: ../../../data/snapshots/fasttrack/latest outs: -- md5: 7ce7e9d8a371afa038a470f8977d160d - size: 8075 - path: global_maternal_offspring_loss.csv + - md5: 7ce7e9d8a371afa038a470f8977d160d + size: 8075 + path: global_maternal_offspring_loss.csv diff --git a/snapshots/fasttrack/latest/historical_france_mortality_cause.csv.dvc b/snapshots/fasttrack/latest/historical_france_mortality_cause.csv.dvc index 9305707e4cf..89c8a92d726 100644 --- a/snapshots/fasttrack/latest/historical_france_mortality_cause.csv.dvc +++ b/snapshots/fasttrack/latest/historical_france_mortality_cause.csv.dvc @@ -1,65 +1,36 @@ meta: source: - name: Google Sheet + name: Institut National d'Études Démographiques url: https://www.demographic-research.org/Volumes/Vol36/21/ - source_data_url: - https://docs.google.com/spreadsheets/d/e/2PACX-1vSUInsCt97X8kjDGENSZm5-xLSbE_P4MEAW4bYkGHn-KN5aVWQ4VAm3JzziskCUTkFQuWQVinWtLO_I/pub?output=csv - date_accessed: '2023-08-18' - publication_year: '2014' - published_by: Google Sheet + source_data_url: |- + gAAAAABnRYkUiIe1Lp-eC1kj1PYYYQ25kHCvpjl7pYYYL4eO5R8EtQ_r17u121vjvLUCpJ3X0LDrvoZ7IGgLK52Of2eKqxKBxWh30Ud4jSt_oSgpu6iL2FdM4M5nJbVStK7IBlb2SregUwmWCdJDn8xUpzz__p-Ly3mpfWJfa1i11msoZyXQVbTlcGbq5Rz9GEYok3B9elGj9tKPOFgqprbt4_UgHuYSUGPyn9fXCX9DBkRhC52YfXIpd_G625e2UxdS1-TK06j4 + date_accessed: '2024-11-26' + publication_year: 2014 + published_by: Jacques Vallin and France Meslé name: Database on causes of death in France from 1925 to 1999 - description: >- - Dataset on mortality rates from each cause of death category in France between - 1925 and 1999. The underlying data for - this chart comes from the Institut National d'Études Démographiques, published - by Jacques Vallin and France Meslé, and - covers causes of deaths nationally in France between 1925 and 1999. Causes of - death were categorized into categories according - to the 9th edition of the International Classification of Diseases (ICD-9) manual. - Mortality rates are given for five-year - age bands, as an annual rate out of 100,000 people in that age group. Below are - the ICD codes used for each cause category: - All causes = 000*-999*, - + description: |- + Dataset on mortality rates from each cause of death category in France between 1925 and 1999. The underlying data for this chart comes from the Institut National d'Études Démographiques, published by Jacques Vallin and France Meslé, and covers causes of deaths nationally in France between 1925 and 1999. Causes of death were categorized into categories according to the 9th edition of the International Classification of Diseases (ICD-9) manual. Mortality rates are given for five-year age bands, as an annual rate out of 100,000 people in that age group. 
Below are the ICD codes used for each cause category: All causes = 000*-999*, Infectious and parasitic diseases = 001*-139*, - Neoplasms = 140*-239*, - Endocrine nutritional and metabolic diseases and immunity disorders = 240*-279*, - Diseases of the blood and blood-forming organs = 280*-289*, - Mental disorders = 290*-319*, - Diseases of the nervous system = 320*-359*, - Diseases of the sense organs = 360*-389*, - Diseases of the circulatory system = 390*-459*, - Diseases of the respiratory system = 460*-519*, - Diseases of the digestive system = 520*-579*, - Diseases of the genitourinary system = 580*-629*, - Complications of pregnancy childbirth and the puerperium = 630*-679*, - Diseases of the skin and subcutaneous tissue = 680*-709*, - Diseases of the musculoskeletal system and connective tissue = 710*-739*, - Congenital anomalies = 740*-759*, - Certain conditions originating in the perinatal period = 760*-779*, - Symptoms signs and ill-defined conditions = 780*-799*, - External causes (injury and poisoning) = 800*-999* license: {} is_public: false -wdir: ../../../data/snapshots/fasttrack/latest outs: -- md5: 856b79cf2b8b9e0ba868c7d4084c7c4a - size: 629827 - path: historical_france_mortality_cause.csv + - md5: 856b79cf2b8b9e0ba868c7d4084c7c4a + size: 629827 + path: historical_france_mortality_cause.csv diff --git a/snapshots/fasttrack/latest/infant_mortality_vaccination_shattock.csv.dvc b/snapshots/fasttrack/latest/infant_mortality_vaccination_shattock.csv.dvc index 9f6c72e61cb..c93ee0e7a2c 100644 --- a/snapshots/fasttrack/latest/infant_mortality_vaccination_shattock.csv.dvc +++ b/snapshots/fasttrack/latest/infant_mortality_vaccination_shattock.csv.dvc @@ -1,14 +1,14 @@ meta: origin: producer: |- - Shattock et al. (2024). Contribution of vaccination to improved child survival: modelling 50 years of the Expanded Programme on Immunization. + Shattock et al. (2024). Contribution of vaccination to improved survival and health: modelling 50 years of the Expanded Programme on Immunization. title: infant_mortality_vaccination_shattock citation_full: |- - Shattock et al. (2024). Contribution of vaccination to improved child survival: modelling 50 years of the Expanded Programme on Immunization. The Lancet. + Shattock et al. (2024). Contribution of vaccination to improved survival and health: modelling 50 years of the Expanded Programme on Immunization. The Lancet. url_main: https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)00850-X/fulltext url_download: |- https://docs.google.com/spreadsheets/d/e/2PACX-1vSLcIzJwPFHDM0x41c0X0h4_YNiw43W3YtVQfpoNtVKOjCZuOyyFcd0JM1O-RjCW_DASDOJkWRIfntY/pub?output=csv - date_accessed: '2024-05-03' + date_accessed: '2024-11-28' name: Infant mortality with and without vaccinations (Shattock et al, 2024) description: Estimates of the infant mortality rates with and without vital vaccinations. 
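The cause-of-death categories in the historical France mortality entry above are defined as inclusive ICD-9 code ranges; a minimal sketch of that lookup (using a subset of the ranges listed above, with a hypothetical example code):

    # Subset of the ICD-9 ranges from the cause-of-death table above.
    ICD9_RANGES = {
        "Infectious and parasitic diseases": (1, 139),
        "Neoplasms": (140, 239),
        "Diseases of the circulatory system": (390, 459),
        "External causes (injury and poisoning)": (800, 999),
    }

    def classify(code: int) -> str:
        # Return the first category whose inclusive range contains the code.
        for cause, (lo, hi) in ICD9_RANGES.items():
            if lo <= code <= hi:
                return cause
        return "Other (see the full ICD-9 table above)"

    print(classify(410))  # Diseases of the circulatory system (example code)
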
license: {} diff --git a/snapshots/fasttrack/latest/lead_paint_regulation_who.csv.dvc b/snapshots/fasttrack/latest/lead_paint_regulation_who.csv.dvc index b07698a5666..a45b6985906 100644 --- a/snapshots/fasttrack/latest/lead_paint_regulation_who.csv.dvc +++ b/snapshots/fasttrack/latest/lead_paint_regulation_who.csv.dvc @@ -1,24 +1,20 @@ meta: - namespace: fasttrack - short_name: lead_paint_regulation_who - file_extension: csv - date_accessed: 2023-05-31 + source: + name: Lead paint regulations (WHO, 2023) + url: https://www.who.int/data/gho/data/themes/topics/indicator-groups/legally-binding-controls-on-lead-paint + source_data_url: |- + gAAAAABnRY1UxjQRqqtwi3k4YRi5i2a_AmviR0K1Yq7lWpp5mQ8zmhp0sOt2x_D-aN8W2WkMf8tCvbcZ2Bi41ergSpO1KCkDtsaTcT24IcGVC16eX9TpV0AkYs4TAXg4LbY31u0XeecK9mQ7uw7nzqkQLsEbEDXTz7izPL1kz58WSwNQE8WCvzhV2kv3mddI_ycEwFGUFSypG0QXyYHreQLjP3n5CsnC2vGymzKGwdQUmiK0r2nipPcCj4vh3O6qbvSj0W_hcA5j + date_accessed: '2024-11-26' + publication_year: 2023 + published_by: World Health Organization (WHO) name: Lead paint regulations (WHO, 2023) description: |- The WHO collects data on which countries have legally-binding controls on lead paint. It sources this data from surveys conducted by WHO and UNEP of national authorities. The World Health Organization (WHO) tracks the introduction of legally-binding controls on lead concentrations in paint. Paint is a main contributor to harmful lead exposure. The stringency of controls on lead paint can vary by country. Maximum concentrations of lead can differ, and may only apply to particular types of paint (for example, products used in households). - source_name: Google Sheet - url: - https://www.who.int/data/gho/data/themes/topics/indicator-groups/legally-binding-controls-on-lead-paint - source_published_by: Google Sheet - source_data_url: - gAAAAABkdy33dfUqbIoFH4JceoCGzD0jhxgG7WWfCuXNbU3fqjWI0eFoNybQuugGLBO2bR-HGEAAMlzhGcAweillVjqC8fv2ZuiG5kSeLgbLGBbZc2AiBucQz5QwuPOh-_mMD0qMPTnsiDvHsfGp91drLXO4da3usmBrcIJbn0vv2bknY0iuDYbnyOf4HKHcbLM_9iazUXMeIvIzuvEHq3iu4QZHaTHgbP66JFYwGhM0EiP_mxGKNbart9M1j3zVkXRESjdxKTnA + license: {} is_public: false - version: latest - publication_year: 2023 -wdir: ../../../data/snapshots/fasttrack/latest outs: -- md5: 1699f03673a329af3bf555a0b4e6e8a0 - size: 2935 - path: lead_paint_regulation_who.csv + - md5: 1699f03673a329af3bf555a0b4e6e8a0 + size: 2935 + path: lead_paint_regulation_who.csv diff --git a/snapshots/fasttrack/latest/lives_saved_vaccination_who.csv.dvc b/snapshots/fasttrack/latest/lives_saved_vaccination_who.csv.dvc index aa7be1be0f4..0a344e05f02 100644 --- a/snapshots/fasttrack/latest/lives_saved_vaccination_who.csv.dvc +++ b/snapshots/fasttrack/latest/lives_saved_vaccination_who.csv.dvc @@ -1,14 +1,14 @@ meta: origin: producer: |- - Shattock et al. (2024). Contribution of vaccination to improved child survival: modelling 50 years of the Expanded Programme on Immunization. + Shattock et al. (2024). Contribution of vaccination to improved survival and health: modelling 50 years of the Expanded Programme on Immunization. title: lives_saved_vaccination_who citation_full: |- - Shattock et al. (2024). Contribution of vaccination to improved child survival: modelling 50 years of the Expanded Programme on Immunization. The Lancet. + Shattock et al. (2024). Contribution of vaccination to improved survival and health: modelling 50 years of the Expanded Programme on Immunization. The Lancet. 
url_main: https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(24)00850-X/fulltext url_download: |- https://docs.google.com/spreadsheets/d/e/2PACX-1vSP20L-mjOB7xZ73EGhXyjZSdC_CBwWZAKkCs_T4iCRgwXKNXBb0K2H8L8mI7rnLIol8MqYAyG5YMHq/pub?output=csv - date_accessed: '2024-05-03' + date_accessed: '2024-11-28' name: Lives saved by vaccinations (WHO, 2024) description: Estimates of the number of lives saved by vaccination over the last 50 years (from 1974 to 2024). license: {} diff --git a/snapshots/fasttrack/latest/mineral_prices_usgs.csv.dvc b/snapshots/fasttrack/latest/mineral_prices_usgs.csv.dvc new file mode 100644 index 00000000000..c798d8ac322 --- /dev/null +++ b/snapshots/fasttrack/latest/mineral_prices_usgs.csv.dvc @@ -0,0 +1,16 @@ +meta: + origin: + producer: United States Geological Survey (2024). National Minerals Information Center. + title: mineral_prices_usgs + citation_full: United States Geological Survey (2024). National Minerals Information Center. + url_main: https://www.usgs.gov/centers/national-minerals-information-center + url_download: |- + https://docs.google.com/spreadsheets/d/e/2PACX-1vSSKAJ9UB8Pcjg1pm10s8l64cndMbW-LdlAHVOOZ4af4X10b8Iu76HgZuRtscI0NMsBtfueHxFi_XYo/pub?output=csv + date_accessed: '2024-11-06' + name: Mineral prices (USGS, 2024) + description: '' + license: {} +outs: + - md5: 4a445a61d31803e84335ff9990405913 + size: 8969 + path: mineral_prices_usgs.csv diff --git a/snapshots/fasttrack/latest/nuclear_warhead_inventories.csv.dvc b/snapshots/fasttrack/latest/nuclear_warhead_inventories.csv.dvc index dd0dd3b4107..74c09663476 100644 --- a/snapshots/fasttrack/latest/nuclear_warhead_inventories.csv.dvc +++ b/snapshots/fasttrack/latest/nuclear_warhead_inventories.csv.dvc @@ -1,27 +1,19 @@ meta: source: - name: Google Sheet + name: Federation of American Scientists (2023) url: https://fas.org/issues/nuclear-weapons/status-world-nuclear-forces/ - source_data_url: + source_data_url: |- https://docs.google.com/spreadsheets/d/e/2PACX-1vRLseg03w2pyyqQIq9ip016J168lYsJ8BWJRqzDX8L8LqF6zm4TTyOGS7HmrwW3ZL1bj2KHXQNuhNir/pub?output=csv - date_accessed: '2023-08-22' - publication_year: .nan - published_by: Google Sheet + date_accessed: '2024-11-26' + published_by: Hans M. Kristensen, Matt Korda, Eliana Reynolds, and Robert Norris. name: Nuclear warhead inventories – Federation of American Scientists - description: >- - This dataset provides information on the nuclear warhead inventories by the nuclear - powers, using data from the Federation - of American Scientists, prepared by Hans M. Kristensen, Matt Korda, and Robert - Norris. - - - You can download the code and complete dataset, including supplementary variables, - from GitHub: https://github.com/owid/notebooks/tree/main/BastianHerre/nuclear_weapons + description: |- + This dataset provides information on the nuclear warhead inventories by the nuclear powers, using data from the Federation of American Scientists, prepared by Hans M. Kristensen, Matt Korda, and Robert Norris. 
+ You can download the code and complete dataset, including supplementary variables, from GitHub: https://github.com/owid/notebooks/tree/main/BastianHerre/nuclear_weapons Publisher source: None license: {} -wdir: ../../../data/snapshots/fasttrack/latest outs: -- md5: e1269c0f219efe73d1c408caa13c3ba8 - size: 694 - path: nuclear_warhead_inventories.csv + - md5: e1269c0f219efe73d1c408caa13c3ba8 + size: 694 + path: nuclear_warhead_inventories.csv diff --git a/snapshots/fasttrack/latest/nuclear_warhead_stockpiles.csv.dvc b/snapshots/fasttrack/latest/nuclear_warhead_stockpiles.csv.dvc index fbb3fe5c9cb..12f4d0b0114 100644 --- a/snapshots/fasttrack/latest/nuclear_warhead_stockpiles.csv.dvc +++ b/snapshots/fasttrack/latest/nuclear_warhead_stockpiles.csv.dvc @@ -1,26 +1,19 @@ meta: source: - name: Google Sheet + name: Federation of American Scientists (2023) url: https://fas.org/issues/nuclear-weapons/status-world-nuclear-forces/ - source_data_url: + source_data_url: |- https://docs.google.com/spreadsheets/d/e/2PACX-1vQ9gi-5QBEFuGclzGkfwv9NY_DavklY4DmA9-6QFG7KVFXTvuDf6fe4ETT7_e2zko4akSlAAqGduasG/pub?output=csv - date_accessed: '2023-10-16' - published_by: Google Sheet + date_accessed: '2024-11-26' + published_by: Hans M. Kristensen, Matt Korda, Eliana Reynolds, and Robert Norris. name: Nuclear warhead stockpiles – Federation of American Scientists - description: >- - This dataset provides information on the number of stockpiled nuclear warheads - by the nuclear powers, using data from - the Federation of American Scientists, prepared by Hans M. Kristensen, Matt Korda, - Eliana Reynolds, and Robert Norris. - - - You can download the code and complete dataset, including supplementary variables, - from GitHub: https://github.com/owid/notebooks/tree/main/BastianHerre/nuclear_weapons + description: |- + This dataset provides information on the number of stockpiled nuclear warheads by the nuclear powers, using data from the Federation of American Scientists, prepared by Hans M. Kristensen, Matt Korda, Eliana Reynolds, and Robert Norris. 
+ You can download the code and complete dataset, including supplementary variables, from GitHub: https://github.com/owid/notebooks/tree/main/BastianHerre/nuclear_weapons Publisher source: None license: {} -wdir: ../../../data/snapshots/fasttrack/latest outs: -- md5: 3ae0bc86faefa1734117ba84f4761adc - size: 15471 - path: nuclear_warhead_stockpiles.csv + - md5: 3ae0bc86faefa1734117ba84f4761adc + size: 15471 + path: nuclear_warhead_stockpiles.csv diff --git a/snapshots/fasttrack/latest/pain_hours_hen_systems.csv.dvc b/snapshots/fasttrack/latest/pain_hours_hen_systems.csv.dvc index 78ffe32d43b..b9989419fb6 100644 --- a/snapshots/fasttrack/latest/pain_hours_hen_systems.csv.dvc +++ b/snapshots/fasttrack/latest/pain_hours_hen_systems.csv.dvc @@ -1,23 +1,17 @@ meta: - name: Pain hours of hen systems (Welfare Footprint) - description: '' source: - name: Google Sheet - description: + name: Welfare Footprint based on Schuck-Paim and Alonso (2021) url: https://welfarefootprint.org/research-projects/laying-hens/ - source_data_url: - gAAAAABkwT3GHig9-hHZr_XHHeWuAsK5PHjtz55H-Tvu_KCW-XytOuBUvNMnrv3hZ593VSyU95XV8hLYTSto8khid8slVfs4OA7joXQ4qzAsufp7A3vltFcf9o3BbCJseT64F2CYUUgMkJ9ptK0YjR0E2VhxNWylRc5G8OCv19srGHlFBGjG7_isIrTbJimw3L_c4hmRMlUNEZMMoo9OyTNjNRKb6Ta_4bkYL4xn6hBHt-89D_6xF58zl2ZQFdtjZn69vnwBW58p - owid_data_url: - date_accessed: '2023-07-26' - publication_date: + source_data_url: |- + gAAAAABnRY4A9xP89q46o72YIhvw5MBBfzt_84o2BNljMcBv29JHUFXvVtuYbTDPVsVbX-hfm9rZ8DJ-rW8qWqW0lWbhRDNvNd_ncCTITnovArjMEeLU41PNBF1vXiAz3Exdbb4orqamK0KosiZFwIKGtyvRFrF7XOH32d6O3z8ybbYchAUPJ8fqDyHP87olHyJt21OraPoNDQZ-MmNI5oSAHeMlz6zqyaEtO6mfd1g98R0B5QG6wf4zx7ogpunekE2MdQl1bOkQ + date_accessed: '2024-11-26' publication_year: 2021 - published_by: Google Sheet - license: - name: - url: + published_by: Welfare Footprint + name: Pain hours of hen systems (Welfare Footprint) + description: '' + license: {} is_public: false -wdir: ../../../data/snapshots/fasttrack/latest outs: -- md5: 73b70fa53e689c7115055f982d9b17b5 - size: 228 - path: pain_hours_hen_systems.csv + - md5: daf4b07b53f3eb73c1725266e6e7b83c + size: 466 + path: pain_hours_hen_systems.csv diff --git a/snapshots/fasttrack/latest/plastic_waste_meijer_2021.csv.dvc b/snapshots/fasttrack/latest/plastic_waste_meijer_2021.csv.dvc index 5f13b6b45c4..677696f9254 100644 --- a/snapshots/fasttrack/latest/plastic_waste_meijer_2021.csv.dvc +++ b/snapshots/fasttrack/latest/plastic_waste_meijer_2021.csv.dvc @@ -1,17 +1,17 @@ meta: source: - name: Google Sheet + name: Plastic ocean waste and pollution (Meijer et al. 2021) url: https://www.science.org/doi/10.1126/sciadv.aaz5803 - source_data_url: + source_data_url: |- https://docs.google.com/spreadsheets/d/e/2PACX-1vTD6oDffmZRLtbnYvEY4GKVbNhQO7oZpuivywu0lC_lIS8EIqy67SPnPYHMlW9ByeuQQ1g4lLt0Igl-/pub?output=csv - date_accessed: '2023-09-20' - publication_year: '2021' - published_by: Google Sheet + date_accessed: '2024-11-26' + publication_year: 2021 + published_by: |- + Meijer, L. J., Van Emmerik, T., Van Der Ent, R., Schmidt, C., & Lebreton, L. (2021). More than 1000 rivers account for 80% of global riverine plastic emissions into the ocean. Science Advances, 7(18), eaaz5803. name: Plastic ocean waste and pollution (Meijer et al. 
2021) description: '' license: {} -wdir: ../../../data/snapshots/fasttrack/latest outs: -- md5: 9622364b41ad11332ca7164e8da26a20 - size: 12674 - path: plastic_waste_meijer_2021.csv + - md5: cdffe453c306eb9a30aef5dba633f82a + size: 12841 + path: plastic_waste_meijer_2021.csv diff --git a/snapshots/fasttrack/latest/sentience_institute.csv.dvc b/snapshots/fasttrack/latest/sentience_institute.csv.dvc index 96eb93e0cce..121bce8e039 100644 --- a/snapshots/fasttrack/latest/sentience_institute.csv.dvc +++ b/snapshots/fasttrack/latest/sentience_institute.csv.dvc @@ -1,17 +1,16 @@ meta: source: - name: Google Sheet + name: 'Sentience Institute. Animals, Food, and Technology (AFT) Survey: 2021 Update.' url: https://www.sentienceinstitute.org/aft-survey-2021 - source_data_url: + source_data_url: |- https://docs.google.com/spreadsheets/d/e/2PACX-1vTHuvrVzfeof4Srke_7bxUlu6eKNZ7RHrjrP6GD184aqrlIiojtSdGH4Hzu48JNynpiES7CX4nf1kUI/pub?output=csv - date_accessed: '2023-09-18' - publication_year: '2021' - published_by: Google Sheet + date_accessed: '2024-11-26' + publication_year: 2021 + published_by: Sentience Institute name: Survey attitudes to livestock farming (Sentience Institute) description: '' license: {} -wdir: ../../../data/snapshots/fasttrack/latest outs: -- md5: 8d9a1e7de3ccba2495c5bd97cbfc2e62 - size: 1540 - path: sentience_institute.csv + - md5: 8d9a1e7de3ccba2495c5bd97cbfc2e62 + size: 1540 + path: sentience_institute.csv diff --git a/snapshots/fasttrack/latest/treatment_gap_anxiety_disorders_world_mental_health_surveys.csv.dvc b/snapshots/fasttrack/latest/treatment_gap_anxiety_disorders_world_mental_health_surveys.csv.dvc index b5d9ea6ee6a..e41becaeee8 100644 --- a/snapshots/fasttrack/latest/treatment_gap_anxiety_disorders_world_mental_health_surveys.csv.dvc +++ b/snapshots/fasttrack/latest/treatment_gap_anxiety_disorders_world_mental_health_surveys.csv.dvc @@ -1,47 +1,19 @@ meta: - namespace: fasttrack - short_name: treatment_gap_anxiety_disorders_world_mental_health_surveys - file_extension: csv - date_accessed: 2023-05-12 - name: Treatment gap for anxiety disorders - World Mental Health Surveys - Alonso - et al. 2017 - description: 'This dataset comes from the World Mental Health surveys, which conducted - national studies in 21 countries, using validated structured interviews to survey - members of the general population about symptoms of mental illnesses they had - in the past 12 months and their lifetime so far. The source describes the dataset: - "Data came from 24 community epidemiological surveys administered in 21 countries - as part of the WMH surveys (Kessler & Ustun, 2004). These included 12 surveys - carried out in high-income countries, 6 surveys in upper-middle-income countries - and 6 in low or lower-middle income countries (see table 1). The majority of surveys - were based on nationally representative household samples. Three were representative - of urban areas in their countries (Colombia, Mexico, and Peru). Three were representative - of selected regions in their countries (Japan, Nigeria, and Murcia, Spain). Four - were representative of selected Metropolitan Areas (Sao Paulo, Brazil; Medellin, - Colombia; and Beijing-Shanghai and Shenzhen in the People’s Republic of China - (PRC)). Trained lay interviewers conducted face-to-face interviews with respondents, - aged 18 years and over. The interviews took place within the households of the - respondents. To reduce respondent burden, the interview was divided into two parts. 
- Part I assessed core mental disorders and was administered to all respondents. - Part II, which assessed additional disorders and correlates, was administered - to all Part I respondents who met lifetime criteria for any disorder plus a probability - subsample of other Part I respondents. Part II data, the focus of this report, - were weighted by the inverse of their probabilities of selection into Part II - and additionally weighted to adjust samples to match population distributions - on the cross-classification of key socio-demographic and geographic variables. - Further details about WMH sampling and weighting are available elsewhere(Heeringa - et al., 2008). Response rates ranged between 45.9% and 97.2% and had a weighted - average of 70.1% across all surveys."' - source_name: Google Sheet - url: https://pubmed.ncbi.nlm.nih.gov/29356216/ - source_published_by: Google Sheet - source_data_url: https://docs.google.com/spreadsheets/d/e/2PACX-1vQtKZYhI1TbqVZILdDFZuSYJcew8xyYKc9Euyzfzwz6g8O28Qfapc-QJfVetZqJF2N8mNLItaErz63_/pub?output=csv - license_url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6008788/ - license_name: Made freely available by authors - is_public: true - version: latest - publication_year: 2017 -wdir: ../../../data/snapshots/fasttrack/latest + source: + name: Treatment gap for anxiety disorders - World Mental Health Surveys - Alonso et al. 2017 + url: https://pubmed.ncbi.nlm.nih.gov/29356216/ + source_data_url: |- + https://docs.google.com/spreadsheets/d/e/2PACX-1vQtKZYhI1TbqVZILdDFZuSYJcew8xyYKc9Euyzfzwz6g8O28Qfapc-QJfVetZqJF2N8mNLItaErz63_/pub?output=csv + date_accessed: '2024-11-26' + publication_year: 2017 + published_by: Alonso et al. (2017) + name: Treatment gap for anxiety disorders - World Mental Health Surveys - Alonso et al. 2017 + description: |- + This dataset comes from the World Mental Health surveys, which conducted national studies in 21 countries, using validated structured interviews to survey members of the general population about symptoms of mental illnesses they had in the past 12 months and their lifetime so far. The source describes the dataset: "Data came from 24 community epidemiological surveys administered in 21 countries as part of the WMH surveys (Kessler & Ustun, 2004). These included 12 surveys carried out in high-income countries, 6 surveys in upper-middle-income countries and 6 in low or lower-middle income countries (see table 1). The majority of surveys were based on nationally representative household samples. Three were representative of urban areas in their countries (Colombia, Mexico, and Peru). Three were representative of selected regions in their countries (Japan, Nigeria, and Murcia, Spain). Four were representative of selected Metropolitan Areas (Sao Paulo, Brazil; Medellin, Colombia; and Beijing-Shanghai and Shenzhen in the People’s Republic of China (PRC)). Trained lay interviewers conducted face-to-face interviews with respondents, aged 18 years and over. The interviews took place within the households of the respondents. To reduce respondent burden, the interview was divided into two parts. Part I assessed core mental disorders and was administered to all respondents. Part II, which assessed additional disorders and correlates, was administered to all Part I respondents who met lifetime criteria for any disorder plus a probability subsample of other Part I respondents. 
Part II data, the focus of this report, were weighted by the inverse of their probabilities of selection into Part II and additionally weighted to adjust samples to match population distributions on the cross-classification of key socio-demographic and geographic variables. Further details about WMH sampling and weighting are available elsewhere(Heeringa et al., 2008). Response rates ranged between 45.9% and 97.2% and had a weighted average of 70.1% across all surveys." + license: + name: Made freely available by authors + url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6008788/ outs: -- md5: 24b2bc09a23fe8ed5ab7e8a37df42c4d - size: 1297 - path: treatment_gap_anxiety_disorders_world_mental_health_surveys.csv + - md5: 24b2bc09a23fe8ed5ab7e8a37df42c4d + size: 1297 + path: treatment_gap_anxiety_disorders_world_mental_health_surveys.csv diff --git a/snapshots/fasttrack/latest/useful_energy_cost_way.csv.dvc b/snapshots/fasttrack/latest/useful_energy_cost_way.csv.dvc new file mode 100644 index 00000000000..bb7cb8ffb2b --- /dev/null +++ b/snapshots/fasttrack/latest/useful_energy_cost_way.csv.dvc @@ -0,0 +1,17 @@ +meta: + origin: + producer: Way et al. (2022). Empirically grounded technology forecasts and the energy transition. + title: useful_energy_cost_way + citation_full: |- + Way, R., Ives, M. C., Mealy, P., & Farmer, J. D. (2022). Empirically grounded technology forecasts and the energy transition. Joule, 6(9), 2057-2082. + url_main: https://www.sciencedirect.com/science/article/pii/S254243512200410X + url_download: |- + https://docs.google.com/spreadsheets/d/e/2PACX-1vSO9emLabLcnGvJWeSGjZVbmYFqXPQ5uggopEH7QZLg9gExLk1ZVu4vR7_9W0_4qlIBLBo4uRGltigs/pub?output=csv + date_accessed: '2024-11-07' + name: Useful energy costs (Way et al. 2022) + description: '' + license: {} +outs: + - md5: fc62ac8d1f10a2f067db04834fecd331 + size: 3543 + path: useful_energy_cost_way.csv diff --git a/snapshots/fasttrack/latest/whm_treatment_gap_anxiety_disorders.csv.dvc b/snapshots/fasttrack/latest/whm_treatment_gap_anxiety_disorders.csv.dvc index cba58543e82..0a3591664e3 100644 --- a/snapshots/fasttrack/latest/whm_treatment_gap_anxiety_disorders.csv.dvc +++ b/snapshots/fasttrack/latest/whm_treatment_gap_anxiety_disorders.csv.dvc @@ -1,47 +1,19 @@ meta: - namespace: fasttrack - short_name: whm_treatment_gap_anxiety_disorders - file_extension: csv - date_accessed: 2023-06-15 + source: + name: Treatment gap for anxiety disorders - World Mental Health Surveys - Alonso et al. 2017 + url: https://pubmed.ncbi.nlm.nih.gov/29356216/ + source_data_url: |- + https://docs.google.com/spreadsheets/d/e/2PACX-1vTQERY6SffT6Lc4ogBdVxjBlPFiOOluxEd3h9oAbrRtSy5YXQ0BTYqZFhrF1wl5N9j6Ko-Mm2XwHZtZ/pub?output=csv + date_accessed: '2024-11-26' + publication_year: 2017 + published_by: Alonso et al. (2017) name: Treatment gap for anxiety disorders (WMH, 2017) - description: 'This dataset comes from the World Mental Health surveys, which conducted - national studies in 21 countries, using validated structured interviews to survey - members of the general population about symptoms of mental illnesses they had - in the past 12 months and their lifetime so far. The source describes the dataset: - "Data came from 24 community epidemiological surveys administered in 21 countries - as part of the WMH surveys (Kessler & Ustun, 2004). These included 12 surveys - carried out in high-income countries, 6 surveys in upper-middle-income countries - and 6 in low or lower-middle income countries (see table 1). 
The majority of surveys - were based on nationally representative household samples. Three were representative - of urban areas in their countries (Colombia, Mexico, and Peru). Three were representative - of selected regions in their countries (Japan, Nigeria, and Murcia, Spain). Four - were representative of selected Metropolitan Areas (Sao Paulo, Brazil; Medellin, - Colombia; and Beijing-Shanghai and Shenzhen in the People’s Republic of China - (PRC)). Trained lay interviewers conducted face-to-face interviews with respondents, - aged 18 years and over. The interviews took place within the households of the - respondents. To reduce respondent burden, the interview was divided into two parts. - Part I assessed core mental disorders and was administered to all respondents. - Part II, which assessed additional disorders and correlates, was administered - to all Part I respondents who met lifetime criteria for any disorder plus a probability - subsample of other Part I respondents. Part II data, the focus of this report, - were weighted by the inverse of their probabilities of selection into Part II - and additionally weighted to adjust samples to match population distributions - on the cross-classification of key socio-demographic and geographic variables. - Further details about WMH sampling and weighting are available elsewhere(Heeringa - et al., 2008). Response rates ranged between 45.9% and 97.2% and had a weighted - average of 70.1% across all surveys."' - source_name: Google Sheet - url: https://pubmed.ncbi.nlm.nih.gov/29356216/ - source_published_by: Google Sheet - source_data_url: - https://docs.google.com/spreadsheets/d/e/2PACX-1vTQERY6SffT6Lc4ogBdVxjBlPFiOOluxEd3h9oAbrRtSy5YXQ0BTYqZFhrF1wl5N9j6Ko-Mm2XwHZtZ/pub?output=csv - license_url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6008788/ - license_name: Made freely available by authors - is_public: true - version: latest - publication_year: 2017 -wdir: ../../../data/snapshots/fasttrack/latest + description: |- + This dataset comes from the World Mental Health surveys, which conducted national studies in 21 countries, using validated structured interviews to survey members of the general population about symptoms of mental illnesses they had in the past 12 months and their lifetime so far. The source describes the dataset: "Data came from 24 community epidemiological surveys administered in 21 countries as part of the WMH surveys (Kessler & Ustun, 2004). These included 12 surveys carried out in high-income countries, 6 surveys in upper-middle-income countries and 6 in low or lower-middle income countries (see table 1). The majority of surveys were based on nationally representative household samples. Three were representative of urban areas in their countries (Colombia, Mexico, and Peru). Three were representative of selected regions in their countries (Japan, Nigeria, and Murcia, Spain). Four were representative of selected Metropolitan Areas (Sao Paulo, Brazil; Medellin, Colombia; and Beijing-Shanghai and Shenzhen in the People’s Republic of China (PRC)). Trained lay interviewers conducted face-to-face interviews with respondents, aged 18 years and over. The interviews took place within the households of the respondents. To reduce respondent burden, the interview was divided into two parts. Part I assessed core mental disorders and was administered to all respondents. 
Part II, which assessed additional disorders and correlates, was administered to all Part I respondents who met lifetime criteria for any disorder plus a probability subsample of other Part I respondents. Part II data, the focus of this report, were weighted by the inverse of their probabilities of selection into Part II and additionally weighted to adjust samples to match population distributions on the cross-classification of key socio-demographic and geographic variables. Further details about WMH sampling and weighting are available elsewhere(Heeringa et al., 2008). Response rates ranged between 45.9% and 97.2% and had a weighted average of 70.1% across all surveys." + license: + name: Made freely available by authors + url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6008788/ outs: -- md5: 2ef040684f17d2121ded34c460efa4aa - size: 2000 - path: whm_treatment_gap_anxiety_disorders.csv + - md5: 2ef040684f17d2121ded34c460efa4aa + size: 2000 + path: whm_treatment_gap_anxiety_disorders.csv diff --git a/snapshots/gcp/2024-11-13/global_carbon_budget.py b/snapshots/gcp/2024-11-13/global_carbon_budget.py new file mode 100644 index 00000000000..f5364b94ee7 --- /dev/null +++ b/snapshots/gcp/2024-11-13/global_carbon_budget.py @@ -0,0 +1,64 @@ +"""Script to create snapshots of the Global Carbon Budget data products. + +A snapshot will be created for each of the following datasets: +* Global Carbon Budget - Fossil CO2 emissions. +* Global Carbon Budget - Global emissions. +* Global Carbon Budget - Land-use change emissions. +* Global Carbon Budget - National emissions. + +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + +# Names of input data files to create snapshots for. +DATA_FILES = [ + "global_carbon_budget_fossil_co2_emissions.csv", + "global_carbon_budget_global_emissions.xlsx", + "global_carbon_budget_land_use_change_emissions.xlsx", + "global_carbon_budget_national_emissions.xlsx", +] + +# Define common metadata fields (to be written to dvc files). +ATTRIBUTION = "Global Carbon Budget (2024)" +ATTRIBUTION_SHORT = "GCB" +CITATION_FULL = """Andrew, R. M., & Peters, G. P. (2024). The Global Carbon Project's fossil CO2 emissions dataset (2024v17) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.13981696 + +The data files of the Global Carbon Budget can be found at: https://globalcarbonbudget.org/carbonbudget/ + +For more details, see the original paper: +Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Bakker, D. C. E., Hauck, J., Landschützer, P., Le Quéré, C., Luijkx, I. T., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Anthoni, P., Barbero, L., Bates, N. R., Becker, M., Bellouin, N., Decharme, B., Bopp, L., Brasika, I. B. M., Cadule, P., Chamberlain, M. A., Chandra, N., Chau, T.-T.-T., Chevallier, F., Chini, L. P., Cronin, M., Dou, X., Enyo, K., Evans, W., Falk, S., Feely, R. A., Feng, L., Ford, D. J., Gasser, T., Ghattas, J., Gkritzalis, T., Grassi, G., Gregor, L., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Heinke, J., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jacobson, A. R., Jain, A., Jarníková, T., Jersild, A., Jiang, F., Jin, Z., Joos, F., Kato, E., Keeling, R. F., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Körtzinger, A., Lan, X., Lefèvre, N., Li, H., Liu, J., Liu, Z., Ma, L., Marland, G., Mayot, N., McGuire, P. 
C., McKinley, G. A., Meyer, G., Morgan, E. J., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K. M., Olsen, A., Omar, A. M., Ono, T., Paulsen, M., Pierrot, D., Pocock, K., Poulter, B., Powis, C. M., Rehder, G., Resplandy, L., Robertson, E., Rödenbeck, C., Rosan, T. M., Schwinger, J., Séférian, R., Smallman, T. L., Smith, S. M., Sospedra-Alfonso, R., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tans, P. P., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., van Ooijen, E., Wanninkhof, R., Watanabe, M., Wimart-Rousseau, C., Yang, D., Yang, X., Yuan, W., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2023, Earth Syst. Sci. Data, 15, 5301-5369, https://doi.org/10.5194/essd-15-5301-2023, 2023.""" + +DESCRIPTION = """The Global Carbon Budget was founded by the Global Carbon Project (GCP) international science team to track the trends in global carbon emissions and sinks and is a key measure of progress towards the goals of the Paris Agreement. It's widely recognized as the most comprehensive report of its kind. + +The GCP has been publishing estimates of global and national fossil CO2 emissions since 2001. In the first instance these were simple re-publications of data from another source, but over subsequent years refinements have been made in response to feedback and identification of inaccuracies.""" + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot for each dataset. + for data_file in DATA_FILES: + snap = Snapshot(f"gcp/{SNAPSHOT_VERSION}/{data_file}") + + # Replace the full citation and description in the metadata. + snap.metadata.origin.attribution = ATTRIBUTION # type: ignore + snap.metadata.origin.attribution_short = ATTRIBUTION_SHORT # type: ignore + snap.metadata.origin.citation_full = CITATION_FULL # type: ignore + snap.metadata.origin.description = DESCRIPTION # type: ignore + + # Rewrite metadata to dvc file. + snap.metadata_path.write_text(snap.metadata.to_yaml()) + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/gcp/2024-11-13/global_carbon_budget_fossil_co2_emissions.csv.dvc b/snapshots/gcp/2024-11-13/global_carbon_budget_fossil_co2_emissions.csv.dvc new file mode 100644 index 00000000000..feba80ce3c0 --- /dev/null +++ b/snapshots/gcp/2024-11-13/global_carbon_budget_fossil_co2_emissions.csv.dvc @@ -0,0 +1,29 @@ +meta: + origin: + producer: Global Carbon Project + title: Global Carbon Budget + description: |- + The Global Carbon Budget was founded by the Global Carbon Project (GCP) international science team to track the trends in global carbon emissions and sinks and is a key measure of progress towards the goals of the Paris Agreement. It's widely recognized as the most comprehensive report of its kind. + + The GCP has been publishing estimates of global and national fossil CO2 emissions since 2001. In the first instance these were simple re-publications of data from another source, but over subsequent years refinements have been made in response to feedback and identification of inaccuracies. + title_snapshot: Global Carbon Budget - Fossil CO2 emissions + citation_full: |- + Andrew, R. M., & Peters, G. P. (2024). The Global Carbon Project's fossil CO2 emissions dataset (2024v17) [Data set]. Zenodo. 
https://doi.org/10.5281/zenodo.13981696 + + The data files of the Global Carbon Budget can be found at: https://globalcarbonbudget.org/carbonbudget/ + + For more details, see the original paper: + Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Bakker, D. C. E., Hauck, J., Landschützer, P., Le Quéré, C., Luijkx, I. T., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Anthoni, P., Barbero, L., Bates, N. R., Becker, M., Bellouin, N., Decharme, B., Bopp, L., Brasika, I. B. M., Cadule, P., Chamberlain, M. A., Chandra, N., Chau, T.-T.-T., Chevallier, F., Chini, L. P., Cronin, M., Dou, X., Enyo, K., Evans, W., Falk, S., Feely, R. A., Feng, L., Ford, D. J., Gasser, T., Ghattas, J., Gkritzalis, T., Grassi, G., Gregor, L., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Heinke, J., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jacobson, A. R., Jain, A., Jarníková, T., Jersild, A., Jiang, F., Jin, Z., Joos, F., Kato, E., Keeling, R. F., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Körtzinger, A., Lan, X., Lefèvre, N., Li, H., Liu, J., Liu, Z., Ma, L., Marland, G., Mayot, N., McGuire, P. C., McKinley, G. A., Meyer, G., Morgan, E. J., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K. M., Olsen, A., Omar, A. M., Ono, T., Paulsen, M., Pierrot, D., Pocock, K., Poulter, B., Powis, C. M., Rehder, G., Resplandy, L., Robertson, E., Rödenbeck, C., Rosan, T. M., Schwinger, J., Séférian, R., Smallman, T. L., Smith, S. M., Sospedra-Alfonso, R., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tans, P. P., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., van Ooijen, E., Wanninkhof, R., Watanabe, M., Wimart-Rousseau, C., Yang, D., Yang, X., Yuan, W., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2023, Earth Syst. Sci. Data, 15, 5301-5369, https://doi.org/10.5194/essd-15-5301-2023, 2023. + attribution: Global Carbon Budget (2024) + attribution_short: GCB + url_main: https://globalcarbonbudget.org/ + url_download: https://zenodo.org/records/13981696/files/GCB2024v17_MtCO2_flat.csv + date_accessed: '2024-11-13' + date_published: '2024-11-13' + license: + name: CC BY 4.0 + url: https://zenodo.org/records/10177738 +outs: + - md5: 70dac1843444b14655bf756c70c1f04a + size: 3128569 + path: global_carbon_budget_fossil_co2_emissions.csv diff --git a/snapshots/gcp/2024-11-13/global_carbon_budget_global_emissions.xlsx.dvc b/snapshots/gcp/2024-11-13/global_carbon_budget_global_emissions.xlsx.dvc new file mode 100644 index 00000000000..edaaf1bd695 --- /dev/null +++ b/snapshots/gcp/2024-11-13/global_carbon_budget_global_emissions.xlsx.dvc @@ -0,0 +1,29 @@ +meta: + origin: + producer: Global Carbon Project + title: Global Carbon Budget + description: |- + The Global Carbon Budget was founded by the Global Carbon Project (GCP) international science team to track the trends in global carbon emissions and sinks and is a key measure of progress towards the goals of the Paris Agreement. It's widely recognized as the most comprehensive report of its kind. + + The GCP has been publishing estimates of global and national fossil CO2 emissions since 2001. In the first instance these were simple re-publications of data from another source, but over subsequent years refinements have been made in response to feedback and identification of inaccuracies. + title_snapshot: Global Carbon Budget - Global emissions + citation_full: |- + Andrew, R. M., & Peters, G. P. (2024). 
The Global Carbon Project's fossil CO2 emissions dataset (2024v17) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.13981696 + + The data files of the Global Carbon Budget can be found at: https://globalcarbonbudget.org/carbonbudget/ + + For more details, see the original paper: + Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Bakker, D. C. E., Hauck, J., Landschützer, P., Le Quéré, C., Luijkx, I. T., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Anthoni, P., Barbero, L., Bates, N. R., Becker, M., Bellouin, N., Decharme, B., Bopp, L., Brasika, I. B. M., Cadule, P., Chamberlain, M. A., Chandra, N., Chau, T.-T.-T., Chevallier, F., Chini, L. P., Cronin, M., Dou, X., Enyo, K., Evans, W., Falk, S., Feely, R. A., Feng, L., Ford, D. J., Gasser, T., Ghattas, J., Gkritzalis, T., Grassi, G., Gregor, L., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Heinke, J., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jacobson, A. R., Jain, A., Jarníková, T., Jersild, A., Jiang, F., Jin, Z., Joos, F., Kato, E., Keeling, R. F., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Körtzinger, A., Lan, X., Lefèvre, N., Li, H., Liu, J., Liu, Z., Ma, L., Marland, G., Mayot, N., McGuire, P. C., McKinley, G. A., Meyer, G., Morgan, E. J., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K. M., Olsen, A., Omar, A. M., Ono, T., Paulsen, M., Pierrot, D., Pocock, K., Poulter, B., Powis, C. M., Rehder, G., Resplandy, L., Robertson, E., Rödenbeck, C., Rosan, T. M., Schwinger, J., Séférian, R., Smallman, T. L., Smith, S. M., Sospedra-Alfonso, R., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tans, P. P., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., van Ooijen, E., Wanninkhof, R., Watanabe, M., Wimart-Rousseau, C., Yang, D., Yang, X., Yuan, W., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2023, Earth Syst. Sci. Data, 15, 5301-5369, https://doi.org/10.5194/essd-15-5301-2023, 2023. + attribution: Global Carbon Budget (2024) + attribution_short: GCB + url_main: https://globalcarbonbudget.org/ + url_download: https://globalcarbonbudgetdata.org/downloads/jGJH0-data/Global_Carbon_Budget_2024_v1.0.xlsx + date_accessed: '2024-11-13' + date_published: '2024-11-13' + license: + name: CC BY 4.0 + url: https://www.icos-cp.eu/data-services/about-data-portal/data-license +outs: + - md5: ba4ef8c16f172438e1ae283f20ef92e1 + size: 406583 + path: global_carbon_budget_global_emissions.xlsx diff --git a/snapshots/gcp/2024-11-13/global_carbon_budget_land_use_change_emissions.xlsx.dvc b/snapshots/gcp/2024-11-13/global_carbon_budget_land_use_change_emissions.xlsx.dvc new file mode 100644 index 00000000000..cfb7e546564 --- /dev/null +++ b/snapshots/gcp/2024-11-13/global_carbon_budget_land_use_change_emissions.xlsx.dvc @@ -0,0 +1,29 @@ +meta: + origin: + producer: Global Carbon Project + title: Global Carbon Budget + description: |- + The Global Carbon Budget was founded by the Global Carbon Project (GCP) international science team to track the trends in global carbon emissions and sinks and is a key measure of progress towards the goals of the Paris Agreement. It's widely recognized as the most comprehensive report of its kind. + + The GCP has been publishing estimates of global and national fossil CO2 emissions since 2001. 
In the first instance these were simple re-publications of data from another source, but over subsequent years refinements have been made in response to feedback and identification of inaccuracies. + title_snapshot: Global Carbon Budget - Land-use change emissions + citation_full: |- + Andrew, R. M., & Peters, G. P. (2024). The Global Carbon Project's fossil CO2 emissions dataset (2024v17) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.13981696 + + The data files of the Global Carbon Budget can be found at: https://globalcarbonbudget.org/carbonbudget/ + + For more details, see the original paper: + Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Bakker, D. C. E., Hauck, J., Landschützer, P., Le Quéré, C., Luijkx, I. T., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Anthoni, P., Barbero, L., Bates, N. R., Becker, M., Bellouin, N., Decharme, B., Bopp, L., Brasika, I. B. M., Cadule, P., Chamberlain, M. A., Chandra, N., Chau, T.-T.-T., Chevallier, F., Chini, L. P., Cronin, M., Dou, X., Enyo, K., Evans, W., Falk, S., Feely, R. A., Feng, L., Ford, D. J., Gasser, T., Ghattas, J., Gkritzalis, T., Grassi, G., Gregor, L., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Heinke, J., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jacobson, A. R., Jain, A., Jarníková, T., Jersild, A., Jiang, F., Jin, Z., Joos, F., Kato, E., Keeling, R. F., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Körtzinger, A., Lan, X., Lefèvre, N., Li, H., Liu, J., Liu, Z., Ma, L., Marland, G., Mayot, N., McGuire, P. C., McKinley, G. A., Meyer, G., Morgan, E. J., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K. M., Olsen, A., Omar, A. M., Ono, T., Paulsen, M., Pierrot, D., Pocock, K., Poulter, B., Powis, C. M., Rehder, G., Resplandy, L., Robertson, E., Rödenbeck, C., Rosan, T. M., Schwinger, J., Séférian, R., Smallman, T. L., Smith, S. M., Sospedra-Alfonso, R., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tans, P. P., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., van Ooijen, E., Wanninkhof, R., Watanabe, M., Wimart-Rousseau, C., Yang, D., Yang, X., Yuan, W., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2023, Earth Syst. Sci. Data, 15, 5301-5369, https://doi.org/10.5194/essd-15-5301-2023, 2023. 
+ attribution: Global Carbon Budget (2024) + attribution_short: GCB + url_main: https://globalcarbonbudget.org/ + url_download: https://globalcarbonbudgetdata.org/downloads/jGJH0-data/National_LandUseChange_Carbon_Emissions_2024v1.0.xlsx + date_accessed: '2024-11-13' + date_published: '2024-11-13' + license: + name: CC BY 4.0 + url: https://www.icos-cp.eu/data-services/about-data-portal/data-license +outs: + - md5: 3415714f06bf3c00dbd675e40a73152b + size: 1257748 + path: global_carbon_budget_land_use_change_emissions.xlsx diff --git a/snapshots/gcp/2024-11-13/global_carbon_budget_national_emissions.xlsx.dvc b/snapshots/gcp/2024-11-13/global_carbon_budget_national_emissions.xlsx.dvc new file mode 100644 index 00000000000..d0399880bda --- /dev/null +++ b/snapshots/gcp/2024-11-13/global_carbon_budget_national_emissions.xlsx.dvc @@ -0,0 +1,29 @@ +meta: + origin: + producer: Global Carbon Project + title: Global Carbon Budget + description: |- + The Global Carbon Budget was founded by the Global Carbon Project (GCP) international science team to track the trends in global carbon emissions and sinks and is a key measure of progress towards the goals of the Paris Agreement. It's widely recognized as the most comprehensive report of its kind. + + The GCP has been publishing estimates of global and national fossil CO2 emissions since 2001. In the first instance these were simple re-publications of data from another source, but over subsequent years refinements have been made in response to feedback and identification of inaccuracies. + title_snapshot: Global Carbon Budget - National emissions + citation_full: |- + Andrew, R. M., & Peters, G. P. (2024). The Global Carbon Project's fossil CO2 emissions dataset (2024v17) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.13981696 + + The data files of the Global Carbon Budget can be found at: https://globalcarbonbudget.org/carbonbudget/ + + For more details, see the original paper: + Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Bakker, D. C. E., Hauck, J., Landschützer, P., Le Quéré, C., Luijkx, I. T., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Anthoni, P., Barbero, L., Bates, N. R., Becker, M., Bellouin, N., Decharme, B., Bopp, L., Brasika, I. B. M., Cadule, P., Chamberlain, M. A., Chandra, N., Chau, T.-T.-T., Chevallier, F., Chini, L. P., Cronin, M., Dou, X., Enyo, K., Evans, W., Falk, S., Feely, R. A., Feng, L., Ford, D. J., Gasser, T., Ghattas, J., Gkritzalis, T., Grassi, G., Gregor, L., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Heinke, J., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jacobson, A. R., Jain, A., Jarníková, T., Jersild, A., Jiang, F., Jin, Z., Joos, F., Kato, E., Keeling, R. F., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Körtzinger, A., Lan, X., Lefèvre, N., Li, H., Liu, J., Liu, Z., Ma, L., Marland, G., Mayot, N., McGuire, P. C., McKinley, G. A., Meyer, G., Morgan, E. J., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K. M., Olsen, A., Omar, A. M., Ono, T., Paulsen, M., Pierrot, D., Pocock, K., Poulter, B., Powis, C. M., Rehder, G., Resplandy, L., Robertson, E., Rödenbeck, C., Rosan, T. M., Schwinger, J., Séférian, R., Smallman, T. L., Smith, S. M., Sospedra-Alfonso, R., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tans, P. P., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. 
R., van Ooijen, E., Wanninkhof, R., Watanabe, M., Wimart-Rousseau, C., Yang, D., Yang, X., Yuan, W., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2023, Earth Syst. Sci. Data, 15, 5301-5369, https://doi.org/10.5194/essd-15-5301-2023, 2023. + attribution: Global Carbon Budget (2024) + attribution_short: GCB + url_main: https://globalcarbonbudget.org/ + url_download: https://globalcarbonbudgetdata.org/downloads/jGJH0-data/National_Fossil_Carbon_Emissions_2024v1.0.xlsx + date_accessed: '2024-11-13' + date_published: '2024-11-13' + license: + name: CC BY 4.0 + url: https://www.icos-cp.eu/data-services/about-data-portal/data-license +outs: + - md5: 15f157d2f4c6770c85883a33beec954c + size: 724957 + path: global_carbon_budget_national_emissions.xlsx diff --git a/snapshots/gcp/2024-11-21/global_carbon_budget.py b/snapshots/gcp/2024-11-21/global_carbon_budget.py new file mode 100644 index 00000000000..5821b60d6b3 --- /dev/null +++ b/snapshots/gcp/2024-11-21/global_carbon_budget.py @@ -0,0 +1,71 @@ +"""Script to create snapshots of the Global Carbon Budget data products. + +A snapshot will be created for each of the following datasets: +* Global Carbon Budget - Fossil CO2 emissions. +* Global Carbon Budget - Global emissions. +* Global Carbon Budget - Land-use change emissions. +* Global Carbon Budget - National emissions. + +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + +# Names of input data files to create snapshots for. +DATA_FILES = [ + "global_carbon_budget_fossil_co2_emissions.csv", + "global_carbon_budget_global_emissions.xlsx", + "global_carbon_budget_land_use_change_emissions.xlsx", + "global_carbon_budget_national_emissions.xlsx", +] + +# Define common metadata fields (to be written to dvc files). +ATTRIBUTION = "Global Carbon Budget (2024)" +ATTRIBUTION_SHORT = "GCB" +CITATION_FULL = """Andrew, R. M., & Peters, G. P. (2024). The Global Carbon Project's fossil CO2 emissions dataset (2024v17) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.13981696 + +The data files of the Global Carbon Budget can be found at: https://globalcarbonbudget.org/carbonbudget/ + +For more details, see the original paper: +Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Bakker, D. C. E., Hauck, J., Landschützer, P., Le Quéré, C., Luijkx, I. T., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Anthoni, P., Barbero, L., Bates, N. R., Becker, M., Bellouin, N., Decharme, B., Bopp, L., Brasika, I. B. M., Cadule, P., Chamberlain, M. A., Chandra, N., Chau, T.-T.-T., Chevallier, F., Chini, L. P., Cronin, M., Dou, X., Enyo, K., Evans, W., Falk, S., Feely, R. A., Feng, L., Ford, D. J., Gasser, T., Ghattas, J., Gkritzalis, T., Grassi, G., Gregor, L., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Heinke, J., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jacobson, A. R., Jain, A., Jarníková, T., Jersild, A., Jiang, F., Jin, Z., Joos, F., Kato, E., Keeling, R. F., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Körtzinger, A., Lan, X., Lefèvre, N., Li, H., Liu, J., Liu, Z., Ma, L., Marland, G., Mayot, N., McGuire, P. C., McKinley, G. A., Meyer, G., Morgan, E. J., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K. M., Olsen, A., Omar, A. M., Ono, T., Paulsen, M., Pierrot, D., Pocock, K., Poulter, B., Powis, C. 
M., Rehder, G., Resplandy, L., Robertson, E., Rödenbeck, C., Rosan, T. M., Schwinger, J., Séférian, R., Smallman, T. L., Smith, S. M., Sospedra-Alfonso, R., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tans, P. P., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., van Ooijen, E., Wanninkhof, R., Watanabe, M., Wimart-Rousseau, C., Yang, D., Yang, X., Yuan, W., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2023, Earth Syst. Sci. Data, 15, 5301-5369, https://doi.org/10.5194/essd-15-5301-2023, 2023.""" + +DESCRIPTION = """The Global Carbon Budget was founded by the Global Carbon Project (GCP) international science team to track the trends in global carbon emissions and sinks and is a key measure of progress towards the goals of the Paris Agreement. It's widely recognized as the most comprehensive report of its kind. + +The GCP has been publishing estimates of global and national fossil CO2 emissions since 2001. In the first instance these were simple re-publications of data from another source, but over subsequent years refinements have been made in response to feedback and identification of inaccuracies.""" + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-folder", prompt=True, type=str, help="Path to local folder where data files are.") +def main(path_to_folder: str, upload: bool) -> None: + # Create a new snapshot for each dataset. + for data_file in DATA_FILES: + snap = Snapshot(f"gcp/{SNAPSHOT_VERSION}/{data_file}") + + # Replace the full citation and description in the metadata. + snap.metadata.origin.attribution = ATTRIBUTION # type: ignore + snap.metadata.origin.attribution_short = ATTRIBUTION_SHORT # type: ignore + snap.metadata.origin.citation_full = CITATION_FULL # type: ignore + snap.metadata.origin.description = DESCRIPTION # type: ignore + + # Rewrite metadata to dvc file. + snap.metadata_path.write_text(snap.metadata.to_yaml()) + + # Download data from source, add file to DVC and upload to S3. + ################################################################################################################ + # snap.create_snapshot(upload=upload) + # TODO: Once public, remove this, uncomment previous, and remove click.option for path to folder. + path_to_file = Path(path_to_folder) / data_file + assert path_to_file.exists(), f"File {path_to_file} does not exist." + snap.create_snapshot(filename=path_to_file, upload=upload) + ################################################################################################################ + + +if __name__ == "__main__": + main() diff --git a/snapshots/gcp/2024-11-21/global_carbon_budget_fossil_co2_emissions.csv.dvc b/snapshots/gcp/2024-11-21/global_carbon_budget_fossil_co2_emissions.csv.dvc new file mode 100644 index 00000000000..ee398009771 --- /dev/null +++ b/snapshots/gcp/2024-11-21/global_carbon_budget_fossil_co2_emissions.csv.dvc @@ -0,0 +1,28 @@ +meta: + origin: + producer: Global Carbon Project + title: Global Carbon Budget + description: |- + The Global Carbon Budget was founded by the Global Carbon Project (GCP) international science team to track the trends in global carbon emissions and sinks and is a key measure of progress towards the goals of the Paris Agreement. It's widely recognized as the most comprehensive report of its kind. + + The GCP has been publishing estimates of global and national fossil CO2 emissions since 2001. 
In the first instance these were simple re-publications of data from another source, but over subsequent years refinements have been made in response to feedback and identification of inaccuracies. + title_snapshot: Global Carbon Budget - Fossil CO2 emissions + citation_full: |- + Andrew, R. M., & Peters, G. P. (2024). The Global Carbon Project's fossil CO2 emissions dataset (2024v17) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.13981696 + + The data files of the Global Carbon Budget can be found at: https://globalcarbonbudget.org/carbonbudget/ + + For more details, see the original paper: + Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Bakker, D. C. E., Hauck, J., Landschützer, P., Le Quéré, C., Luijkx, I. T., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Anthoni, P., Barbero, L., Bates, N. R., Becker, M., Bellouin, N., Decharme, B., Bopp, L., Brasika, I. B. M., Cadule, P., Chamberlain, M. A., Chandra, N., Chau, T.-T.-T., Chevallier, F., Chini, L. P., Cronin, M., Dou, X., Enyo, K., Evans, W., Falk, S., Feely, R. A., Feng, L., Ford, D. J., Gasser, T., Ghattas, J., Gkritzalis, T., Grassi, G., Gregor, L., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Heinke, J., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jacobson, A. R., Jain, A., Jarníková, T., Jersild, A., Jiang, F., Jin, Z., Joos, F., Kato, E., Keeling, R. F., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Körtzinger, A., Lan, X., Lefèvre, N., Li, H., Liu, J., Liu, Z., Ma, L., Marland, G., Mayot, N., McGuire, P. C., McKinley, G. A., Meyer, G., Morgan, E. J., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K. M., Olsen, A., Omar, A. M., Ono, T., Paulsen, M., Pierrot, D., Pocock, K., Poulter, B., Powis, C. M., Rehder, G., Resplandy, L., Robertson, E., Rödenbeck, C., Rosan, T. M., Schwinger, J., Séférian, R., Smallman, T. L., Smith, S. M., Sospedra-Alfonso, R., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tans, P. P., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., van Ooijen, E., Wanninkhof, R., Watanabe, M., Wimart-Rousseau, C., Yang, D., Yang, X., Yuan, W., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2023, Earth Syst. Sci. Data, 15, 5301-5369, https://doi.org/10.5194/essd-15-5301-2023, 2023. + attribution: Global Carbon Budget (2024) + attribution_short: GCB + url_main: https://globalcarbonbudget.org/ + date_accessed: '2024-11-21' + date_published: '2024-11-21' + license: + name: CC BY 4.0 + url: https://zenodo.org/records/10177738 +outs: + - md5: eefcbe53b9da64d615a170970496c7c1 + size: 2869860 + path: global_carbon_budget_fossil_co2_emissions.csv diff --git a/snapshots/gcp/2024-11-21/global_carbon_budget_global_emissions.xlsx.dvc b/snapshots/gcp/2024-11-21/global_carbon_budget_global_emissions.xlsx.dvc new file mode 100644 index 00000000000..91040f5e5a9 --- /dev/null +++ b/snapshots/gcp/2024-11-21/global_carbon_budget_global_emissions.xlsx.dvc @@ -0,0 +1,28 @@ +meta: + origin: + producer: Global Carbon Project + title: Global Carbon Budget + description: |- + The Global Carbon Budget was founded by the Global Carbon Project (GCP) international science team to track the trends in global carbon emissions and sinks and is a key measure of progress towards the goals of the Paris Agreement. It's widely recognized as the most comprehensive report of its kind. + + The GCP has been publishing estimates of global and national fossil CO2 emissions since 2001. 
In the first instance these were simple re-publications of data from another source, but over subsequent years refinements have been made in response to feedback and identification of inaccuracies. + title_snapshot: Global Carbon Budget - Global emissions + citation_full: |- + Andrew, R. M., & Peters, G. P. (2024). The Global Carbon Project's fossil CO2 emissions dataset (2024v17) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.13981696 + + The data files of the Global Carbon Budget can be found at: https://globalcarbonbudget.org/carbonbudget/ + + For more details, see the original paper: + Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Bakker, D. C. E., Hauck, J., Landschützer, P., Le Quéré, C., Luijkx, I. T., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Anthoni, P., Barbero, L., Bates, N. R., Becker, M., Bellouin, N., Decharme, B., Bopp, L., Brasika, I. B. M., Cadule, P., Chamberlain, M. A., Chandra, N., Chau, T.-T.-T., Chevallier, F., Chini, L. P., Cronin, M., Dou, X., Enyo, K., Evans, W., Falk, S., Feely, R. A., Feng, L., Ford, D. J., Gasser, T., Ghattas, J., Gkritzalis, T., Grassi, G., Gregor, L., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Heinke, J., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jacobson, A. R., Jain, A., Jarníková, T., Jersild, A., Jiang, F., Jin, Z., Joos, F., Kato, E., Keeling, R. F., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Körtzinger, A., Lan, X., Lefèvre, N., Li, H., Liu, J., Liu, Z., Ma, L., Marland, G., Mayot, N., McGuire, P. C., McKinley, G. A., Meyer, G., Morgan, E. J., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K. M., Olsen, A., Omar, A. M., Ono, T., Paulsen, M., Pierrot, D., Pocock, K., Poulter, B., Powis, C. M., Rehder, G., Resplandy, L., Robertson, E., Rödenbeck, C., Rosan, T. M., Schwinger, J., Séférian, R., Smallman, T. L., Smith, S. M., Sospedra-Alfonso, R., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tans, P. P., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., van Ooijen, E., Wanninkhof, R., Watanabe, M., Wimart-Rousseau, C., Yang, D., Yang, X., Yuan, W., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2023, Earth Syst. Sci. Data, 15, 5301-5369, https://doi.org/10.5194/essd-15-5301-2023, 2023. + attribution: Global Carbon Budget (2024) + attribution_short: GCB + url_main: https://globalcarbonbudget.org/ + date_accessed: '2024-11-21' + date_published: '2024-11-21' + license: + name: CC BY 4.0 + url: https://www.icos-cp.eu/data-services/about-data-portal/data-license +outs: + - md5: ad8155e5db112173f5ab465205e30680 + size: 941480 + path: global_carbon_budget_global_emissions.xlsx diff --git a/snapshots/gcp/2024-11-21/global_carbon_budget_land_use_change_emissions.xlsx.dvc b/snapshots/gcp/2024-11-21/global_carbon_budget_land_use_change_emissions.xlsx.dvc new file mode 100644 index 00000000000..ee07763c9d0 --- /dev/null +++ b/snapshots/gcp/2024-11-21/global_carbon_budget_land_use_change_emissions.xlsx.dvc @@ -0,0 +1,28 @@ +meta: + origin: + producer: Global Carbon Project + title: Global Carbon Budget + description: |- + The Global Carbon Budget was founded by the Global Carbon Project (GCP) international science team to track the trends in global carbon emissions and sinks and is a key measure of progress towards the goals of the Paris Agreement. It's widely recognized as the most comprehensive report of its kind. 
+ + The GCP has been publishing estimates of global and national fossil CO2 emissions since 2001. In the first instance these were simple re-publications of data from another source, but over subsequent years refinements have been made in response to feedback and identification of inaccuracies. + title_snapshot: Global Carbon Budget - Land-use change emissions + citation_full: |- + Andrew, R. M., & Peters, G. P. (2024). The Global Carbon Project's fossil CO2 emissions dataset (2024v17) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.13981696 + + The data files of the Global Carbon Budget can be found at: https://globalcarbonbudget.org/carbonbudget/ + + For more details, see the original paper: + Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Bakker, D. C. E., Hauck, J., Landschützer, P., Le Quéré, C., Luijkx, I. T., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Anthoni, P., Barbero, L., Bates, N. R., Becker, M., Bellouin, N., Decharme, B., Bopp, L., Brasika, I. B. M., Cadule, P., Chamberlain, M. A., Chandra, N., Chau, T.-T.-T., Chevallier, F., Chini, L. P., Cronin, M., Dou, X., Enyo, K., Evans, W., Falk, S., Feely, R. A., Feng, L., Ford, D. J., Gasser, T., Ghattas, J., Gkritzalis, T., Grassi, G., Gregor, L., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Heinke, J., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jacobson, A. R., Jain, A., Jarníková, T., Jersild, A., Jiang, F., Jin, Z., Joos, F., Kato, E., Keeling, R. F., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Körtzinger, A., Lan, X., Lefèvre, N., Li, H., Liu, J., Liu, Z., Ma, L., Marland, G., Mayot, N., McGuire, P. C., McKinley, G. A., Meyer, G., Morgan, E. J., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K. M., Olsen, A., Omar, A. M., Ono, T., Paulsen, M., Pierrot, D., Pocock, K., Poulter, B., Powis, C. M., Rehder, G., Resplandy, L., Robertson, E., Rödenbeck, C., Rosan, T. M., Schwinger, J., Séférian, R., Smallman, T. L., Smith, S. M., Sospedra-Alfonso, R., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tans, P. P., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., van Ooijen, E., Wanninkhof, R., Watanabe, M., Wimart-Rousseau, C., Yang, D., Yang, X., Yuan, W., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2023, Earth Syst. Sci. Data, 15, 5301-5369, https://doi.org/10.5194/essd-15-5301-2023, 2023. + attribution: Global Carbon Budget (2024) + attribution_short: GCB + url_main: https://globalcarbonbudget.org/ + date_accessed: '2024-11-21' + date_published: '2024-11-21' + license: + name: CC BY 4.0 + url: https://www.icos-cp.eu/data-services/about-data-portal/data-license +outs: + - md5: afec716d828628dd37928197b2b545e1 + size: 1264402 + path: global_carbon_budget_land_use_change_emissions.xlsx diff --git a/snapshots/gcp/2024-11-21/global_carbon_budget_national_emissions.xlsx.dvc b/snapshots/gcp/2024-11-21/global_carbon_budget_national_emissions.xlsx.dvc new file mode 100644 index 00000000000..ede8c8d039d --- /dev/null +++ b/snapshots/gcp/2024-11-21/global_carbon_budget_national_emissions.xlsx.dvc @@ -0,0 +1,28 @@ +meta: + origin: + producer: Global Carbon Project + title: Global Carbon Budget + description: |- + The Global Carbon Budget was founded by the Global Carbon Project (GCP) international science team to track the trends in global carbon emissions and sinks and is a key measure of progress towards the goals of the Paris Agreement. 
It's widely recognized as the most comprehensive report of its kind. + + The GCP has been publishing estimates of global and national fossil CO2 emissions since 2001. In the first instance these were simple re-publications of data from another source, but over subsequent years refinements have been made in response to feedback and identification of inaccuracies. + title_snapshot: Global Carbon Budget - National emissions + citation_full: |- + Andrew, R. M., & Peters, G. P. (2024). The Global Carbon Project's fossil CO2 emissions dataset (2024v17) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.13981696 + + The data files of the Global Carbon Budget can be found at: https://globalcarbonbudget.org/carbonbudget/ + + For more details, see the original paper: + Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Bakker, D. C. E., Hauck, J., Landschützer, P., Le Quéré, C., Luijkx, I. T., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Anthoni, P., Barbero, L., Bates, N. R., Becker, M., Bellouin, N., Decharme, B., Bopp, L., Brasika, I. B. M., Cadule, P., Chamberlain, M. A., Chandra, N., Chau, T.-T.-T., Chevallier, F., Chini, L. P., Cronin, M., Dou, X., Enyo, K., Evans, W., Falk, S., Feely, R. A., Feng, L., Ford, D. J., Gasser, T., Ghattas, J., Gkritzalis, T., Grassi, G., Gregor, L., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Heinke, J., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jacobson, A. R., Jain, A., Jarníková, T., Jersild, A., Jiang, F., Jin, Z., Joos, F., Kato, E., Keeling, R. F., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Körtzinger, A., Lan, X., Lefèvre, N., Li, H., Liu, J., Liu, Z., Ma, L., Marland, G., Mayot, N., McGuire, P. C., McKinley, G. A., Meyer, G., Morgan, E. J., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K. M., Olsen, A., Omar, A. M., Ono, T., Paulsen, M., Pierrot, D., Pocock, K., Poulter, B., Powis, C. M., Rehder, G., Resplandy, L., Robertson, E., Rödenbeck, C., Rosan, T. M., Schwinger, J., Séférian, R., Smallman, T. L., Smith, S. M., Sospedra-Alfonso, R., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tans, P. P., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., van Ooijen, E., Wanninkhof, R., Watanabe, M., Wimart-Rousseau, C., Yang, D., Yang, X., Yuan, W., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2023, Earth Syst. Sci. Data, 15, 5301-5369, https://doi.org/10.5194/essd-15-5301-2023, 2023. 
+ attribution: Global Carbon Budget (2024) + attribution_short: GCB + url_main: https://globalcarbonbudget.org/ + date_accessed: '2024-11-21' + date_published: '2024-11-21' + license: + name: CC BY 4.0 + url: https://www.icos-cp.eu/data-services/about-data-portal/data-license +outs: + - md5: 08d4f1086b4d2f6ec31fc48c6cd96e8e + size: 961527 + path: global_carbon_budget_national_emissions.xlsx diff --git a/snapshots/health/2023-05-04/global_wellbeing.xlsx.dvc b/snapshots/health/2023-05-04/global_wellbeing.xlsx.dvc index 48d61db0d47..5530d0bfb8b 100644 --- a/snapshots/health/2023-05-04/global_wellbeing.xlsx.dvc +++ b/snapshots/health/2023-05-04/global_wellbeing.xlsx.dvc @@ -3,7 +3,7 @@ meta: short_name: global_wellbeing name: Global Wellbeing Initiative (Gallup, 2020) version: '2023-05-04' - publication_year: 2020" + publication_year: 2020 source_name: Global Wellbeing Initiative (2020) source_published_by: Gallup World Poll, Global Wellbeing Initiative dataset, 2020 url: https://www.gallup.com/analytics/468179/global-wellbeing-initiative-dataset.aspx diff --git a/snapshots/health/latest/global_health_mpox.csv.dvc b/snapshots/health/latest/global_health_mpox.csv.dvc index eeee09f5427..4d27ab747a3 100644 --- a/snapshots/health/latest/global_health_mpox.csv.dvc +++ b/snapshots/health/latest/global_health_mpox.csv.dvc @@ -22,6 +22,6 @@ meta: url: https://global.health/terms-of-use/ outs: - - md5: f3691333735ed6d9703822d7cbc08f3e - size: 14267272 + - md5: 08388d2230adafbb7fe28ddcd1eb0dc8 + size: 16813136 path: global_health_mpox.csv diff --git a/snapshots/health/latest/global_health_mpox.py b/snapshots/health/latest/global_health_mpox.py index 0f65671044b..cb234a5f608 100644 --- a/snapshots/health/latest/global_health_mpox.py +++ b/snapshots/health/latest/global_health_mpox.py @@ -15,7 +15,7 @@ def main(upload: bool) -> None: # Create a new snapshot. snap = Snapshot(f"health/{SNAPSHOT_VERSION}/global_health_mpox.csv") - tb = snap.read() + tb = snap.read(safe_types=False) assert tb["Date_report_source_I"].min() > "2023-12-01", "Global.health have added data for 2023" # Download data from source, add file to DVC and upload to S3. snap.create_snapshot(upload=upload) diff --git a/snapshots/hmd/2024-11-19/hfd.py b/snapshots/hmd/2024-11-19/hfd.py new file mode 100644 index 00000000000..ca7ea224244 --- /dev/null +++ b/snapshots/hmd/2024-11-19/hfd.py @@ -0,0 +1,31 @@ +"""To be able to download the dataset, you need to be registered at HFD. + +Just go to their site: https://www.humanfertility.org/ +Or directly try to log in: https://www.humanfertility.org/Account/Login / register: https://www.humanfertility.org/Account/Auth + +Only after that, you'll be allowed to download the dataset: https://www.humanfertility.org/File/Download/Files/zip/HFD.zip +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"hmd/{SNAPSHOT_VERSION}/hfd.zip") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3.
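+    # Example invocation, assuming HFD.zip was downloaded manually first (the local path below is hypothetical):
+    #   python snapshots/hmd/2024-11-19/hfd.py --path-to-file ~/Downloads/HFD.zip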
+ snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/hmd/2024-11-19/hfd.zip.dvc b/snapshots/hmd/2024-11-19/hfd.zip.dvc new file mode 100644 index 00000000000..9b4913b5589 --- /dev/null +++ b/snapshots/hmd/2024-11-19/hfd.zip.dvc @@ -0,0 +1,34 @@ +meta: + origin: + # Data product / Snapshot + title: Human Fertility Database + description: |- + The HFD is entirely based on one and the same type of initial data - officially registered birth counts by calendar year, mother's age (and/or cohort) and (whenever possible) biological birth order. These data, together with total female population exposure from the Human Mortality Database (www.mortality.org) and parity-specific female population exposure from selected population censuses, population registers, or large-scale surveys, are further processed using a uniform set of methods. The major HFD output includes detailed data on births, unconditional and conditional fertility rates, cohort and period fertility tables as well as selected aggregate indicators such as total fertility rates, mean ages at childbearing, and parity progression ratios. + + For each country, there are four blocks of data provided: + + - Summary Indicators + - Age-Specific Data + - Fertility Tables + - Input Data + + More details at https://www.humanfertility.org/Data/ExplanatoryNotes, and https://www.humanfertility.org/File/GetDocumentFree/Docs/methods.pdf. + date_published: 2024-05-23 + # Citation + producer: Human Fertility Database + citation_full: |- + Human Fertility Database. Max Planck Institute for Demographic Research (Germany) and Vienna Institute of Demography (Austria). Available at www.humanfertility.org (data downloaded on 2024-11-19). + attribution_short: HFD + # Files + url_main: https://www.humanfertility.org/Home/Index + url_download: https://www.humanfertility.org/File/Download/Files/zip/HFD.zip + date_accessed: 2024-11-19 + + # License + license: + url: https://www.mortality.org/Data/UserAgreement + name: CC BY 4.0 +outs: + - md5: e09ca5adeceb3c66996aa526ffa992c6 + size: 38391495 + path: hfd.zip diff --git a/snapshots/hmd/2024-11-27/hmd.py b/snapshots/hmd/2024-11-27/hmd.py new file mode 100644 index 00000000000..c5180a4dfa6 --- /dev/null +++ b/snapshots/hmd/2024-11-27/hmd.py @@ -0,0 +1,25 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", "-f", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"hmd/{SNAPSHOT_VERSION}/hmd.zip") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. 
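+    # Example invocation, using the "-f" short flag defined above (the local path below is hypothetical):
+    #   python snapshots/hmd/2024-11-27/hmd.py -f ~/Downloads/hmd.zip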
+ snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/hmd/2024-11-27/hmd.zip.dvc b/snapshots/hmd/2024-11-27/hmd.zip.dvc new file mode 100644 index 00000000000..63f4dfe3a51 --- /dev/null +++ b/snapshots/hmd/2024-11-27/hmd.zip.dvc @@ -0,0 +1,74 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Human Mortality Database + description: |- + The Human Mortality Database (HMD) contains original calculations of all-cause death rates and life tables for national populations (countries or areas), as well as the input data used in constructing those tables. The input data consist of death counts from vital statistics, plus census counts, birth counts, and population estimates from various sources. + + + # Scope and basic principles + + The database is limited by design to populations where death registration and census data are virtually complete, since this type of information is required for the uniform method used to reconstruct historical data series. As a result, the countries and areas included here are relatively wealthy and for the most part highly industrialized. + + The main goal of the Human Mortality Database is to document the longevity revolution of the modern era and to facilitate research into its causes and consequences. As much as possible, the authors of the database have followed four guiding principles: comparability, flexibility, accessibility, reproducibility. + + + # Computing death rates and life tables + + Their process for computing mortality rates and life tables can be described in terms of six steps, corresponding to six data types that are available from the HMD. Here is an overview of the process: + + 1. Births. Annual counts of live births by sex are collected for each population over the longest possible time period. These counts are used mainly for making population estimates at younger ages. + 2. Deaths. Death counts are collected at the finest level of detail available. If raw data are aggregated, uniform methods are used to estimate death counts by completed age (i.e., age-last-birthday at time of death), calendar year of death, and calendar year of birth. + 3. Population size. Annual estimates of population size on January 1st are either obtained from another source or are derived from census data plus birth and death counts. + 4. Exposure-to-risk. Estimates of the population exposed to the risk of death during some age-time interval are based on annual (January 1st) population estimates, with a small correction that reflects the timing of deaths within the interval. + 5. Death rates. Death rates are always a ratio of the death count for a given age-time interval divided by an estimate of the exposure-to-risk in the same interval. + 6. Life tables. To build a life table, probabilities of death are computed from death rates. These probabilities are used to construct life tables, which include life expectancies and other useful indicators of mortality and longevity. + + + # Corrections to the data + + The data presented here have been corrected for gross errors (e.g., a processing error whereby 3,800 becomes 38,000 in a published statistical table would be obvious in most cases, and it would be corrected). However, the authors have not attempted to correct the data for systematic age misstatement (misreporting of age) or coverage errors (over- or under-enumeration of people or events). 
+ + Some available studies assess the completeness of census coverage or death registration in the various countries, and more work is needed in this area. However, in developing the database thus far, the authors did not consider it feasible or desirable to attempt corrections of this sort, especially since it would be impossible to correct the data by a uniform method across all countries. + + + # Age misreporting + + Populations are included here if there is a well-founded belief that the coverage of their census and vital registration systems is relatively high, and thus, that fruitful analyses by both specialists and non-specialists should be possible with these data. Nevertheless, there is evidence of both age heaping (overreporting ages ending in "0" or "5") and age exaggeration in these data. + + In general, the degree of age heaping in these data varies by the time period and population considered, but it is usually no burden to scientific analysis. In most cases, it is sufficient to analyze data in five-year age groups in order to avoid the false impressions created by this particular form of age misstatement. + + Age exaggeration, on the other hand, is a more insidious problem. The authors' approach is guided by the conventional wisdom that age reporting in death registration systems is typically more reliable than in census counts or official population estimates. For this reason, the authors derive population estimates at older ages from the death counts themselves, employing extinct cohort methods. Such methods eliminate some, but certainly not all, of the biases in old-age mortality estimates due to age exaggeration. + + + # Uniform set of procedures + + A key goal of this project is to follow a uniform set of procedures for each population. This approach does not guarantee the cross-national comparability of the data. Rather, it ensures only that the authors have not introduced biases by the authors' own manipulations. The desire of the authors for uniformity had to face the challenge that raw data come in a variety of formats (for example, 1-year versus 5-year age groups). The authors' general approach to this problem is that the available raw data are used first to estimate two quantities: 1) the number of deaths by completed age, year of birth, and year of death; and 2) population estimates by single years of age on January 1 of each year. For each population, these calculations are performed separately by sex. From these two pieces of information, they compute death rates and life tables in a variety of age-time configurations. + + It is reasonable to ask whether a single procedure is the best method for treating the data from a variety of populations. Here, two points must be considered. First, the authors' uniform methodology is based on procedures that were developed separately, though following similar principles, for various countries and by different researchers. Earlier methods were synthesized by choosing what they considered the best among alternative procedures and by eliminating superficial inconsistencies. The second point is that a uniform procedure is possible only because the authors have not attempted to correct the data for reporting and coverage errors. Although some general principles could be followed, such problems would have to be addressed individually for each population. + + Although the authors adhere strictly to a uniform procedure, the data for each population also receive significant individualized attention. 
Each country or area is assigned to an individual researcher, who takes responsibility for assembling and checking the data for errors. In addition, the person assigned to each country/area checks the authors' data against other available sources. These procedures help to assure a high level of data quality, but assistance from database users in identifying problems is always appreciated! + date_published: "2024-11-13" + # Citation + producer: Human Mortality Database + citation_full: |- + HMD. Human Mortality Database. Max Planck Institute for Demographic Research (Germany), University of California, Berkeley (USA), and French Institute for Demographic Studies (France). Available at www.mortality.org. + + See also the methods protocol: + Wilmoth, J. R., Andreev, K., Jdanov, D., Glei, D. A., Riffe, T., Boe, C., Bubenheim, M., Philipov, D., Shkolnikov, V., Vachon, P., Winant, C., & Barbieri, M. (2021). Methods protocol for the human mortality database (v6). [Available online](https://www.mortality.org/File/GetDocument/Public/Docs/MethodsProtocolV6.pdf) (needs log in to mortality.org). + attribution_short: HMD + # Files + url_main: https://www.mortality.org/Data/ZippedDataFiles + date_accessed: 2024-11-27 + + # License + license: + name: CC BY 4.0 + url: https://www.mortality.org/Data/UserAgreement + +outs: + - md5: ceed045241a19573e6621423b582558e + size: 147314590 + path: hmd.zip diff --git a/snapshots/hmd/2024-12-01/hmd_country.py b/snapshots/hmd/2024-12-01/hmd_country.py new file mode 100644 index 00000000000..356e913b41d --- /dev/null +++ b/snapshots/hmd/2024-12-01/hmd_country.py @@ -0,0 +1,32 @@ +"""Download data manually: + +- Go to https://mortality.org/Data/ZippedDataFiles +- Scroll down to "By country" section +- Click on "All HMD countries" + +Note: you need to be logged in to download the data. +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", "-f", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"hmd/{SNAPSHOT_VERSION}/hmd_country.zip") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/hmd/2024-12-01/hmd_country.zip.dvc b/snapshots/hmd/2024-12-01/hmd_country.zip.dvc new file mode 100644 index 00000000000..9a0d86340f9 --- /dev/null +++ b/snapshots/hmd/2024-12-01/hmd_country.zip.dvc @@ -0,0 +1,79 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Human Mortality Database, by country + description: |- + The Human Mortality Database (HMD) contains original calculations of all-cause death rates and life tables for national populations (countries or areas), as well as the input data used in constructing those tables. The input data consist of death counts from vital statistics, plus census counts, birth counts, and population estimates from various sources. 
+ + + # Scope and basic principles + + The database is limited by design to populations where death registration and census data are virtually complete, since this type of information is required for the uniform method used to reconstruct historical data series. As a result, the countries and areas included here are relatively wealthy and for the most part highly industrialized. + + The main goal of the Human Mortality Database is to document the longevity revolution of the modern era and to facilitate research into its causes and consequences. As much as possible, the authors of the database have followed four guiding principles: comparability, flexibility, accessibility, reproducibility. + + + # Computing death rates and life tables + + Their process for computing mortality rates and life tables can be described in terms of six steps, corresponding to six data types that are available from the HMD. Here is an overview of the process: + + 1. Births. Annual counts of live births by sex are collected for each population over the longest possible time period. These counts are used mainly for making population estimates at younger ages. + 2. Deaths. Death counts are collected at the finest level of detail available. If raw data are aggregated, uniform methods are used to estimate death counts by completed age (i.e., age-last-birthday at time of death), calendar year of death, and calendar year of birth. + 3. Population size. Annual estimates of population size on January 1st are either obtained from another source or are derived from census data plus birth and death counts. + 4. Exposure-to-risk. Estimates of the population exposed to the risk of death during some age-time interval are based on annual (January 1st) population estimates, with a small correction that reflects the timing of deaths within the interval. + 5. Death rates. Death rates are always a ratio of the death count for a given age-time interval divided by an estimate of the exposure-to-risk in the same interval. + 6. Life tables. To build a life table, probabilities of death are computed from death rates. These probabilities are used to construct life tables, which include life expectancies and other useful indicators of mortality and longevity. + + + # Corrections to the data + + The data presented here have been corrected for gross errors (e.g., a processing error whereby 3,800 becomes 38,000 in a published statistical table would be obvious in most cases, and it would be corrected). However, the authors have not attempted to correct the data for systematic age misstatement (misreporting of age) or coverage errors (over- or under-enumeration of people or events). + + Some available studies assess the completeness of census coverage or death registration in the various countries, and more work is needed in this area. However, in developing the database thus far, the authors did not consider it feasible or desirable to attempt corrections of this sort, especially since it would be impossible to correct the data by a uniform method across all countries. + + + # Age misreporting + + Populations are included here if there is a well-founded belief that the coverage of their census and vital registration systems is relatively high, and thus, that fruitful analyses by both specialists and non-specialists should be possible with these data. Nevertheless, there is evidence of both age heaping (overreporting ages ending in "0" or "5") and age exaggeration in these data. 
+ + In general, the degree of age heaping in these data varies by the time period and population considered, but it is usually no burden to scientific analysis. In most cases, it is sufficient to analyze data in five-year age groups in order to avoid the false impressions created by this particular form of age misstatement. + + Age exaggeration, on the other hand, is a more insidious problem. The authors' approach is guided by the conventional wisdom that age reporting in death registration systems is typically more reliable than in census counts or official population estimates. For this reason, the authors derive population estimates at older ages from the death counts themselves, employing extinct cohort methods. Such methods eliminate some, but certainly not all, of the biases in old-age mortality estimates due to age exaggeration. + + + # Uniform set of procedures + + A key goal of this project is to follow a uniform set of procedures for each population. This approach does not guarantee the cross-national comparability of the data. Rather, it ensures only that the authors have not introduced biases by the authors' own manipulations. The desire of the authors for uniformity had to face the challenge that raw data come in a variety of formats (for example, 1-year versus 5-year age groups). The authors' general approach to this problem is that the available raw data are used first to estimate two quantities: 1) the number of deaths by completed age, year of birth, and year of death; and 2) population estimates by single years of age on January 1 of each year. For each population, these calculations are performed separately by sex. From these two pieces of information, they compute death rates and life tables in a variety of age-time configurations. + + It is reasonable to ask whether a single procedure is the best method for treating the data from a variety of populations. Here, two points must be considered. First, the authors' uniform methodology is based on procedures that were developed separately, though following similar principles, for various countries and by different researchers. Earlier methods were synthesized by choosing what they considered the best among alternative procedures and by eliminating superficial inconsistencies. The second point is that a uniform procedure is possible only because the authors have not attempted to correct the data for reporting and coverage errors. Although some general principles could be followed, such problems would have to be addressed individually for each population. + + Although the authors adhere strictly to a uniform procedure, the data for each population also receive significant individualized attention. Each country or area is assigned to an individual researcher, who takes responsibility for assembling and checking the data for errors. In addition, the person assigned to each country/area checks the authors' data against other available sources. These procedures help to assure a high level of data quality, but assistance from database users in identifying problems is always appreciated! + description_snapshot: |- + HMD data by country. This contains the raw data, including their "input data", which HMD defines as: + + The Input Database houses the raw data that are the basis for all HMD calculations. Input data files for each population are accessible from the country page. + + date_published: "2024-11-13" + # Citation + producer: Human Mortality Database + citation_full: |- + HMD. Human Mortality Database. 
Max Planck Institute for Demographic Research (Germany), University of California, Berkeley (USA), and French Institute for Demographic Studies (France). Available at www.mortality.org. + + See also the methods protocol: + Wilmoth, J. R., Andreev, K., Jdanov, D., Glei, D. A., Riffe, T., Boe, C., Bubenheim, M., Philipov, D., Shkolnikov, V., Vachon, P., Winant, C., & Barbieri, M. (2021). Methods protocol for the human mortality database (v6). [Available online](https://www.mortality.org/File/GetDocument/Public/Docs/MethodsProtocolV6.pdf) (needs log in to mortality.org). + attribution_short: HMD + # Files + url_main: https://www.mortality.org/Data/ZippedDataFiles + date_accessed: 2024-11-27 + + # License + license: + name: CC BY 4.0 + url: https://www.mortality.org/Data/UserAgreement + +outs: + - md5: efae1882e47e8132bd5a2add9f7e445a + size: 345841896 + path: hmd_country.zip diff --git a/snapshots/homicide/2024-10-30/unodc.py b/snapshots/homicide/2024-10-30/unodc.py new file mode 100644 index 00000000000..072382a690e --- /dev/null +++ b/snapshots/homicide/2024-10-30/unodc.py @@ -0,0 +1,24 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"homicide/{SNAPSHOT_VERSION}/unodc.xlsx") + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/homicide/2024-10-30/unodc.xlsx.dvc b/snapshots/homicide/2024-10-30/unodc.xlsx.dvc new file mode 100644 index 00000000000..d912a778137 --- /dev/null +++ b/snapshots/homicide/2024-10-30/unodc.xlsx.dvc @@ -0,0 +1,47 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: United Nations Office on Drugs and Crime - Intentional Homicide Victims + description: |- + The United Nations Office on Drugs and Crime Intentional Homicide data are sourced from either criminal justice or public health systems. In the former, data are generated by law enforcement or criminal justice authorities in the process of recording and investigating a crime event, whereas in the latter, data are produced by health authorities certifying the cause of death of an individual. + + The criminal justice data was collected from national authorities with the annual United Nations Survey of Crime Trends and Operations of Criminal Justice Systems (UN-CTS). National focal points working in national agencies responsible for statistics on crime and the criminal justice system and nominated by the Permanent Mission to UNODC are responsible for compiling the data from the other relevant agencies before transmitting the UN-CTS to UNODC. + + Following the submission, UNODC checks for consistency and coherence with other data sources. The population data used to calculate homicide rates is sourced from the World Population Prospect, Population Division, United Nations Department of Economic and Social Affairs. + + The statistical definition contains three elements that characterize the killing of a person as “intentional homicide”: + + 1. The killing of a person by another person (objective element). + + 2. 
The intent of the perpetrator to kill or seriously injure the victim (subjective element). + + 3. The unlawfulness of the killing (legal element). + + For recording purposes, all killings that meet the criteria listed above are to be considered intentional homicides, irrespective of definitions provided by national legislations or practices. Killings as a result of terrorist activities are also to be classified as a form of intentional homicide. + + In several cases data from multiple sources were combined to expand the number of available years within a country’s time series, so that a consistent time series of total homicides back to 1990 could be compiled. Time series adjustments were performed when a country had two sources covering an overlapping time period that showed similar trends but differing values. + + date_published: "2024-05-16" + + # Citation + producer: United Nations Office on Drugs and Crime + citation_full: |- + UNODC (2024), UNODC Research - Data Portal – Intentional Homicide. https://dataunodc.un.org/dp-intentional-homicide-victims (Accessed on [30 10 2024]). + attribution_short: UNODC + + # Files + url_main: https://dataunodc.un.org/dp-intentional-homicide-victims + url_download: https://dataunodc.un.org/sites/dataunodc.un.org/files/data_cts_intentional_homicide.xlsx + date_accessed: 2024-10-30 + + # License + license: + name: © United Nations + url: https://dataunodc.un.org/termsofuse + +outs: + - md5: 5e376d6b833ce2c5b09a2a8f104c2a3e + size: 5671341 + path: unodc.xlsx diff --git a/snapshots/iea/2024-11-20/fossil_fuel_subsidies.py b/snapshots/iea/2024-11-20/fossil_fuel_subsidies.py new file mode 100644 index 00000000000..5cb7627fb7c --- /dev/null +++ b/snapshots/iea/2024-11-20/fossil_fuel_subsidies.py @@ -0,0 +1,32 @@ +"""Script to create a snapshot of dataset. + +To obtain the file, you need to log in to the IEA website and download the XLSX file at: +https://www.iea.org/data-and-statistics/data-product/fossil-fuel-subsidies-database#data-sets + +Note that creating an account is free, and this dataset is also free of charge. + +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"iea/{SNAPSHOT_VERSION}/fossil_fuel_subsidies.xlsx") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/iea/2024-11-20/fossil_fuel_subsidies.xlsx.dvc b/snapshots/iea/2024-11-20/fossil_fuel_subsidies.xlsx.dvc new file mode 100644 index 00000000000..a88b8b6e19d --- /dev/null +++ b/snapshots/iea/2024-11-20/fossil_fuel_subsidies.xlsx.dvc @@ -0,0 +1,29 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Fossil Fuel Subsidies Database + description: |- + Fossil fuel consumption subsidies for selected countries. + date_published: "2024-10-01" + + # Citation + producer: International Energy Agency + citation_full: |- + International Energy Agency - Fossil Fuel Subsidies Database (2024).
+ attribution_short: IEA + + # Files + url_main: https://www.iea.org/data-and-statistics/data-product/fossil-fuel-subsidies-database + date_accessed: 2024-11-20 + + # License + license: + name: CC BY 4.0 + url: https://www.iea.org/data-and-statistics/data-product/fossil-fuel-subsidies-database + +outs: + - md5: baa1ef5e6f740931575c19082d28f745 + size: 148786 + path: fossil_fuel_subsidies.xlsx diff --git a/snapshots/imf/2024-11-25/world_economic_outlook.py b/snapshots/imf/2024-11-25/world_economic_outlook.py new file mode 100644 index 00000000000..4160c43e799 --- /dev/null +++ b/snapshots/imf/2024-11-25/world_economic_outlook.py @@ -0,0 +1,37 @@ +""" +Script to create a snapshot of dataset. + +The IMF doesn't allow automatic downloads of this dataset, so we need to download it manually from the IMF website. + 1. Visit https://www.imf.org/en/Publications/SPROLLS/world-economic-outlook-databases + 2. Select the latest version of the data. + 3. Select "Entire dataset". + 4. Select "By Countries" to download the file. + 5. Save the file to this folder. + 6. Run this command in the terminal: + python snapshots/imf/{version}/world_economic_outlook.py --path-to-file + 7. Delete the file from the folder. +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"imf/{SNAPSHOT_VERSION}/world_economic_outlook.xls") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/imf/2024-11-25/world_economic_outlook.xls.dvc b/snapshots/imf/2024-11-25/world_economic_outlook.xls.dvc new file mode 100644 index 00000000000..c8d796e1d8b --- /dev/null +++ b/snapshots/imf/2024-11-25/world_economic_outlook.xls.dvc @@ -0,0 +1,31 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: World Economic Outlook (WEO) + description: |- + The World Economic Outlook (WEO) database contains selected macroeconomic data series from the statistical appendix of the report of the same name, which presents the IMF staff's analysis and projections of economic developments at the global level, in major country groups, and many individual countries. + date_published: "2024-10-22" + version_producer: October 2024 + + # Citation + producer: International Monetary Fund + citation_full: |- + IMF. 2024. World Economic Outlook, October 2024. Washington, DC: International Monetary Fund. ©IMF.
https://doi.org/10.5089/9798400281150.081 + attribution_short: IMF + + # Files + url_main: https://www.imf.org/en/Publications/WEO/weo-database/2024/October + url_download: https://www.imf.org/-/media/Files/Publications/WEO/WEO-Database/2024/October/WEOOct2024all.ashx + date_accessed: 2024-11-25 + + # License + license: + name: IMF Copyright and Usage + url: https://www.imf.org/en/About/copyright-and-terms + +outs: + - md5: 2b56b3c547bd689518188f64d739e8d5 + size: 20297276 + path: world_economic_outlook.xls diff --git a/snapshots/irena/2024-11-01/renewable_capacity_statistics.py b/snapshots/irena/2024-11-01/renewable_capacity_statistics.py new file mode 100644 index 00000000000..6e9e7fcdd6f --- /dev/null +++ b/snapshots/irena/2024-11-01/renewable_capacity_statistics.py @@ -0,0 +1,29 @@ +"""Script to create a snapshot of dataset 'Renewable Capacity Statistics'.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"irena/{SNAPSHOT_VERSION}/renewable_capacity_statistics.xlsx") + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/irena/2024-11-01/renewable_capacity_statistics.xlsx.dvc b/snapshots/irena/2024-11-01/renewable_capacity_statistics.xlsx.dvc new file mode 100644 index 00000000000..63c9e605b43 --- /dev/null +++ b/snapshots/irena/2024-11-01/renewable_capacity_statistics.xlsx.dvc @@ -0,0 +1,50 @@ +meta: + origin: + producer: IRENA + title: Renewable Capacity Statistics + description: |- + The renewable power capacity data represents the maximum net generating capacity of power plants and other installations that use renewable energy sources to produce electricity. For most countries and technologies, the data reflects the capacity installed and connected at the end of the calendar year. The data is presented in megawatts (MW) rounded to the nearest one megawatt, with figures between zero and 0.5MW shown as a 0. The data has been obtained from a variety of sources, including: the IRENA questionnaire; official statistics; industry association reports; and other reports and news articles. + + Some technologies include others, following this schema: + + - Total renewable capacity (on-grid and off-grid) + - Hydropower + - Renewable hydropower (including mixed plants) + - Pumped storage (note that this is included in total hydropower capacity, but not in total renewable capacity) + - Marine energy + - Wind energy + - Onshore wind energy + - Offshore wind energy + - Solar energy + - Solar photovoltaic + - Concentrated solar power + - Bioenergy + - Solid biofuels and renewable waste + - Renewable municipal waste + - Bagasse + - Other solid biofuels + - Liquid biofuels + - Biogas + - Geothermal + - Total off-grid renewable capacity + - Off-grid hydropower + - Off-grid solar photovoltaic + - Other off-grid renewable energy + + citation_full: IRENA - Renewable capacity statistics 2024. International Renewable Energy Agency, Abu Dhabi (2024). 
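+    # NOTE (illustrative, not from the producer): given the schema above, one sanity check when
+    # processing this file is that total renewable capacity should roughly equal
+    #   (hydropower - pumped storage) + marine + wind + solar + bioenergy + geothermal,
+    # since pumped storage is counted within total hydropower but excluded from the renewable total.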
+ attribution_short: IRENA + #################################################################################################################### + # TODO: On next update, consider using the Renewable Energy Statistics report as the main source, + # instead of the Renewable Capacity Statistics. The former seems to be more complete. + #################################################################################################################### + url_main: https://www.irena.org/Publications/2024/Mar/Renewable-capacity-statistics-2024 + url_download: https://www.irena.org/-/media/Files/IRENA/Agency/Publication/2024/Mar/IRENA_Stats_Extract_%202024_H1_V1.xlsx + date_accessed: '2024-11-01' + date_published: '2024-03-01' + license: + name: ©IRENA 2024 + url: https://www.irena.org/Publications/2024/Mar/Renewable-capacity-statistics-2024 +outs: + - md5: b3f57cc92388c842e10cad837bd81a60 + size: 7686496 + path: renewable_capacity_statistics.xlsx diff --git a/snapshots/irena/2024-11-15/renewable_power_generation_costs.py b/snapshots/irena/2024-11-15/renewable_power_generation_costs.py new file mode 100644 index 00000000000..ac2c77cd59f --- /dev/null +++ b/snapshots/irena/2024-11-15/renewable_power_generation_costs.py @@ -0,0 +1,29 @@ +"""Script to create a snapshot of dataset 'Renewable Power Generation Costs'.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"irena/{SNAPSHOT_VERSION}/renewable_power_generation_costs.xlsx") + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/irena/2024-11-15/renewable_power_generation_costs.xlsx.dvc b/snapshots/irena/2024-11-15/renewable_power_generation_costs.xlsx.dvc new file mode 100644 index 00000000000..9c2f5aaad37 --- /dev/null +++ b/snapshots/irena/2024-11-15/renewable_power_generation_costs.xlsx.dvc @@ -0,0 +1,17 @@ +meta: + origin: + producer: IRENA + title: Renewable Power Generation Costs + citation_full: IRENA - Renewable Power Generation Costs in 2023. International Renewable Energy Agency, Abu Dhabi (2024). + attribution_short: IRENA + url_main: https://www.irena.org/Publications/2024/Sep/Renewable-Power-Generation-Costs-in-2023 + url_download: https://www.irena.org/-/media/Files/IRENA/Agency/Publication/2024/Sep/IRENA-Datafile-RenPwrGenCosts-in-2023-v1.xlsx + date_accessed: '2024-11-15' + date_published: '2024-09-01' + license: + name: ©IRENA 2024 + url: https://www.irena.org/Publications/2024/Sep/Renewable-Power-Generation-Costs-in-2023 +outs: + - md5: 99102802d39b7705f5599c995b1e28c6 + size: 6139686 + path: renewable_power_generation_costs.xlsx diff --git a/snapshots/met_office_hadley_centre/2024-11-18/near_surface_temperature.py b/snapshots/met_office_hadley_centre/2024-11-18/near_surface_temperature.py new file mode 100644 index 00000000000..7d00127f77e --- /dev/null +++ b/snapshots/met_office_hadley_centre/2024-11-18/near_surface_temperature.py @@ -0,0 +1,58 @@ +"""Ingest snapshot of the HadCRUT5 near surface temperature dataset (temperature anomaly) by Met Office Hadley Centre. 
+ +The HadCRUT5 near surface temperature data set is produced by blending data from the CRUTEM5 surface air temperature +dataset and the HadSST4 sea-surface temperature dataset. +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + +# Names of input data files to create snapshots for. +DATA_FILES = [ + "near_surface_temperature_global.csv", + "near_surface_temperature_northern_hemisphere.csv", + "near_surface_temperature_southern_hemisphere.csv", +] + +# Define common metadata fields (to be written to dvc files). +CITATION_FULL = """Morice, C.P., J.J. Kennedy, N.A. Rayner, J.P. Winn, E. Hogan, R.E. Killick, R.J.H. Dunn, T.J. Osborn, P.D. Jones and I.R. Simpson (in press) An updated assessment of near-surface temperature change from 1850: the HadCRUT5 dataset. Journal of Geophysical Research (Atmospheres) [doi:10.1029/2019JD032361](https://www.metoffice.gov.uk/hadobs/hadcrut5/HadCRUT5_accepted.pdf) ([supporting information](https://www.metoffice.gov.uk/hadobs/hadcrut5/HadCRUT5_supporting_information_accepted.pdf)).""" +DESCRIPTION = """The HadCRUT5 near surface temperature data set is produced by blending data from the CRUTEM5 surface air temperature dataset and the HadSST4 sea-surface temperature dataset.\n\nTemperature anomalies are based on the HadCRUT5 near-surface temperature dataset as published by the Met Office Hadley Centre.""" + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + # Create a new snapshot for each dataset. + for data_file in DATA_FILES: + snap = Snapshot(f"met_office_hadley_centre/{SNAPSHOT_VERSION}/{data_file}") + + # Replace the full citation and description in the metadata. + snap.metadata.origin.citation_full = CITATION_FULL # type: ignore + snap.metadata.origin.description = DESCRIPTION # type: ignore + + # Note that the publication date is not clear from their website. The date "Last updated" at the bottom of + # https://www.metoffice.gov.uk/hadobs/hadcrut5/data/HadCRUT.5.0.2.0/download.html + # does not seem to be the date of the latest update of the data (since currently there are data points posterior + # to that date). Hence, assume that the publication date is the access date. + snap.metadata.origin.date_published = snap.metadata.origin.date_accessed # type: ignore + + # Rewrite metadata to dvc file. + snap.metadata_path.write_text(snap.metadata.to_yaml()) + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/met_office_hadley_centre/2024-11-18/near_surface_temperature_global.csv.dvc b/snapshots/met_office_hadley_centre/2024-11-18/near_surface_temperature_global.csv.dvc new file mode 100644 index 00000000000..988f2e697ab --- /dev/null +++ b/snapshots/met_office_hadley_centre/2024-11-18/near_surface_temperature_global.csv.dvc @@ -0,0 +1,24 @@ +meta: + origin: + producer: Met Office Hadley Centre + title: HadCRUT5 + description: |- + The HadCRUT5 near surface temperature data set is produced by blending data from the CRUTEM5 surface air temperature dataset and the HadSST4 sea-surface temperature dataset. + + Temperature anomalies are based on the HadCRUT5 near-surface temperature dataset as published by the Met Office Hadley Centre. 
+ title_snapshot: HadCRUT5 - Global near surface temperature + citation_full: |- + Morice, C.P., J.J. Kennedy, N.A. Rayner, J.P. Winn, E. Hogan, R.E. Killick, R.J.H. Dunn, T.J. Osborn, P.D. Jones and I.R. Simpson (in press) An updated assessment of near-surface temperature change from 1850: the HadCRUT5 dataset. Journal of Geophysical Research (Atmospheres) [doi:10.1029/2019JD032361](https://www.metoffice.gov.uk/hadobs/hadcrut5/HadCRUT5_accepted.pdf) ([supporting information](https://www.metoffice.gov.uk/hadobs/hadcrut5/HadCRUT5_supporting_information_accepted.pdf)). + version_producer: HadCRUT.5.0.2.0 + url_main: https://www.metoffice.gov.uk/hadobs/hadcrut5/ + url_download: |- + https://www.metoffice.gov.uk/hadobs/hadcrut5/data/HadCRUT.5.0.2.0/analysis/diagnostics/HadCRUT.5.0.2.0.analysis.summary_series.global.annual.csv + date_accessed: '2024-11-18' + date_published: '2024-11-18' + license: + name: Open Government License v3 + url: https://www.metoffice.gov.uk/hadobs/hadcrut5/terms_and_conditions.html +outs: + - md5: 55ffc2941c5fd3ad360ea06e3c443e3a + size: 6988 + path: near_surface_temperature_global.csv diff --git a/snapshots/met_office_hadley_centre/2024-11-18/near_surface_temperature_northern_hemisphere.csv.dvc b/snapshots/met_office_hadley_centre/2024-11-18/near_surface_temperature_northern_hemisphere.csv.dvc new file mode 100644 index 00000000000..5a2d9c81563 --- /dev/null +++ b/snapshots/met_office_hadley_centre/2024-11-18/near_surface_temperature_northern_hemisphere.csv.dvc @@ -0,0 +1,24 @@ +meta: + origin: + producer: Met Office Hadley Centre + title: HadCRUT5 + description: |- + The HadCRUT5 near surface temperature data set is produced by blending data from the CRUTEM5 surface air temperature dataset and the HadSST4 sea-surface temperature dataset. + + Temperature anomalies are based on the HadCRUT5 near-surface temperature dataset as published by the Met Office Hadley Centre. + title_snapshot: HadCRUT5 - Northern hemisphere near surface temperature + citation_full: |- + Morice, C.P., J.J. Kennedy, N.A. Rayner, J.P. Winn, E. Hogan, R.E. Killick, R.J.H. Dunn, T.J. Osborn, P.D. Jones and I.R. Simpson (in press) An updated assessment of near-surface temperature change from 1850: the HadCRUT5 dataset. Journal of Geophysical Research (Atmospheres) [doi:10.1029/2019JD032361](https://www.metoffice.gov.uk/hadobs/hadcrut5/HadCRUT5_accepted.pdf) ([supporting information](https://www.metoffice.gov.uk/hadobs/hadcrut5/HadCRUT5_supporting_information_accepted.pdf)). 
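+    # NOTE (not stated above): HadCRUT5 temperature anomalies are reported relative to the
+    # 1961-1990 reference period.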
+ version_producer: HadCRUT.5.0.2.0 + url_main: https://www.metoffice.gov.uk/hadobs/hadcrut5/ + url_download: |- + https://www.metoffice.gov.uk/hadobs/hadcrut5/data/HadCRUT.5.0.2.0/analysis/diagnostics/HadCRUT.5.0.2.0.analysis.summary_series.northern_hemisphere.annual.csv + date_accessed: '2024-11-18' + date_published: '2024-11-18' + license: + name: Open Government License v3 + url: https://www.metoffice.gov.uk/hadobs/hadcrut5/terms_and_conditions.html +outs: + - md5: dc611848c8e3ee283059affef018a163 + size: 6972 + path: near_surface_temperature_northern_hemisphere.csv diff --git a/snapshots/met_office_hadley_centre/2024-11-18/near_surface_temperature_southern_hemisphere.csv.dvc b/snapshots/met_office_hadley_centre/2024-11-18/near_surface_temperature_southern_hemisphere.csv.dvc new file mode 100644 index 00000000000..8f666ed9767 --- /dev/null +++ b/snapshots/met_office_hadley_centre/2024-11-18/near_surface_temperature_southern_hemisphere.csv.dvc @@ -0,0 +1,24 @@ +meta: + origin: + producer: Met Office Hadley Centre + title: HadCRUT5 + description: |- + The HadCRUT5 near surface temperature data set is produced by blending data from the CRUTEM5 surface air temperature dataset and the HadSST4 sea-surface temperature dataset. + + Temperature anomalies are based on the HadCRUT5 near-surface temperature dataset as published by the Met Office Hadley Centre. + title_snapshot: HadCRUT5 - Southern hemisphere near surface temperature + citation_full: |- + Morice, C.P., J.J. Kennedy, N.A. Rayner, J.P. Winn, E. Hogan, R.E. Killick, R.J.H. Dunn, T.J. Osborn, P.D. Jones and I.R. Simpson (in press) An updated assessment of near-surface temperature change from 1850: the HadCRUT5 dataset. Journal of Geophysical Research (Atmospheres) [doi:10.1029/2019JD032361](https://www.metoffice.gov.uk/hadobs/hadcrut5/HadCRUT5_accepted.pdf) ([supporting information](https://www.metoffice.gov.uk/hadobs/hadcrut5/HadCRUT5_supporting_information_accepted.pdf)). + version_producer: HadCRUT.5.0.2.0 + url_main: https://www.metoffice.gov.uk/hadobs/hadcrut5/ + url_download: |- + https://www.metoffice.gov.uk/hadobs/hadcrut5/data/HadCRUT.5.0.2.0/analysis/diagnostics/HadCRUT.5.0.2.0.analysis.summary_series.southern_hemisphere.annual.csv + date_accessed: '2024-11-18' + date_published: '2024-11-18' + license: + name: Open Government License v3 + url: https://www.metoffice.gov.uk/hadobs/hadcrut5/terms_and_conditions.html +outs: + - md5: 0e093420a2b56b7fa94009a8def6c7b1 + size: 6974 + path: near_surface_temperature_southern_hemisphere.csv diff --git a/snapshots/ophi/2024-10-28/multidimensional_poverty_index.py b/snapshots/ophi/2024-10-28/multidimensional_poverty_index.py new file mode 100644 index 00000000000..fb8dc8b1ec2 --- /dev/null +++ b/snapshots/ophi/2024-10-28/multidimensional_poverty_index.py @@ -0,0 +1,28 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + +# Define snapshot variants +SNAPSHOT_VARIANTS = ["cme", "hot"] + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + for variant in SNAPSHOT_VARIANTS: + # Create a new snapshot. + snap = Snapshot(f"ophi/{SNAPSHOT_VERSION}/multidimensional_poverty_index_{variant}.csv") + + # Download data from source, add file to DVC and upload to S3. 
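+        # (Variant "cme" refers to current margin estimates and "hot" to harmonized-over-time
+        # estimates; see the corresponding .dvc metadata files below for details.)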
+ snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/ophi/2024-10-28/multidimensional_poverty_index_cme.csv.dvc b/snapshots/ophi/2024-10-28/multidimensional_poverty_index_cme.csv.dvc new file mode 100644 index 00000000000..6d7635f29bd --- /dev/null +++ b/snapshots/ophi/2024-10-28/multidimensional_poverty_index_cme.csv.dvc @@ -0,0 +1,43 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Global Multidimensional Poverty Index (MPI) + description: |- + The global Multidimensional Poverty Index (MPI) is an international measure of acute multidimensional poverty covering over 100 developing countries. It complements traditional monetary poverty measures by capturing the acute deprivations in health, education, and living standards that a person faces simultaneously. + + The MPI assesses poverty at the individual level. If a person is deprived in a third or more of ten (weighted) indicators, the global MPI identifies them as ‘MPI poor’. The extent – or intensity – of their poverty is also measured through the percentage of deprivations they are experiencing. + + The global MPI shows who is poor and how they are poor and can be used to create a comprehensive picture of people living in poverty. It permits comparisons both across countries and world regions, and within countries by ethnic group, urban/rural area, subnational region, and age group, as well as other key household and community characteristics. For each group and for countries as a whole, the composition of MPI by each of the ten indicators shows how people are poor. + + This makes the MPI and its linked information platform invaluable as an analytical tool to identify the most vulnerable people – the poorest among the poor, revealing poverty patterns within countries and over time, enabling policy makers to target resources and design policies more effectively. + + The global MPI was developed by OPHI with the UN Development Programme (UNDP) for inclusion in UNDP’s flagship Human Development Report in 2010. It has been published annually by OPHI and in the HDRs ever since. + date_published: "2024-10-17" + version_producer: 2024 + title_snapshot: Global Multidimensional Poverty Index (MPI) - Current margin estimates (CME) + description_snapshot: |- + This dataset contains current margin estimates (CME), based on the most recent survey data. + + # Citation + producer: Alkire, Kanagaratnam and Suppa + citation_full: |- + - Alkire, S., Kanagaratnam, U., and Suppa, N. (2024). The Global Multidimensional Poverty Index (MPI) 2024. Country Results and Methodological Note. OPHI MPI Methodological Note 58, Oxford Poverty and Human Development Initiative, University of Oxford. + - Alkire, S., Kanagaratnam, U., and Suppa, N. (2024). The Global Multidimensional Poverty Index (MPI) 2024. Disaggregation Results and Methodological Note. OPHI MPI Methodological Note 59, Oxford Poverty and Human Development Initiative, University of Oxford. 
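+    # NOTE (illustrative): under the Alkire-Foster approach described above, the MPI equals H x A,
+    # where H is the share of people identified as MPI poor and A is the average share of weighted
+    # indicators in which the poor are deprived. For example, if 30% of people are poor and they
+    # are deprived, on average, in 45% of the weighted indicators, then MPI = 0.30 * 0.45 = 0.135.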
+ attribution: Alkire, Kanagaratnam and Suppa (2024) - The Global Multidimensional Poverty Index (MPI) 2024 + + # Files + url_main: https://ophi.org.uk/global-mpi + url_download: https://cloud-ophi.qeh.ox.ac.uk/index.php/s/eRLL5jGKPLTygYT/download?path=%2F&files=GMPI2024_puf.csv + date_accessed: 2024-10-28 + + # License + license: + name: CC BY 4.0 + url: https://ophi.org.uk/global-mpi-frequently-asked-questions + +outs: + - md5: b1513539648aa4f1a89b9a98cc3f4d91 + size: 19293090 + path: multidimensional_poverty_index_cme.csv diff --git a/snapshots/ophi/2024-10-28/multidimensional_poverty_index_hot.csv.dvc b/snapshots/ophi/2024-10-28/multidimensional_poverty_index_hot.csv.dvc new file mode 100644 index 00000000000..649c37f3bde --- /dev/null +++ b/snapshots/ophi/2024-10-28/multidimensional_poverty_index_hot.csv.dvc @@ -0,0 +1,43 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Global Multidimensional Poverty Index (MPI) + description: |- + The global Multidimensional Poverty Index (MPI) is an international measure of acute multidimensional poverty covering over 100 developing countries. It complements traditional monetary poverty measures by capturing the acute deprivations in health, education, and living standards that a person faces simultaneously. + + The MPI assesses poverty at the individual level. If a person is deprived in a third or more of ten (weighted) indicators, the global MPI identifies them as ‘MPI poor’. The extent – or intensity – of their poverty is also measured through the percentage of deprivations they are experiencing. + + The global MPI shows who is poor and how they are poor and can be used to create a comprehensive picture of people living in poverty. It permits comparisons both across countries and world regions, and within countries by ethnic group, urban/rural area, subnational region, and age group, as well as other key household and community characteristics. For each group and for countries as a whole, the composition of MPI by each of the ten indicators shows how people are poor. + + This makes the MPI and its linked information platform invaluable as an analytical tool to identify the most vulnerable people – the poorest among the poor, revealing poverty patterns within countries and over time, enabling policy makers to target resources and design policies more effectively. + + The global MPI was developed by OPHI with the UN Development Programme (UNDP) for inclusion in UNDP’s flagship Human Development Report in 2010. It has been published annually by OPHI and in the HDRs ever since. + date_published: "2024-10-17" + version_producer: 2024 + title_snapshot: Global Multidimensional Poverty Index (MPI) - Harmonized over time (HOT) + description_snapshot: |- + This dataset contains harmonized over time (HOT) estimates. This harmonization seeks to make two or more MPI estimations comparable by aligning the indicator definitions in each survey. + + # Citation + producer: Alkire, Kanagaratnam and Suppa + citation_full: |- + - Alkire, S., Kanagaratnam, U., and Suppa, N. (2024). A methodological note on the global Multidimensional Poverty Index (MPI) 2024 changes over time results for 86 countries. OPHI MPI Methodological Note 60, Oxford Poverty and Human Development Initiative, University of Oxford. + - Alkire, S., Kanagaratnam, U., and Suppa, N. (2024). The Global Multidimensional Poverty Index (MPI) 2024. Disaggregation Results and Methodological Note. 
OPHI MPI Methodological Note 59, Oxford Poverty and Human Development Initiative, University of Oxford. + attribution: Alkire, Kanagaratnam and Suppa (2024) - The Global Multidimensional Poverty Index (MPI) 2024 + + # Files + url_main: https://ophi.org.uk/global-mpi + url_download: https://cloud-ophi.qeh.ox.ac.uk/index.php/s/eRLL5jGKPLTygYT/download?path=%2F&files=GMPI_HOT_2024_puf.csv + date_accessed: 2024-10-28 + + # License + license: + name: CC BY 4.0 + url: https://ophi.org.uk/global-mpi-frequently-asked-questions + +outs: + - md5: 7de477b10168ce0dfea0c48046c1b905 + size: 31115054 + path: multidimensional_poverty_index_hot.csv diff --git a/snapshots/owid/latest/ig_countries.csv.dvc b/snapshots/owid/latest/ig_countries.csv.dvc new file mode 100644 index 00000000000..b0d41138d52 --- /dev/null +++ b/snapshots/owid/latest/ig_countries.csv.dvc @@ -0,0 +1,27 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Countries covered in IG posts + date_published: "2024" + + # Citation + producer: Our World in Data + citation_full: |- + Our World in Data. https://instagram.com/ourworldindata + + # Files + url_main: https://instagram.com/ourworldindata + date_accessed: 2024-11-07 + + # License + license: + name: CC BY 4.0 + + + is_public: false +outs: + - md5: ec13666b8ad90755ca913d8ad90a943d + size: 5399 + path: ig_countries.csv diff --git a/snapshots/owid/latest/ig_countries.py b/snapshots/owid/latest/ig_countries.py new file mode 100644 index 00000000000..efb5a160771 --- /dev/null +++ b/snapshots/owid/latest/ig_countries.py @@ -0,0 +1,33 @@ +"""Script to create a snapshot of dataset. + +File lives in GD: https://docs.google.com/spreadsheets/d/1SY7K_hyMtJUhyXDtQQwXAHgSNl22vrOqR6e3RljEU9I/edit?gid=917952968#gid=917952968 + +Example execution: + + python snapshots/owid/latest/ig_countries.py --path-to-file snapshots/owid/latest/countries.csv + etlr ig_countries --private +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"owid/{SNAPSHOT_VERSION}/ig_countries.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/un/2024-10-21/census_dates.csv.dvc b/snapshots/un/2024-10-21/census_dates.csv.dvc new file mode 100644 index 00000000000..742e1ef737d --- /dev/null +++ b/snapshots/un/2024-10-21/census_dates.csv.dvc @@ -0,0 +1,32 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: 2020 World Population and Housing Census Programme + description: |- + The 2020 World Population and Housing Census Programme recognizes population and housing censuses as one of the primary sources of data needed for formulating, implementing and monitoring policies and programmes aimed at inclusive socioeconomic development and environmental sustainability. 
It comprises a number of inter-related objectives and activities aimed at ensuring that Member States conduct at least one population and housing census during the period from 2015 to 2024. + date_published: "2024-07-05" + title_snapshot: Census dates for all countries + description_snapshot: |- + Census dates since 1985 for all member countries. + + # Citation + producer: UN Statistics Division + citation_full: |- + World Population and Housing Census Programme, UN Statistics Division, https://unstats.un.org/unsd/demographic-social/census/censusdates/ (2024) + attribution_short: UNSD + + # Files + url_main: https://unstats.un.org/unsd/demographic-social/census/censusdates/ + date_accessed: 2024-10-21 + + # License + license: + name: © 2024 United Nations + url: https://www.un.org/en/about-us/copyright + +outs: + - md5: cb3a5131fb76b2dcc54e3f0dbfaac756 + size: 27276 + path: census_dates.csv diff --git a/snapshots/un/2024-10-21/census_dates.py b/snapshots/un/2024-10-21/census_dates.py new file mode 100644 index 00000000000..54673d00541 --- /dev/null +++ b/snapshots/un/2024-10-21/census_dates.py @@ -0,0 +1,100 @@ +"""Script to create a snapshot of dataset. +This script creates a snapshot from a local file. +The local file is created by copy-pasting the data table from this website: https://unstats.un.org/unsd/demographic-social/census/censusdates/ and then running the "clean_file" function.""" + +from pathlib import Path + +import click +import pandas as pd + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +# These are helper functions to clean the data from the website. +def file_to_list(filename): + with open(filename) as f: + return [line.strip() for line in f] + + +def clean_list_entries(txt_ls): + rm_entries = [ + "AFRICA", + "ASIA", + "EUROPE", + "AMERICA, SOUTH", + "AMERICA, NORTH", + "OCEANIA", + "Countries or areas", + "1990 round", + "2000 round", + "2010 round", + "2020 round", + "(1985-1994)", + "(1995-2004)", + "(2005-2014)", + "(2015-2024)", + "-", + "", + "(16) -", + "(19) -", + "F", + ] + txt_ls = [x for x in txt_ls if x not in rm_entries] + + for i in range(len(txt_ls)): + if txt_ls[i].startswith("(") and txt_ls[i].endswith(")"): + txt_ls[i] = txt_ls[i][1:-1] + + return txt_ls + + +def list_to_dict(txt_ls): + i = 0 + rows = [] + while i < len(txt_ls): + entry = txt_ls[i] + last_element = entry.split(" ")[-1] + if not all(char.isdigit() for char in last_element): # if last element is not a number -> it is a country name + cty = entry + i += 1 + if i < len(txt_ls): + while (all(char.isdigit() for char in txt_ls[i].split(" ")[-1])) or ( + txt_ls[i] in ["(H) [2003]", "31 Dec.2011-31 Mar.2012", "1985-1989"] + ): + date = txt_ls[i] + rows.append({"Country": cty, "Date": date}) + i += 1 + if i == len(txt_ls): + break + else: + print("weird entry: ", entry) + i += 1 + + return rows + + +def clean_file(filename, output_filename): + """Clean the result of copy-pasting the website and save it as a csv.""" + txt_ls = file_to_list(filename) + txt_ls = clean_list_entries(txt_ls) + census_list = list_to_dict(txt_ls) + census_df = pd.DataFrame(census_list) + census_df.to_csv(output_filename, index=False) + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot.
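+    # (If starting from a raw copy-paste of the website table, first run, with hypothetical file
+    # names: clean_file("census_dates_raw.txt", "census_dates.csv"), and then pass the cleaned
+    # CSV via --path-to-file.)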
+ snap = Snapshot(f"un/{SNAPSHOT_VERSION}/census_dates.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/un/2024-12-02/un_wpp_lt.py b/snapshots/un/2024-12-02/un_wpp_lt.py new file mode 100644 index 00000000000..41b239039cc --- /dev/null +++ b/snapshots/un/2024-12-02/un_wpp_lt.py @@ -0,0 +1,47 @@ +"""Script to create a snapshot of dataset. + +While this ingest step could be done automatically, as of today (2024-12-02) downloading the files from UN WPP is extremely slow. Hence, I've decided to first manually download these and then run the snapshot ingest script. + +To download these files: + + 1. Go to the CSV Format section of UN WPP page: https://population.un.org/wpp/Download/Standard/CSV/ + 2. Download the compressed CSV files of the Life Tables with the estimates (1950-2023): + - https://population.un.org/wpp/Download/Files/1_Indicator%20(Standard)/CSV_FILES/WPP2024_Life_Table_Complete_Medium_Both_1950-2023.csv.gz + - https://population.un.org/wpp/Download/Files/1_Indicator%20(Standard)/CSV_FILES/WPP2024_Life_Table_Complete_Medium_Female_1950-2023.csv.gz + - https://population.un.org/wpp/Download/Files/1_Indicator%20(Standard)/CSV_FILES/WPP2024_Life_Table_Complete_Medium_Male_1950-2023.csv.gz + 3. Run the snapshot script and wait for it to be ingested into S3: + python snapshots/un/2024-12-02/un_wpp_lt.py --path-to-file-all /path/WPP2024_Life_Table_Complete_Medium_Both_1950-2023.csv.gz --path-to-file-f /path/WPP2024_Life_Table_Complete_Medium_Female_1950-2023.csv.gz --path-to-file-m /path/WPP2024_Life_Table_Complete_Medium_Male_1950-2023.csv.gz + +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file-all", prompt=True, type=str, help="Path to local data file (both sexes).") +@click.option("--path-to-file-f", prompt=True, type=str, help="Path to local data file (female).") +@click.option("--path-to-file-m", prompt=True, type=str, help="Path to local data file (male).") +def main(path_to_file_all: str, path_to_file_f: str, path_to_file_m: str, upload: bool) -> None: + snaps = [ + ("un_wpp_lt_all", path_to_file_all), # ALL + ("un_wpp_lt_f", path_to_file_f), # FEMALE + ("un_wpp_lt_m", path_to_file_m), # MALE + ] + + for snap_props in snaps: + # Create a new snapshot. + snap = Snapshot(f"un/{SNAPSHOT_VERSION}/{snap_props[0]}.csv") + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=snap_props[1], upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/un/2024-12-02/un_wpp_lt_all.csv.dvc b/snapshots/un/2024-12-02/un_wpp_lt_all.csv.dvc new file mode 100644 index 00000000000..cfae7bef621 --- /dev/null +++ b/snapshots/un/2024-12-02/un_wpp_lt_all.csv.dvc @@ -0,0 +1,36 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: World Population Prospects + description: |- + The World Population Prospects 2024 is the 28th edition of the official estimates and projections of the global population published by the United Nations since 1951.
The estimates are based on all available sources of data on population size and levels of fertility, mortality, and international migration for 237 countries or areas. + + For each revision, any new, recent, and historical information that has become available from population censuses, vital registration of births and deaths, and household surveys is considered to produce consistent time series of population estimates for each country or area from 1950 to today. + + For the estimation period between 1950 and 2023, data from 1,910 censuses were considered in the present evaluation, which is 79 more than the 2022 revision. In some countries, population registers based on administrative data systems provide the necessary information. Population data from censuses or registers referring to 2019 or later were available for 114 countries or areas, representing 48 per cent of the 237 countries or areas included in this analysis (and 54 per cent of the world population). For 43 countries or areas, the most recent available population count was from the period 2014-2018, and for another 57 locations from the period 2009-2013. For the remaining 23 countries or areas, the most recent available census data were from before 2009, that is, more than 15 years ago. + date_published: 2024-07-11 + title_snapshot: World Population Prospects - Life Tables (Both sexes) + description_snapshot: |- + Provides single-age life tables up to age 100 for both sexes with a set of values showing the mortality experience of a hypothetical group of infants born at the same time and subject throughout their lifetime to the specific mortality rates of a given year. The following series are provided: age-specific mortality rates (mx), probabilities of dying (qx), probabilities of surviving (px), number surviving (lx), number dying (dx), number of person-years lived (Lx), survivorship ratios (Sx), cumulative stationary population (Tx), average remaining life expectancy (ex) and average number of years lived (ax). + + # Citation + producer: United Nations + citation_full: |- + United Nations, Department of Economic and Social Affairs, Population Division (2024). World Population Prospects 2024, Online Edition. + attribution: UN, World Population Prospects (2024) + attribution_short: UN WPP + + # Files + url_main: https://population.un.org/wpp/Download/ + date_accessed: 2024-12-02 + + # License + license: + name: CC BY 3.0 IGO + url: https://population.un.org/wpp/Download/Standard/MostUsed/ outs: + - md5: 8efb8acd80396c280c61e8de2cd94fb6 + size: 200036668 + path: un_wpp_lt_all.csv diff --git a/snapshots/un/2024-12-02/un_wpp_lt_f.csv.dvc b/snapshots/un/2024-12-02/un_wpp_lt_f.csv.dvc new file mode 100644 index 00000000000..315bc32d7a3 --- /dev/null +++ b/snapshots/un/2024-12-02/un_wpp_lt_f.csv.dvc @@ -0,0 +1,36 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: World Population Prospects + description: |- + The World Population Prospects 2024 is the 28th edition of the official estimates and projections of the global population published by the United Nations since 1951. The estimates are based on all available sources of data on population size and levels of fertility, mortality, and international migration for 237 countries or areas.
+ + For each revision, any new, recent, and historical information that has become available from population censuses, vital registration of births and deaths, and household surveys is considered to produce consistent time series of population estimates for each country or area from 1950 to today. + + For the estimation period between 1950 and 2023, data from 1,910 censuses were considered in the present evaluation, which is 79 more than the 2022 revision. In some countries, population registers based on administrative data systems provide the necessary information. Population data from censuses or registers referring to 2019 or later were available for 114 countries or areas, representing 48 per cent of the 237 countries or areas included in this analysis (and 54 per cent of the world population). For 43 countries or areas, the most recent available population count was from the period 2014-2018, and for another 57 locations from the period 2009-2013. For the remaining 23 countries or areas, the most recent available census data were from before 2009, that is, more than 15 years ago. + date_published: 2024-07-11 + title_snapshot: World Population Prospects - Life Tables (Female) + description_snapshot: |- + Provides single-age life tables up to age 100 for females with a set of values showing the mortality experience of a hypothetical group of infants born at the same time and subject throughout their lifetime to the specific mortality rates of a given year. The following series are provided: age-specific mortality rates (mx), probabilities of dying (qx), probabilities of surviving (px), number surviving (lx), number dying (dx), number of person-years lived (Lx), survivorship ratios (Sx), cumulative stationary population (Tx), average remaining life expectancy (ex) and average number of years lived (ax). + + # Citation + producer: United Nations + citation_full: |- + United Nations, Department of Economic and Social Affairs, Population Division (2024). World Population Prospects 2024, Online Edition. + attribution: UN, World Population Prospects (2024) + attribution_short: UN WPP + + # Files + url_main: https://population.un.org/wpp/Download/ + date_accessed: 2024-12-02 + + # License + license: + name: CC BY 3.0 IGO + url: https://population.un.org/wpp/Download/Standard/MostUsed/ outs: + - md5: bc8637e9e160cc99e496e245f1d4bff6 + size: 199164889 + path: un_wpp_lt_f.csv diff --git a/snapshots/un/2024-12-02/un_wpp_lt_m.csv.dvc b/snapshots/un/2024-12-02/un_wpp_lt_m.csv.dvc new file mode 100644 index 00000000000..f4e272590a4 --- /dev/null +++ b/snapshots/un/2024-12-02/un_wpp_lt_m.csv.dvc @@ -0,0 +1,36 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: World Population Prospects + description: |- + The World Population Prospects 2024 is the 28th edition of the official estimates and projections of the global population published by the United Nations since 1951. The estimates are based on all available sources of data on population size and levels of fertility, mortality, and international migration for 237 countries or areas.
+ + For each revision, any new, recent, and historical information that has become available from population censuses, vital registration of births and deaths, and household surveys is considered to produce consistent time series of population estimates for each country or area from 1950 to today. + + For the estimation period between 1950 and 2023, data from 1,910 censuses were considered in the present evaluation, which is 79 more than the 2022 revision. In some countries, population registers based on administrative data systems provide the necessary information. Population data from censuses or registers referring to 2019 or later were available for 114 countries or areas, representing 48 per cent of the 237 countries or areas included in this analysis (and 54 per cent of the world population). For 43 countries or areas, the most recent available population count was from the period 2014-2018, and for another 57 locations from the period 2009-2013. For the remaining 23 countries or areas, the most recent available census data were from before 2009, that is, more than 15 years ago. + date_published: 2024-07-11 + title_snapshot: World Population Prospects - Life Tables (Male) + description_snapshot: |- + Provides single-age life tables up to age 100 for males with a set of values showing the mortality experience of a hypothetical group of infants born at the same time and subject throughout their lifetime to the specific mortality rates of a given year. The following series are provided: age-specific mortality rates (mx), probabilities of dying (qx), probabilities of surviving (px), number surviving (lx), number dying (dx), number of person-years lived (Lx), survivorship ratios (Sx), cumulative stationary population (Tx), average remaining life expectancy (ex) and average number of years lived (ax). + + # Citation + producer: United Nations + citation_full: |- + United Nations, Department of Economic and Social Affairs, Population Division (2024). World Population Prospects 2024, Online Edition. + attribution: UN, World Population Prospects (2024) + attribution_short: UN WPP + + # Files + url_main: https://population.un.org/wpp/Download/ + date_accessed: 2024-12-02 + + # License + license: + name: CC BY 3.0 IGO + url: https://population.un.org/wpp/Download/Standard/MostUsed/ outs: + - md5: da77cb47298654d160e523568d443b29 + size: 199442259 + path: un_wpp_lt_m.csv diff --git a/snapshots/unesco/2024-11-21/enrolment_rates.csv.dvc b/snapshots/unesco/2024-11-21/enrolment_rates.csv.dvc new file mode 100644 index 00000000000..23593d8a089 --- /dev/null +++ b/snapshots/unesco/2024-11-21/enrolment_rates.csv.dvc @@ -0,0 +1,32 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: UNESCO Institute for Statistics (UIS) - Enrolment Rates (OPRI) + description: |- + The UNESCO Institute for Statistics (UIS) is the official and trusted source of internationally-comparable data on education, science, culture and communication. As the official statistical agency of UNESCO, the UIS produces a wide range of state-of-the-art databases to fuel the policies and investments needed to transform lives and propel the world towards its development goals. The UIS provides free access to data for all UNESCO countries and regional groupings from 1970 to the most recent year available.
+ + date_published: "2024-02-01" + title_snapshot: UNESCO Institute for Statistics (UIS) - Enrolment Rates (OPRI) + description_snapshot: |- + This snapshot is for the most recent enrolment rates data from the Other Policy Relevant Indicators dataset. + + # Citation + producer: UNESCO Institute for Statistics + citation_full: |- + UNESCO Institute for Statistics (UIS), Education, https://uis.unesco.org/bdds, 2024 + attribution_short: UIS + + # Files + url_main: https://data.uis.unesco.org/index.aspx?queryid=3813 + date_accessed: 2024-11-21 + + # License + license: + name: CC BY 3.0 IGO + url: http://creativecommons.org/licenses/by-sa/3.0/igo/ outs: + - md5: 138230fdaf14bdc647b7e05e244e0bb9 + size: 5972916 + path: enrolment_rates.csv diff --git a/snapshots/unesco/2024-11-21/enrolment_rates.py b/snapshots/unesco/2024-11-21/enrolment_rates.py new file mode 100644 index 00000000000..3e1a2af0edf --- /dev/null +++ b/snapshots/unesco/2024-11-21/enrolment_rates.py @@ -0,0 +1,28 @@ +"""Script to create a snapshot of dataset. +To download the CSV file, go to https://data.uis.unesco.org/index.aspx?queryid=3813, click on the data for Other Policy Relevant Indicators (full dataset) and then export as CSV. +This data should be the most recent version of the OPRI dataset. +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"unesco/{SNAPSHOT_VERSION}/enrolment_rates.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/urbanization/2024-10-14/ghsl_degree_of_urbanisation.xlsx.dvc b/snapshots/urbanization/2024-10-14/ghsl_degree_of_urbanisation.xlsx.dvc index 1701120fca4..9d606d204d9 100644 --- a/snapshots/urbanization/2024-10-14/ghsl_degree_of_urbanisation.xlsx.dvc +++ b/snapshots/urbanization/2024-10-14/ghsl_degree_of_urbanisation.xlsx.dvc @@ -11,10 +11,10 @@ meta: # Citation producer: European Commission, Joint Research Centre (JRC) citation_full: |- - Carioli A., Schiavina M., Melchiorri M. (2024): GHS-COUNTRY-STATS R2024A - GHSL Country Statistics by Degree of Urbanization, multitemporal (1975-2030). European Commission, Joint Research Centre (JRC) [Dataset] doi:10.2905/341c0608-5ca5-4ddb-b068-a412e35a3326 PID: http://data.europa.eu/89h/341c0608-5ca5-4ddb-b068-a412e35a3326 + Carioli, Alessandra; Schiavina, Marcello; Melchiorri, Michele (2024): GHS-COUNTRY-STATS R2024A - GHSL Country Statistics by Degree of Urbanization, multitemporal (1975-2030).
European Commission, Joint Research Centre (JRC) [Dataset] doi: 10.2905/341c0608-5ca5-4ddb-b068-a412e35a3326 PID: http://data.europa.eu/89h/341c0608-5ca5-4ddb-b068-a412e35a3326 # Files - url_main: https://ghsl.jrc.ec.europa.eu/CFS.php + url_main: https://data.jrc.ec.europa.eu/dataset/341c0608-5ca5-4ddb-b068-a412e35a3326 date_accessed: 2024-10-14 # License diff --git a/snapshots/urbanization/2024-12-02/ghsl_urban_centers.py b/snapshots/urbanization/2024-12-02/ghsl_urban_centers.py new file mode 100644 index 00000000000..4c0ede9b1f5 --- /dev/null +++ b/snapshots/urbanization/2024-12-02/ghsl_urban_centers.py @@ -0,0 +1,25 @@ +"""Script to create a snapshot of dataset. This version of the dataset was provided directly by the source via email (DIJKSTRA Lewis).""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"urbanization/{SNAPSHOT_VERSION}/ghsl_urban_centers.xlsx") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/urbanization/2024-12-02/ghsl_urban_centers.xlsx.dvc b/snapshots/urbanization/2024-12-02/ghsl_urban_centers.xlsx.dvc new file mode 100644 index 00000000000..f2c69bd3b2c --- /dev/null +++ b/snapshots/urbanization/2024-12-02/ghsl_urban_centers.xlsx.dvc @@ -0,0 +1,33 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: Global Human Settlement Layer Dataset - Stats in the City Database + description: |- + The "Stats in the City Database" offers harmonized data on population and population density for 11,422 urban centres. + + This data, based on the Global Human Settlement Layer Dataset, uses the Degree of Urbanisation framework to delineate spatial entities and integrates geospatial data from a variety of open-source datasets. It represents one of the most comprehensive resources for understanding urban population patterns and densities worldwide. + date_published: "2024" + + # Citation + producer: European Commission, Joint Research Centre (JRC) + citation_full: |- + Center For International Earth Science Information Network-CIESIN-Columbia University. 2018. “Gridded Population of the World, Version 4 (GPWv4): Population Count, Revision 11.” Palisades, NY: NASA Socioeconomic Data and Applications Center (SEDAC). https://doi.org/10.7927/H4JW8BX5 + Pesaresi M., Politis P. (2023): GHS-BUILT-V R2023A - GHS built-up volume grids derived from joint assessment of Sentinel2, Landsat, and global DEM data, multitemporal (1975-2030). European Commission, Joint Research Centre (JRC) PID: http://data.europa.eu/89h/ab2f107a-03cd-47a3-85e5-139d8ec63283, doi:10.2905/AB2F107A-03CD-47A3-85E5-139D8EC63283 + Pesaresi M., Politis P.
(2023): GHS-BUILT-S R2023A - GHS built-up surface grid, derived from Sentinel2 composite and Landsat, multitemporal (1975-2030). European Commission, Joint Research Centre (JRC) PID: http://data.europa.eu/89h/9f06f36f-4b11-47ec-abb0-4f8b7b1d72ea, doi:10.2905/9F06F36F-4B11-47EC-ABB0-4F8B7B1D72EA + Schiavina M., Freire S., Carioli A., MacManus K. (2023): GHS-POP R2023A - GHS population grid multitemporal (1975-2030). European Commission, Joint Research Centre (JRC) PID: http://data.europa.eu/89h/2ff68a52-5b5b-4a22-8f40-c41da8332cfe, doi:10.2905/2FF68A52-5B5B-4A22-8F40-C41DA8332CFE + Schiavina M., Melchiorri M., Pesaresi M. (2023): GHS-SMOD R2023A - GHS settlement layers, application of the Degree of Urbanisation methodology (stage I) to GHS-POP R2023A and GHS-BUILT-S R2023A, multitemporal (1975-2030). European Commission, Joint Research Centre (JRC) PID: http://data.europa.eu/89h/a0df7a6f-49de-46ea-9bde-563437a6e2ba, doi:10.2905/A0DF7A6F-49DE-46EA-9BDE-563437A6E2BA + + url_main: https://human-settlement.emergency.copernicus.eu/ghs_ucdb_2024.php + date_accessed: 2024-12-02 + + # License + license: + name: CC BY 4.0 + url: https://commission.europa.eu/legal-notice_en outs: + - md5: 78dbc4fc3cbcbe24cd51fe4f884319e2 + size: 2963003 + path: ghsl_urban_centers.xlsx diff --git a/snapshots/usgs/2024-07-15/historical_statistics_for_mineral_and_material_commodities.py b/snapshots/usgs/2024-07-15/historical_statistics_for_mineral_and_material_commodities.py index 911e9df6b16..3584244c9d7 100644 --- a/snapshots/usgs/2024-07-15/historical_statistics_for_mineral_and_material_commodities.py +++ b/snapshots/usgs/2024-07-15/historical_statistics_for_mineral_and_material_commodities.py @@ -32,26 +32,24 @@ def get_table_of_commodities_and_urls(url_main: str) -> pd.DataFrame: # Extract the rows data = [] - for row in table.find_all("tr")[2:]: # type: ignore + for row in table.find_all("tr")[2:]: cols = row.find_all("td") - if len(cols) >= 6: # Ensure there are enough columns - commodity = cols[0].get_text(strip=True) - supply_demand_url = cols[1].find("a")["href"] if cols[1].find("a") else "NA" - supply_demand_year_update = cols[2].get_text(strip=True) - end_use_url = cols[4].find("a")["href"] if cols[4].find("a") else "NA" - end_use_year_update = cols[5].get_text(strip=True) - - # Add row to data if urls are not 'NA' - if not supply_demand_url.lower().endswith(".pdf") and not end_use_url.lower().endswith(".pdf"): - data.append( - [ - commodity, - supply_demand_url if not supply_demand_url.lower().endswith(".pdf") else "NA", - supply_demand_year_update, - end_use_url if not end_use_url.lower().endswith(".pdf") else "NA", - end_use_year_update, - ] - ) + commodity = cols[0].get_text(strip=True) if len(cols) > 0 else "NA" + supply_demand_url = cols[1].find("a")["href"] if len(cols) > 1 and cols[1].find("a") else "NA" + supply_demand_year_update = cols[2].get_text(strip=True) if len(cols) > 2 else "NA" + end_use_url = cols[4].find("a")["href"] if len(cols) > 4 and cols[4].find("a") else "NA" + end_use_year_update = cols[5].get_text(strip=True) if len(cols) > 5 else "NA" + + # Add row to data regardless of whether URLs are present + data.append( + [ + commodity, + supply_demand_url, + supply_demand_year_update, + end_use_url,
end_use_year_update, + ] + ) # Create a DataFrame with the fetched data. df = pd.DataFrame( @@ -105,15 +103,6 @@ def _fetch_file_url_from_media_path(media_path: str) -> Optional[str]: return data_file_url # type: ignore -def _download_file(url: str, dest_folder: Path, commodity: str) -> None: - response = requests.get(url) - file_path = dest_folder / f"{underscore(commodity)}.xlsx" - file_path.write_bytes(response.content) - - # Wait before sending next query. - sleep(TIME_BETWEEN_QUERIES) - - def download_all_files(df: pd.DataFrame, snapshot_path: Path) -> None: # Ensure the output folder exists. snapshot_path.parent.mkdir(exist_ok=True, parents=True) @@ -129,7 +118,7 @@ def download_all_files(df: pd.DataFrame, snapshot_path: Path) -> None: end_use_dir.mkdir(parents=True, exist_ok=True) # Download files for all commodities. - for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Downloading files"): + for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Downloading files"): if row["supply_demand_url"] != "NA": download_file_from_url( url=row["supply_demand_url"], @@ -167,11 +156,12 @@ def main(upload: bool) -> None: # NOTE: This may take a few minutes. df = get_table_of_commodities_and_urls(url_main=snap.metadata.origin.url_main) # type: ignore - # Download the supply-demand statistics file and end-use statistics file for each commodity. + # Download the supply-demand statistics file and end-use statistics file for each commodity in a temporary folder. + # A compressed file will be created at the end in the corresponding data/snapshots/ folder. # NOTE: This may take a few minutes. download_all_files(df=df, snapshot_path=snap.path) - # Upload file to S3. + # Upload zip file to S3. snap.create_snapshot(upload=upload, filename=snap.path) diff --git a/snapshots/usgs/2024-07-15/historical_statistics_for_mineral_and_material_commodities.zip.dvc b/snapshots/usgs/2024-07-15/historical_statistics_for_mineral_and_material_commodities.zip.dvc index e0c6e244f9d..8a8d7286703 100644 --- a/snapshots/usgs/2024-07-15/historical_statistics_for_mineral_and_material_commodities.zip.dvc +++ b/snapshots/usgs/2024-07-15/historical_statistics_for_mineral_and_material_commodities.zip.dvc @@ -28,6 +28,6 @@ meta: url: https://www.usgs.gov/centers/national-minerals-information-center/historical-statistics-mineral-and-material-commodities outs: - - md5: bae42fc20438313cb7dd37a93682f6e6 - size: 7838597 + - md5: 13a7622cbcb1e671075622d8c022b295 + size: 8750002 path: historical_statistics_for_mineral_and_material_commodities.zip diff --git a/snapshots/war/2024-11-22/ucdp_ced.py b/snapshots/war/2024-11-22/ucdp_ced.py new file mode 100644 index 00000000000..0519de5618d --- /dev/null +++ b/snapshots/war/2024-11-22/ucdp_ced.py @@ -0,0 +1,39 @@ +"""Script to create a snapshot of dataset 'UCDP Candidate Events Dataset'. + +The UCDP Candidate Events Dataset (UCDP Candidate) is based on UCDP Georeferenced Event Dataset (UCDP GED), but published at a monthly release cycle. It makes available monthly releases of candidate events data with not more than a month’s lag globally. See codebook for similarities and differences between the two products. + +Go to https://ucdp.uu.se/downloads/index.html#candidate to find the latest available versions. +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset.
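+# (Two UCDP Candidate releases are snapshotted by this script: a quarterly file covering
+# January-September 2024 and a monthly file covering October 2024; see VERSIONS below and the
+# corresponding .dvc files.)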
+SNAPSHOT_VERSION = Path(__file__).parent.name + +VERSIONS = [ + "v24_0_10", + "v24_01_24_09", +] + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + for version in VERSIONS: + snapshot_path = f"war/{SNAPSHOT_VERSION}/ucdp_ced_{version}.csv" + snap = Snapshot(snapshot_path) + snap.download_from_source() + snap.dvc_add(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/war/2024-11-22/ucdp_ced_v24_01_24_09.csv.dvc b/snapshots/war/2024-11-22/ucdp_ced_v24_01_24_09.csv.dvc new file mode 100644 index 00000000000..a7b661f0369 --- /dev/null +++ b/snapshots/war/2024-11-22/ucdp_ced_v24_01_24_09.csv.dvc @@ -0,0 +1,28 @@ +meta: + origin: + producer: Uppsala Conflict Data Program + title: Candidate Events Dataset + description: |- + The UCDP Candidate Events Dataset (UCDP Candidate) is based on the UCDP Georeferenced Event Dataset (UCDP GED), but published at a monthly release cycle. It makes available monthly releases of candidate events data with not more than a month’s lag globally. See the codebook for similarities and differences between the two products. + + You can find more notes at https://ucdp.uu.se/downloads/candidateged/ucdp-candidate-codebook1.3.pdf + title_snapshot: Candidate Events Dataset (January - September 2024) + description_snapshot: |- + This is the third quarterly export, covering events from January to September 2024. + citation_full: |- + Hegre, Håvard, Mihai Croicu, Kristine Eck, and Stina Högbladh, July 2020. “Introducing the UCDP Candidate Events Dataset”, Research & Politics. doi:10.1177/2053168020935257 + + Högbladh Stina, 2023, “UCDP Candidate Events Dataset Codebook, v.1.2”, Department of Peace and Conflict Research, Uppsala University. + attribution_short: UCDP + version_producer: v24.01.24.09 + url_main: https://ucdp.uu.se/downloads/index.html#candidate + url_download: https://ucdp.uu.se/downloads/candidateged/GEDEvent_v24_01_24_09.csv + date_accessed: "2024-11-22" + date_published: "2024-10-20" + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ +outs: + - md5: 78a92b457f82f411c973c2795fbb7286 + size: 16856392 + path: ucdp_ced_v24_01_24_09.csv diff --git a/snapshots/war/2024-11-22/ucdp_ced_v24_0_10.csv.dvc b/snapshots/war/2024-11-22/ucdp_ced_v24_0_10.csv.dvc new file mode 100644 index 00000000000..aed250710f0 --- /dev/null +++ b/snapshots/war/2024-11-22/ucdp_ced_v24_0_10.csv.dvc @@ -0,0 +1,28 @@ +meta: + origin: + producer: Uppsala Conflict Data Program + title: Candidate Events Dataset + description: |- + The UCDP Candidate Events Dataset (UCDP Candidate) is based on the UCDP Georeferenced Event Dataset (UCDP GED), but published at a monthly release cycle. It makes available monthly releases of candidate events data with not more than a month’s lag globally. See the codebook for similarities and differences between the two products. + + You can find more notes at https://ucdp.uu.se/downloads/candidateged/ucdp-candidate-codebook1.3.pdf + title_snapshot: Candidate Events Dataset (October 2024) + description_snapshot: |- + This is a monthly release, covering events in October 2024. + citation_full: |- + Hegre, Håvard, Mihai Croicu, Kristine Eck, and Stina Högbladh, July 2020. “Introducing the UCDP Candidate Events Dataset”, Research & Politics. doi:10.1177/2053168020935257 + + Högbladh Stina, 2023, “UCDP Candidate Events Dataset Codebook, v.1.2”, Department of Peace and Conflict Research, Uppsala University.
+ attribution_short: UCDP + version_producer: 24.0.10 + url_main: https://ucdp.uu.se/downloads/index.html#candidate + url_download: https://ucdp.uu.se/downloads/candidateged/GEDEvent_v24_0_10.csv + date_accessed: "2024-11-22" + date_published: "2024-10-20" + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ +outs: + - md5: a081ef74bc40dfa3bf79caa5406bfbe7 + size: 2268616 + path: ucdp_ced_v24_0_10.csv diff --git a/snapshots/wb/2024-11-04/edstats.csv.dvc b/snapshots/wb/2024-11-04/edstats.csv.dvc new file mode 100644 index 00000000000..35b98c01b56 --- /dev/null +++ b/snapshots/wb/2024-11-04/edstats.csv.dvc @@ -0,0 +1,26 @@ +meta: + origin: + # Data product / Snapshot + title: World Bank Education Statistics (EdStats) + description: |- + The World Bank EdStats database offers a comprehensive array of over 8,000 internationally comparable indicators related to education access, progression, completion, literacy, teachers, demographics, and expenditures. It covers the education cycle from pre-primary to vocational and tertiary education, including data on learning outcomes from assessments like PISA, TIMSS, PIRLS, equity data from household surveys, and educational projections up to 2050. + date_published: '2024-10-01' + + # Citation + producer: World Bank + citation_full: |- + World Bank Education Statistics (EdStats), World Bank, 2023. Licence: CC BY 4.0. + attribution_short: World Bank + + # Files + url_main: https://datacatalog.worldbank.org/search/dataset/0038480/education-statistics + date_accessed: 2024-11-04 + + # License + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ +outs: + - md5: 83f93b0ab9ba48850ea75045d76c8c38 + size: 31626492 + path: edstats.csv diff --git a/snapshots/wb/2024-11-04/edstats.py b/snapshots/wb/2024-11-04/edstats.py new file mode 100644 index 00000000000..6943826e741 --- /dev/null +++ b/snapshots/wb/2024-11-04/edstats.py @@ -0,0 +1,150 @@ +""" +The script fetches education data from the World Bank API and adds metadata for each indicator. +The data is fetched with parallel requests, largely because we previously imported all of the data from this database and want to keep the same structure in case we need to expand the number of indicators we import in the future. +At the moment we extract from the grapher database only the indicators that are actually used in our charts, and then fetch the data and the metadata for those indicators alone; this might change in the future.""" + +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import click +import numpy as np +import pandas as pd +import requests +import world_bank_data as wb +from owid.datautils.io import df_to_file +from tqdm import tqdm + +from etl.db import get_engine +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"wb/{SNAPSHOT_VERSION}/edstats.csv") + + # Fetch data from the World Bank API. + wb_education_df = get_data() + + # Fetch metadata for each indicator, one request at a time (a parallel variant is sketched in the comment below).
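+ # A minimal sketch (not what actually runs here) of doing this metadata fetch in parallel + # with ThreadPoolExecutor, mirroring how get_data() below fetches the data itself: + # + # with ThreadPoolExecutor() as executor: + # results = list(executor.map(fetch_indicator_metadata, indicators))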
+ indicators = wb_education_df["wb_seriescode"].unique() + results = [] + + for indicator in indicators: + result = fetch_indicator_metadata(indicator) + results.append(result) + + # Create a temporary DataFrame from the results + temp_df = pd.DataFrame(results, columns=["wb_seriescode", "source_note", "source"]) + + # Merge the results back into the original DataFrame + df = pd.merge(temp_df, wb_education_df, on="wb_seriescode", how="right") + df_to_file(df, file_path=snap.path) + # Add the file to DVC and upload it to S3. + snap.dvc_add(upload=upload) + + +def fetch_indicator_metadata(indicator): + # Fetch metadata for an indicator + url = f"https://api.worldbank.org/v2/indicator/{indicator}?format=json" + response = requests.get(url) + if response.status_code == 200: + data = response.json() + # Check if the response contains the expected data structure + if len(data) > 1 and isinstance(data[1], list) and len(data[1]) > 0: + source_note = data[1][0].get("sourceNote", np.nan) + source = data[1][0].get("sourceOrganization", np.nan) + return indicator, source_note, source + else: + print(f"No metadata found for indicator: {indicator}") + return indicator, np.nan, np.nan + else: + print(f"Failed to fetch data for indicator: {indicator}. Status code: {response.status_code}") + return indicator, np.nan, np.nan + + +def used_world_bank_ids(): + # This will connect to MySQL from the specified ENV, so to run it against production you'd run + # ETL=.env.prod python snapshots/wb/2024-11-04/edstats.py + engine = get_engine() + q = """ + select distinct + SUBSTRING_INDEX(SUBSTRING(v.descriptionFromProducer, LOCATE('World Bank variable id: ', v.descriptionFromProducer) + LENGTH('World Bank variable id: ')), ' ', 1) AS wb_id, + v.* + from chart_dimensions as cd + join charts as c on c.id = cd.chartId + join variables as v on v.id = cd.variableId + where v.datasetId = 6194 + """ + df = pd.read_sql(q, engine) + df["wb_id"] = df["wb_id"].str.replace(r"\n\nOriginal", "", regex=True) + + return list(df["wb_id"].unique()) + + +def fetch_education_data(education_code: str) -> pd.DataFrame: + """ + Fetches education data for the given code from the World Bank API. + + Args: + education_code (str): Education code for fetching data. + + Returns: + DataFrame: DataFrame with fetched data, or an empty DataFrame if an error occurs. + """ + try: + # Fetch data for the given indicator code + data_series = wb.get_series(education_code) + + # Convert the series to a DataFrame and reset the index + df = data_series.to_frame("value").reset_index() + df["wb_seriescode"] = education_code + df.dropna(subset=["value"], inplace=True) + + return df + except ValueError as e: + print(f"ValueError: {e}") + except Exception as e: + print(f"An error occurred while fetching the data: {e}") + + return pd.DataFrame() # Return an empty DataFrame in case of an error + + +def get_data(): + """ + Fetches education data for every indicator that is used in our charts. + + Returns: + DataFrame: DataFrame with education data for all indicators. + """ + # Get the list of World Bank series codes from live Grapher + wb_ids = used_world_bank_ids() + + # Some variables were created post hoc and don't use the standard World Bank id convention + wb_ids = [element for element in wb_ids if element is not None] + + # Add some additional World Bank indicators that aren't used in the charts directly but other datasets use them.
+ wb_ids = wb_ids + ["PRJ.ATT.15UP.NED.MF", "SE.ADT.LITR.ZS"] + + # Assert that the list of indicators is not empty + assert len(wb_ids) > 0, "The list wb_ids is empty after removing None elements." + + with ThreadPoolExecutor() as executor: + futures = [executor.submit(fetch_education_data, code) for code in wb_ids] + wb_education = [f.result() for f in tqdm(futures, total=len(wb_ids), desc="Fetching data")] + + # Concatenate the fetched dataframes (fetches that failed contribute empty dataframes) + wb_education_df = pd.concat(wb_education, ignore_index=True) + + return wb_education_df + + +if __name__ == "__main__": + main() diff --git a/snapshots/wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.py b/snapshots/wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.py new file mode 100644 index 00000000000..0fa8bed05b3 --- /dev/null +++ b/snapshots/wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.py @@ -0,0 +1,24 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"wb/{SNAPSHOT_VERSION}/reproducibility_package_poverty_prosperity_planet.zip") + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.zip.dvc b/snapshots/wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.zip.dvc new file mode 100644 index 00000000000..08689d46ba7 --- /dev/null +++ b/snapshots/wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.zip.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Reproducibility package for Poverty, Prosperity and Planet Report 2024 + description: |- + The World Bank has set a clear mission: ending extreme poverty and boosting shared prosperity on a livable planet. This new edition of the biennial series, previously titled Poverty and Shared Prosperity, assesses the three components of the mission and emphasizes that reducing poverty and increasing shared prosperity must be achieved without high costs to the environment. The current polycrisis—where the multiple crises of slow economic growth, increased fragility, climate risks, and heightened uncertainty have come together at the same time—makes national development strategies and international cooperation difficult. This overview summarizes the progress toward achieving these goals, outlines promising pathways to speed up the progress on multiple fronts, and proposes priorities tailored to countries at various levels of poverty, income, and environmental vulnerability. Offering the first post-COVID-19 (Coronavirus) pandemic assessment of global progress on this interlinked agenda, the report finds that global poverty reduction has resumed but at a pace slower than before the COVID-19 crisis.
It also provides evidence that the number of countries with high levels of income inequality has declined considerably during the past two decades, but the pace of improvements in shared prosperity has slowed and that inequality remains high in Latin America and the Caribbean and in Sub-Saharan Africa. The report also finds evidence of countries’ increasing ability to manage natural hazards where there has been progress in poverty reduction and shared prosperity; but in the poorest settings, the report finds that climate risks are significantly higher. + date_published: "2024-09-26" + + # Citation + producer: Lakner et al. + citation_full: |- + Lakner, C., Genoni, M. E., Stemmler, H., Yonzan, N., & Tetteh Baah, S. K. (2024). Reproducibility package for Poverty, Prosperity and Planet Report 2024. World Bank. https://doi.org/10.60572/KGE4-CX54 + attribution: Lakner et al. (2024). Reproducibility package for Poverty, Prosperity and Planet Report 2024 + + # Files + url_main: https://reproducibility.worldbank.org/index.php/catalog/189/ + url_download: https://reproducibility.worldbank.org/index.php/catalog/189/download/552/FR_WLD_2024_198.zip + date_accessed: 2024-12-03 + + # License + license: + name: Modified BSD3 + url: https://reproducibility.worldbank.org/index.php/catalog/189/#project_desc_container1674844764972 + +outs: + - md5: 3a942e2ccc863f67d4879250d7468e57 + size: 91787338 + path: reproducibility_package_poverty_prosperity_planet.zip diff --git a/snapshots/who/latest/avian_influenza_ah5n1.csv.dvc b/snapshots/who/latest/avian_influenza_ah5n1.csv.dvc index 7d9963ab4c7..9671a276ede 100644 --- a/snapshots/who/latest/avian_influenza_ah5n1.csv.dvc +++ b/snapshots/who/latest/avian_influenza_ah5n1.csv.dvc @@ -12,12 +12,12 @@ meta: Human Cases with Highly Pathogenic Avian Influenza A/H5N1. World Health Organization, Global Influenza Programme; 2024. Licence: CC BY-NC-SA 3.0 IGO. Retrieved from CDC May 23, 2024. attribution_short: WHO url_main: https://www.cdc.gov/bird-flu/php/avian-flu-summary/chart-epi-curve-ah5n1.html - date_accessed: 2024-09-30 - date_published: "2024-08-07" + date_accessed: 2024-11-26 + date_published: "2024-10-26" license: name: CC BY-NC-SA 3.0 IGO url: https://www.who.int/about/policies/publishing/copyright outs: - - md5: 8eb1eab7b38b2c98dcb750c23e926acf - size: 24467 + - md5: 79b4747eae76381d715acba09196ca23 + size: 24674 path: avian_influenza_ah5n1.csv diff --git a/snapshots/who/latest/avian_influenza_ah5n1.py b/snapshots/who/latest/avian_influenza_ah5n1.py index a8693e3c883..0fbcf1367c9 100644 --- a/snapshots/who/latest/avian_influenza_ah5n1.py +++ b/snapshots/who/latest/avian_influenza_ah5n1.py @@ -1,6 +1,12 @@ """This data is collected by the WHO, and summarised in PDF reports. -CDC provides this same data but in a machine-readable format, which one can download from https://www.cdc.gov/bird-flu/php/avian-flu-summary/chart-epi-curve-ah5n1.html under "Download data (CSV)". +CDC provides this same data but in a machine-readable format. To download it: + +- Go to https://www.cdc.gov/bird-flu/php/avian-flu-summary/chart-epi-curve-ah5n1.html +- Under the main chart, click on "Download data (CSV)". + +To get the publication date, look at the top right part of the site. 
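+ +Usage note (an assumption based on this repo's usual snapshot CLI for manually downloaded files): once the CSV is saved locally, the snapshot can typically be created with something like: + + python snapshots/who/latest/avian_influenza_ah5n1.py --path-to-file <downloaded CSV>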
+ """ from pathlib import Path diff --git a/snapshots/who/latest/fluid.csv.dvc b/snapshots/who/latest/fluid.csv.dvc index 936eaf4f10e..30b2f569464 100644 --- a/snapshots/who/latest/fluid.csv.dvc +++ b/snapshots/who/latest/fluid.csv.dvc @@ -16,6 +16,6 @@ meta: The platform accommodates both qualitative and quantitative data which facilitates the tracking of global trends, spread, intensity, and impact of influenza. These data are made freely available to health policy makers in order to assist them in making informed decisions regarding the management of influenza. wdir: ../../../data/snapshots/who/latest outs: - - md5: 048f3b15789837fad71fe6f70e815089 - size: 166517511 + - md5: 811f5ca9e719e680bc1cde286e599f9d + size: 168107745 path: fluid.csv diff --git a/snapshots/who/latest/flunet.csv.dvc b/snapshots/who/latest/flunet.csv.dvc index a6b4f477728..6a11439d09e 100644 --- a/snapshots/who/latest/flunet.csv.dvc +++ b/snapshots/who/latest/flunet.csv.dvc @@ -16,6 +16,6 @@ meta: The data are provided remotely by National Influenza Centres (NICs) of the Global Influenza Surveillance and Response System (GISRS) and other national influenza reference laboratories collaborating actively with GISRS, or are uploaded from WHO regional databases. wdir: ../../../data/snapshots/who/latest outs: - - md5: 23522eeeafda752efe92c72f7f82b3ed - size: 27008247 + - md5: b687f5f92351d148e71bb3b5d60c0c50 + size: 27222953 path: flunet.csv diff --git a/snapshots/who/latest/monkeypox.csv.dvc b/snapshots/who/latest/monkeypox.csv.dvc index 0acaa449bfc..43439fe9ce3 100644 --- a/snapshots/who/latest/monkeypox.csv.dvc +++ b/snapshots/who/latest/monkeypox.csv.dvc @@ -24,6 +24,6 @@ meta: name: CC BY 4.0 outs: - - md5: 0c2f431c2fa1b70422aaf65a0e334844 - size: 591690 + - md5: 55ca84faa2028764bbde4a308f466cd8 + size: 603658 path: monkeypox.csv diff --git a/snapshots_archive/animal_welfare/2023-09-08/fur_laws.pdf.dvc b/snapshots_archive/animal_welfare/2023-09-08/fur_laws.pdf.dvc deleted file mode 100644 index 56980fddfd9..00000000000 --- a/snapshots_archive/animal_welfare/2023-09-08/fur_laws.pdf.dvc +++ /dev/null @@ -1,20 +0,0 @@ -meta: - origin: - title: Fur banning - producer: Fur Free Alliance - citation_full: Overview national fur legislation, Fur Free Alliance (2023). - url_main: https://www.furfreealliance.com/fur-bans/ - url_download: - https://www.furfreealliance.com/wp-content/uploads/2023/04/Overview-national-fur-legislation-General-Provisions.pdf - date_published: '2023-04-01' - date_accessed: '2023-09-08' - license: - name: CC BY 4.0 - license: - name: CC BY 4.0 - is_public: true -wdir: ../../../data/snapshots/animal_welfare/2023-09-08 -outs: -- md5: e326e86b4c1225f688951df82a2f85af - size: 178968 - path: fur_laws.pdf diff --git a/snapshots_archive/animal_welfare/2023-09-08/fur_laws.py b/snapshots_archive/animal_welfare/2023-09-08/fur_laws.py deleted file mode 100644 index e25b960dce5..00000000000 --- a/snapshots_archive/animal_welfare/2023-09-08/fur_laws.py +++ /dev/null @@ -1,24 +0,0 @@ -"""Script to create a snapshot of dataset.""" - -from pathlib import Path - -import click - -from etl.snapshot import Snapshot - -# Version for current snapshot dataset. -SNAPSHOT_VERSION = Path(__file__).parent.name - - -@click.command() -@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") -def main(upload: bool) -> None: - # Create a new snapshot. - snap = Snapshot(f"animal_welfare/{SNAPSHOT_VERSION}/fur_laws.pdf") - - # Download data from source, add file to DVC and upload to S3. 
- snap.create_snapshot(upload=upload) - - -if __name__ == "__main__": - main() diff --git a/tests/data_helpers/test_geo.py b/tests/data_helpers/test_geo.py index 637cd6e6d38..813cec00e7e 100644 --- a/tests/data_helpers/test_geo.py +++ b/tests/data_helpers/test_geo.py @@ -189,18 +189,19 @@ def test_one_country_unchanged_and_another_unknown(self): def test_two_unknown_countries_made_nan(self): df_in = pd.DataFrame({"country": ["Country 1", "country_04"], "some_variable": [1, 2]}) - df_out = pd.DataFrame({"country": [np.nan, np.nan], "some_variable": [1, 2]}) - df_out["country"] = df_out["country"].astype(object) - assert dataframes.are_equal( - df1=df_out, - df2=geo.harmonize_countries( - df=df_in, - countries_file="MOCK_COUNTRIES_FILE", - make_missing_countries_nan=True, - warn_on_unused_countries=False, - warn_on_missing_countries=False, - ), - )[0] + df_out = pd.DataFrame({"country": [pd.NA, pd.NA], "some_variable": [1, 2]}) + df_out["country"] = df_out["country"].astype("str") + + result = geo.harmonize_countries( + df=df_in, + countries_file="MOCK_COUNTRIES_FILE", + make_missing_countries_nan=True, + warn_on_unused_countries=False, + warn_on_missing_countries=False, + ) + df_out.country = df_out.country.astype("string") + result.country = result.country.astype("string") + assert dataframes.are_equal(df1=df_out, df2=result)[0] def test_one_unknown_country_made_nan_and_a_known_country_changed(self): df_in = pd.DataFrame({"country": ["Country 1", "country_02"], "some_variable": [1, 2]}) @@ -220,10 +221,8 @@ def test_on_dataframe_with_no_countries(self): df_in = pd.DataFrame({"country": []}) df_out = pd.DataFrame({"country": []}) df_out["country"] = df_out["country"].astype(object) - assert dataframes.are_equal( - df1=df_out, - df2=geo.harmonize_countries(df=df_in, countries_file="MOCK_COUNTRIES_FILE", warn_on_unused_countries=False), - )[0] + result = geo.harmonize_countries(df=df_in, countries_file="MOCK_COUNTRIES_FILE", warn_on_unused_countries=False) + assert result.empty def test_change_country_column_name(self): df_in = pd.DataFrame({"Country": ["country_02"]}) diff --git a/tests/test_datadiff.py b/tests/test_datadiff.py index ffeac301229..be4466781c1 100644 --- a/tests/test_datadiff.py +++ b/tests/test_datadiff.py @@ -1,3 +1,6 @@ +import os +from unittest.mock import patch + import pandas as pd from owid.catalog import Dataset, DatasetMeta, Table @@ -19,6 +22,7 @@ def _create_datasets(tmp_path): return ds_a, ds_b +@patch.dict(os.environ, {"OWID_STRICT": ""}) def test_DatasetDiff_summary(tmp_path): ds_a, ds_b = _create_datasets(tmp_path) @@ -43,6 +47,7 @@ def test_DatasetDiff_summary(tmp_path): ] +@patch.dict(os.environ, {"OWID_STRICT": ""}) def test_new_data(tmp_path): ds_a, ds_b = _create_datasets(tmp_path) @@ -62,5 +67,5 @@ def test_new_data(tmp_path): "\t\t[yellow]~ Dim [b]country[/b]", "\t\t\t\t[violet]+ New values: 1 / 3 (33.33%)\n\t\t\t\t[violet] country\n\t\t\t\t[violet] FR", "\t\t[yellow]~ Column [b]a[/b] (new [u]data[/u], changed [u]data[/u])", - "\t\t\t\t[violet]+ New values: 1 / 3 (33.33%)\n\t\t\t\t[violet] country a\n\t\t\t\t[violet] FR 3\n\t\t\t\t[violet]~ Changed values: 1 / 3 (33.33%)\n\t\t\t\t[violet] country a - a +\n\t\t\t\t[violet] US 3.0 2", + "\t\t\t\t[violet]+ New values: 1 / 3 (33.33%)\n\t\t\t\t[violet] country a\n\t\t\t\t[violet] FR 3\n\t\t\t\t[violet]~ Changed values: 1 / 3 (33.33%)\n\t\t\t\t[violet] country a - a +\n\t\t\t\t[violet] US 3 2", ] diff --git a/tests/test_grapher_helpers.py b/tests/test_grapher_helpers.py index b54ba3d2f6d..b68fb92f1e2 100644 --- 
a/tests/test_grapher_helpers.py +++ b/tests/test_grapher_helpers.py @@ -222,12 +222,16 @@ def test_expand_jinja(): presentation=VariablePresentationMeta( title_variant="Variant << foo >>", ), + display={ + "isProjection": "<% if foo == 'bar' %>true<% else %>false<% endif %>", + }, ) out = gh._expand_jinja(m, dim_dict={"foo": "bar"}) assert out.to_dict() == { "title": "Title bar", "description_key": ["This is bar"], "presentation": {"title_variant": "Variant bar"}, + "display": {"isProjection": True}, } diff --git a/tests/test_steps.py b/tests/test_steps.py index ff266f1917d..5693fcd05fd 100644 --- a/tests/test_steps.py +++ b/tests/test_steps.py @@ -15,6 +15,7 @@ from unittest.mock import patch import pandas as pd +import requests from owid.catalog import Dataset from etl import paths @@ -162,7 +163,11 @@ def test_select_dirty_steps(): def test_get_etag(): - etag = get_etag("https://raw.githubusercontent.com/owid/owid-grapher/master/README.md") + try: + etag = get_etag("https://raw.githubusercontent.com/owid/owid-grapher/master/README.md") + # ignore SSL errors + except requests.exceptions.SSLError: + return assert etag diff --git a/uv.lock b/uv.lock index 7557f7afeed..d52f17610dc 100644 --- a/uv.lock +++ b/uv.lock @@ -1,8 +1,9 @@ version = 1 -requires-python = ">=3.10, <3.12" +requires-python = ">=3.10, <3.13" resolution-markers = [ "python_full_version < '3.11'", - "python_full_version >= '3.11'", + "python_full_version == '3.11.*'", + "python_full_version >= '3.12'", ] [[package]] @@ -14,6 +15,104 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0b/f7/85273299ab57117850cc0a936c64151171fac4da49bc6fba0dad984a7c5f/affine-2.4.0-py3-none-any.whl", hash = "sha256:8a3df80e2b2378aef598a83c1392efd47967afec4242021a0b06b4c7cbc61a92", size = 15662 }, ] +[[package]] +name = "aiohappyeyeballs" +version = "2.4.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bc/69/2f6d5a019bd02e920a3417689a89887b39ad1e350b562f9955693d900c40/aiohappyeyeballs-2.4.3.tar.gz", hash = "sha256:75cf88a15106a5002a8eb1dab212525c00d1f4c0fa96e551c9fbe6f09a621586", size = 21809 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/d8/120cd0fe3e8530df0539e71ba9683eade12cae103dd7543e50d15f737917/aiohappyeyeballs-2.4.3-py3-none-any.whl", hash = "sha256:8a7a83727b2756f394ab2895ea0765a0a8c475e3c71e98d43d76f22b4b435572", size = 14742 }, +] + +[[package]] +name = "aiohttp" +version = "3.11.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohappyeyeballs", marker = "python_full_version >= '3.12'" }, + { name = "aiosignal", marker = "python_full_version >= '3.12'" }, + { name = "attrs", marker = "python_full_version >= '3.12'" }, + { name = "frozenlist", marker = "python_full_version >= '3.12'" }, + { name = "multidict", marker = "python_full_version >= '3.12'" }, + { name = "propcache", marker = "python_full_version >= '3.12'" }, + { name = "yarl", marker = "python_full_version >= '3.12'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4b/cb/f9bb10e0cf6f01730b27d370b10cc15822bea4395acd687abc8cc5fed3ed/aiohttp-3.11.7.tar.gz", hash = "sha256:01a8aca4af3da85cea5c90141d23f4b0eee3cbecfd33b029a45a80f28c66c668", size = 7666482 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/7e/fb4723d280b4de2642c57593cb94f942bfdc15def510d12b5d22a1b955a6/aiohttp-3.11.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8bedb1f6cb919af3b6353921c71281b1491f948ca64408871465d889b4ee1b66", size = 706857 
}, + { url = "https://files.pythonhosted.org/packages/57/f1/4eb447ad029801b1007ff23025c2bcb2519af2e03085717efa333f1803a5/aiohttp-3.11.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f5022504adab881e2d801a88b748ea63f2a9d130e0b2c430824682a96f6534be", size = 466733 }, + { url = "https://files.pythonhosted.org/packages/ed/7e/e385e54fa3d9360f9d1ea502a5627f2f4bdd141dd227a1f8785335c4fca9/aiohttp-3.11.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e22d1721c978a6494adc824e0916f9d187fa57baeda34b55140315fa2f740184", size = 453993 }, + { url = "https://files.pythonhosted.org/packages/ee/41/660cba8b4b10a9072ae77ce81558cca94d98aaec649a3085e50b8226fc17/aiohttp-3.11.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e993676c71288618eb07e20622572b1250d8713e7e00ab3aabae28cb70f3640d", size = 1576329 }, + { url = "https://files.pythonhosted.org/packages/e1/51/4c59724afde127001b22cf09b28171829329cf2c838cb05f6de521f125cf/aiohttp-3.11.7-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e13a05db87d3b241c186d0936808d0e4e12decc267c617d54e9c643807e968b6", size = 1630344 }, + { url = "https://files.pythonhosted.org/packages/c7/66/513f15cec950410dbc4439926ea4d9361136df7a97ddffab0deea1b68131/aiohttp-3.11.7-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4ba8d043fed7ffa117024d7ba66fdea011c0e7602327c6d73cacaea38abe4491", size = 1666837 }, + { url = "https://files.pythonhosted.org/packages/7a/c0/3e59d4cd8fd4c0e365d0ec962e0679dfc7629bdf0e67be398ca842ad4661/aiohttp-3.11.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dda3ed0a7869d2fa16aa41f9961ade73aa2c2e3b2fcb0a352524e7b744881889", size = 1580628 }, + { url = "https://files.pythonhosted.org/packages/22/a6/c4aea2cf583821e02f7a92c43f5f554d2334e22b741e21e8f31da2b2386b/aiohttp-3.11.7-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:43bfd25113c1e98aec6c70e26d5f4331efbf4aa9037ba9ad88f090853bf64d7f", size = 1539922 }, + { url = "https://files.pythonhosted.org/packages/7b/54/52f33fc9cecaf28f8400e92d9c22e37939c856c4a8af26a71023ec1de689/aiohttp-3.11.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3dd3e7e7c9ef3e7214f014f1ae260892286647b3cf7c7f1b644a568fd410f8ca", size = 1527342 }, + { url = "https://files.pythonhosted.org/packages/d4/e0/fc91528bfb0283691b0448e93fe64d2416254a9ca34c58c666240440db89/aiohttp-3.11.7-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:78c657ece7a73b976905ab9ec8be9ef2df12ed8984c24598a1791c58ce3b4ce4", size = 1534194 }, + { url = "https://files.pythonhosted.org/packages/34/be/c6d571f46e9ef1720a850dce4c04dbfe38627a64bfdabdefb448c547e267/aiohttp-3.11.7-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:db70a47987e34494b451a334605bee57a126fe8d290511349e86810b4be53b01", size = 1609532 }, + { url = "https://files.pythonhosted.org/packages/3d/af/1da6918c83fb427e0f23401dca03b8d6ec776fb61ad25d2f5a8d564418e6/aiohttp-3.11.7-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:9e67531370a3b07e49b280c1f8c2df67985c790ad2834d1b288a2f13cd341c5f", size = 1630627 }, + { url = "https://files.pythonhosted.org/packages/32/20/fd3f4d8bc60227f1eb2fc20e75679e270ef05f81ae618cd869a68f19a32c/aiohttp-3.11.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9202f184cc0582b1db15056f2225ab4c1e3dac4d9ade50dd0613ac3c46352ac2", size = 1565670 }, + { url = 
"https://files.pythonhosted.org/packages/b0/9f/db692e10567acb0970618557be3bfe47fe92eac69fa7d3e81315d39b4a8b/aiohttp-3.11.7-cp310-cp310-win32.whl", hash = "sha256:2257bdd5cf54a4039a4337162cd8048f05a724380a2283df34620f55d4e29341", size = 415107 }, + { url = "https://files.pythonhosted.org/packages/0b/8c/9fb539a8a773356df3dbddd77d4a3aff3eda448a602a90e5582d8b1903a4/aiohttp-3.11.7-cp310-cp310-win_amd64.whl", hash = "sha256:b7215bf2b53bc6cb35808149980c2ae80a4ae4e273890ac85459c014d5aa60ac", size = 440569 }, + { url = "https://files.pythonhosted.org/packages/13/7f/272fa1adf68fe2fbebfe686a67b50cfb40d86dfe47d0441aff6f0b7c4c0e/aiohttp-3.11.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:cea52d11e02123f125f9055dfe0ccf1c3857225fb879e4a944fae12989e2aef2", size = 706820 }, + { url = "https://files.pythonhosted.org/packages/79/3c/6d612ef77cdba75364393f04c5c577481e3b5123a774eea447ada1ddd14f/aiohttp-3.11.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3ce18f703b7298e7f7633efd6a90138d99a3f9a656cb52c1201e76cb5d79cf08", size = 466654 }, + { url = "https://files.pythonhosted.org/packages/4f/b8/1052667d4800cd49bb4f869f1ed42f5e9d5acd4676275e64ccc244c9c040/aiohttp-3.11.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:670847ee6aeb3a569cd7cdfbe0c3bec1d44828bbfbe78c5d305f7f804870ef9e", size = 454041 }, + { url = "https://files.pythonhosted.org/packages/9f/07/80fa7302314a6ee1c9278550e9d95b77a4c895999bfbc5364ed0ee28dc7c/aiohttp-3.11.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4dda726f89bfa5c465ba45b76515135a3ece0088dfa2da49b8bb278f3bdeea12", size = 1684778 }, + { url = "https://files.pythonhosted.org/packages/2e/30/a71eb45197ad6bb6af87dfb39be8b56417d24d916047d35ef3f164af87f4/aiohttp-3.11.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c25b74a811dba37c7ea6a14d99eb9402d89c8d739d50748a75f3cf994cf19c43", size = 1740992 }, + { url = "https://files.pythonhosted.org/packages/22/74/0f9394429f3c4197129333a150a85cb2a642df30097a39dd41257f0b3bdc/aiohttp-3.11.7-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5522ee72f95661e79db691310290c4618b86dff2d9b90baedf343fd7a08bf79", size = 1781816 }, + { url = "https://files.pythonhosted.org/packages/7f/1a/1e256b39179c98d16d53ac62f64bfcfe7c5b2c1e68b83cddd4165854524f/aiohttp-3.11.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1fbf41a6bbc319a7816ae0f0177c265b62f2a59ad301a0e49b395746eb2a9884", size = 1676692 }, + { url = "https://files.pythonhosted.org/packages/9b/37/f19d2e00efcabb9183b16bd91244de1d9c4ff7bf0fb5b8302e29a78f3286/aiohttp-3.11.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:59ee1925b5a5efdf6c4e7be51deee93984d0ac14a6897bd521b498b9916f1544", size = 1619523 }, + { url = "https://files.pythonhosted.org/packages/ae/3c/af50cf5e06b98783fd776f17077f7b7e755d461114af5d6744dc037fc3b0/aiohttp-3.11.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:24054fce8c6d6f33a3e35d1c603ef1b91bbcba73e3f04a22b4f2f27dac59b347", size = 1644084 }, + { url = "https://files.pythonhosted.org/packages/c0/a6/4e0233b085cbf2b6de573515c1eddde82f1c1f17e69347e32a5a5f2617ff/aiohttp-3.11.7-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:351849aca2c6f814575c1a485c01c17a4240413f960df1bf9f5deb0003c61a53", size = 1648332 }, + { url = "https://files.pythonhosted.org/packages/06/20/7062e76e7817318c421c0f9d7b650fb81aaecf6d2f3a9833805b45ec2ea8/aiohttp-3.11.7-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = 
"sha256:12724f3a211fa243570e601f65a8831372caf1a149d2f1859f68479f07efec3d", size = 1730912 }, + { url = "https://files.pythonhosted.org/packages/6c/1c/ff6ae4b1789894e6faf8a4e260cd3861cad618dc80ad15326789a7765750/aiohttp-3.11.7-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:7ea4490360b605804bea8173d2d086b6c379d6bb22ac434de605a9cbce006e7d", size = 1752619 }, + { url = "https://files.pythonhosted.org/packages/33/58/ddd5cba5ca245c00b04e9d28a7988b0f0eda02de494f8e62ecd2780655c2/aiohttp-3.11.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e0bf378db07df0a713a1e32381a1b277e62ad106d0dbe17b5479e76ec706d720", size = 1692801 }, + { url = "https://files.pythonhosted.org/packages/b2/fc/32d5e2070b43d3722b7ea65ddc6b03ffa39bcc4b5ab6395a825cde0872ad/aiohttp-3.11.7-cp311-cp311-win32.whl", hash = "sha256:cd8d62cab363dfe713067027a5adb4907515861f1e4ce63e7be810b83668b847", size = 414899 }, + { url = "https://files.pythonhosted.org/packages/ec/7e/50324c6d3df4540f5963def810b9927f220c99864065849a1dfcae77a6ce/aiohttp-3.11.7-cp311-cp311-win_amd64.whl", hash = "sha256:bf0e6cce113596377cadda4e3ac5fb89f095bd492226e46d91b4baef1dd16f60", size = 440938 }, + { url = "https://files.pythonhosted.org/packages/bf/1e/2e96b2526c590dcb99db0b94ac4f9b927ecc07f94735a8a941dee143d48b/aiohttp-3.11.7-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:4bb7493c3e3a36d3012b8564bd0e2783259ddd7ef3a81a74f0dbfa000fce48b7", size = 702326 }, + { url = "https://files.pythonhosted.org/packages/b5/ce/b5d7f3e68849f1f5e0b85af4ac9080b9d3c0a600857140024603653c2209/aiohttp-3.11.7-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e143b0ef9cb1a2b4f74f56d4fbe50caa7c2bb93390aff52f9398d21d89bc73ea", size = 461944 }, + { url = "https://files.pythonhosted.org/packages/28/fa/f4d98db1b7f8f0c3f74bdbd6d0d98cfc89984205cd33f1b8ee3f588ee5ad/aiohttp-3.11.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f7c58a240260822dc07f6ae32a0293dd5bccd618bb2d0f36d51c5dbd526f89c0", size = 454348 }, + { url = "https://files.pythonhosted.org/packages/04/f0/c238dda5dc9a3d12b76636e2cf0ea475890ac3a1c7e4ff0fd6c3cea2fc2d/aiohttp-3.11.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d20cfe63a1c135d26bde8c1d0ea46fd1200884afbc523466d2f1cf517d1fe33", size = 1678795 }, + { url = "https://files.pythonhosted.org/packages/79/ee/3a18f792247e6d95dba13aaedc9dc317c3c6e75f4b88c2dd4b960d20ad2f/aiohttp-3.11.7-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12e4d45847a174f77b2b9919719203769f220058f642b08504cf8b1cf185dacf", size = 1734411 }, + { url = "https://files.pythonhosted.org/packages/f5/79/3eb84243087a9a32cae821622c935107b4b55a5b21b76772e8e6c41092e9/aiohttp-3.11.7-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cf4efa2d01f697a7dbd0509891a286a4af0d86902fc594e20e3b1712c28c0106", size = 1788959 }, + { url = "https://files.pythonhosted.org/packages/91/93/ad77782c5edfa17aafc070bef978fbfb8459b2f150595ffb01b559c136f9/aiohttp-3.11.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ee6a4cdcbf54b8083dc9723cdf5f41f722c00db40ccf9ec2616e27869151129", size = 1687463 }, + { url = "https://files.pythonhosted.org/packages/ba/48/db35bd21b7877efa0be5f28385d8978c55323c5ce7685712e53f3f6c0bd9/aiohttp-3.11.7-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c6095aaf852c34f42e1bd0cf0dc32d1e4b48a90bfb5054abdbb9d64b36acadcb", size = 1618374 }, + { url = 
"https://files.pythonhosted.org/packages/ba/77/30f87db55c79fd145ed5fd15b92f2e820ce81065d41ae437797aaa550e3b/aiohttp-3.11.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1cf03d27885f8c5ebf3993a220cc84fc66375e1e6e812731f51aab2b2748f4a6", size = 1637021 }, + { url = "https://files.pythonhosted.org/packages/af/76/10b188b78ee18d0595af156d6a238bc60f9d8571f0f546027eb7eaf65b25/aiohttp-3.11.7-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:1a17f6a230f81eb53282503823f59d61dff14fb2a93847bf0399dc8e87817307", size = 1650792 }, + { url = "https://files.pythonhosted.org/packages/fa/33/4411bbb8ad04c47d0f4c7bd53332aaf350e49469cf6b65b132d4becafe27/aiohttp-3.11.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:481f10a1a45c5f4c4a578bbd74cff22eb64460a6549819242a87a80788461fba", size = 1696248 }, + { url = "https://files.pythonhosted.org/packages/fe/2d/6135d0dc1851a33d3faa937b20fef81340bc95e8310536d4c7f1f8ecc026/aiohttp-3.11.7-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:db37248535d1ae40735d15bdf26ad43be19e3d93ab3f3dad8507eb0f85bb8124", size = 1729188 }, + { url = "https://files.pythonhosted.org/packages/f5/76/a57ceff577ae26fe9a6f31ac799bc638ecf26e4acdf04295290b9929b349/aiohttp-3.11.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9d18a8b44ec8502a7fde91446cd9c9b95ce7c49f1eacc1fb2358b8907d4369fd", size = 1690038 }, + { url = "https://files.pythonhosted.org/packages/4b/81/b20e09003b6989a7f23a721692137a6143420a151063c750ab2a04878e3c/aiohttp-3.11.7-cp312-cp312-win32.whl", hash = "sha256:3d1c9c15d3999107cbb9b2d76ca6172e6710a12fda22434ee8bd3f432b7b17e8", size = 409887 }, + { url = "https://files.pythonhosted.org/packages/b7/0b/607c98bff1d07bb21e0c39e7711108ef9ff4f2a361a3ec1ce8dce93623a5/aiohttp-3.11.7-cp312-cp312-win_amd64.whl", hash = "sha256:018f1b04883a12e77e7fc161934c0f298865d3a484aea536a6a2ca8d909f0ba0", size = 436462 }, + { url = "https://files.pythonhosted.org/packages/7a/53/8d77186c6a33bd087714df18274cdcf6e36fd69a9e841c85b7e81a20b18e/aiohttp-3.11.7-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:241a6ca732d2766836d62c58c49ca7a93d08251daef0c1e3c850df1d1ca0cbc4", size = 695811 }, + { url = "https://files.pythonhosted.org/packages/62/b6/4c3d107a5406aa6f99f618afea82783f54ce2d9644020f50b9c88f6e823d/aiohttp-3.11.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:aa3705a8d14de39898da0fbad920b2a37b7547c3afd2a18b9b81f0223b7d0f68", size = 458530 }, + { url = "https://files.pythonhosted.org/packages/d9/05/dbf0bd3966be8ebed3beb4007a2d1356d79af4fe7c93e54f984df6385193/aiohttp-3.11.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9acfc7f652b31853eed3b92095b0acf06fd5597eeea42e939bd23a17137679d5", size = 451371 }, + { url = "https://files.pythonhosted.org/packages/19/6a/2198580314617b6cf9c4b813b84df5832b5f8efedcb8a7e8b321a187233c/aiohttp-3.11.7-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcefcf2915a2dbdbce37e2fc1622129a1918abfe3d06721ce9f6cdac9b6d2eaa", size = 1662905 }, + { url = "https://files.pythonhosted.org/packages/2b/65/08696fd7503f6a6f9f782bd012bf47f36d4ed179a7d8c95dba4726d5cc67/aiohttp-3.11.7-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c1f6490dd1862af5aae6cfcf2a274bffa9a5b32a8f5acb519a7ecf5a99a88866", size = 1713794 }, + { url = "https://files.pythonhosted.org/packages/c8/a3/b9a72dce6f15e2efbc09fa67c1067c4f3a3bb05661c0ae7b40799cde02b7/aiohttp-3.11.7-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1ac5462582d6561c1c1708853a9faf612ff4e5ea5e679e99be36143d6eabd8e", size = 
1770757 }, + { url = "https://files.pythonhosted.org/packages/78/7e/8fb371b5f8c4c1eaa0d0a50750c0dd68059f86794aeb36919644815486f5/aiohttp-3.11.7-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c1a6309005acc4b2bcc577ba3b9169fea52638709ffacbd071f3503264620da", size = 1673136 }, + { url = "https://files.pythonhosted.org/packages/2f/0f/09685d13d2c7634cb808868ea29c170d4dcde4215a4a90fb86491cd3ae25/aiohttp-3.11.7-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f5b973cce96793725ef63eb449adfb74f99c043c718acb76e0d2a447ae369962", size = 1600370 }, + { url = "https://files.pythonhosted.org/packages/00/2e/18fd38b117f9b3a375166ccb70ed43cf7e3dfe2cc947139acc15feefc5a2/aiohttp-3.11.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ce91a24aac80de6be8512fb1c4838a9881aa713f44f4e91dd7bb3b34061b497d", size = 1613459 }, + { url = "https://files.pythonhosted.org/packages/2c/94/10a82abc680d753be33506be699aaa330152ecc4f316eaf081f996ee56c2/aiohttp-3.11.7-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:875f7100ce0e74af51d4139495eec4025affa1a605280f23990b6434b81df1bd", size = 1613924 }, + { url = "https://files.pythonhosted.org/packages/e9/58/897c0561f5c522dda6e173192f1e4f10144e1a7126096f17a3f12b7aa168/aiohttp-3.11.7-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c171fc35d3174bbf4787381716564042a4cbc008824d8195eede3d9b938e29a8", size = 1681164 }, + { url = "https://files.pythonhosted.org/packages/8b/8b/3a48b1cdafa612679d976274355f6a822de90b85d7dba55654ecfb01c979/aiohttp-3.11.7-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:ee9afa1b0d2293c46954f47f33e150798ad68b78925e3710044e0d67a9487791", size = 1712139 }, + { url = "https://files.pythonhosted.org/packages/aa/9d/70ab5b4dd7900db04af72840e033aee06e472b1343e372ea256ed675511c/aiohttp-3.11.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8360c7cc620abb320e1b8d603c39095101391a82b1d0be05fb2225471c9c5c52", size = 1667446 }, + { url = "https://files.pythonhosted.org/packages/cb/98/b5fbcc8f6056f0c56001c75227e6b7ca9ee4f2e5572feca82ff3d65d485d/aiohttp-3.11.7-cp313-cp313-win32.whl", hash = "sha256:7a9318da4b4ada9a67c1dd84d1c0834123081e746bee311a16bb449f363d965e", size = 408689 }, + { url = "https://files.pythonhosted.org/packages/ef/07/4d1504577fa6349dd2e3839e89fb56e5dee38d64efe3d4366e9fcfda0cdb/aiohttp-3.11.7-cp313-cp313-win_amd64.whl", hash = "sha256:fc6da202068e0a268e298d7cd09b6e9f3997736cd9b060e2750963754552a0a9", size = 434809 }, +] + +[[package]] +name = "aiosignal" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "frozenlist", marker = "python_full_version >= '3.12'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ae/67/0952ed97a9793b4958e5736f6d2b346b414a2cd63e82d05940032f45b32f/aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc", size = 19422 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/ac/a7305707cb852b7e16ff80eaf5692309bde30e2b1100a1fcacdc8f731d97/aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17", size = 7617 }, +] + [[package]] name = "altair" version = "5.4.1" @@ -251,7 +350,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "botocore-stubs" }, { name = "types-s3transfer" }, - { name = "typing-extensions" }, + { name = "typing-extensions", marker = "python_full_version < '3.12'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/0f/bd/f03e472d1ce2c0584e5899ceb9c8ad4211e1018e8ad865e1169d20dcc33b/boto3_stubs-1.35.18.tar.gz", hash = "sha256:da1c449efaf685b720c54a5c5bdfbe668451e2b6764aa704dcfe3f69226173f0", size = 90537 } wheels = [ @@ -289,6 +388,98 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ed/22/1b0892a70916439eefbd457c1d881a5a8c2ae17c671ddea05a2439515fd4/botocore_stubs-1.35.18-py3-none-any.whl", hash = "sha256:9f8cd580b2c9781f21d43bc5b3bf421a502270c81fd5ff235e655e92e91dc248", size = 60133 }, ] +[[package]] +name = "brotli" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2f/c2/f9e977608bdf958650638c3f1e28f85a1b075f075ebbe77db8555463787b/Brotli-1.1.0.tar.gz", hash = "sha256:81de08ac11bcb85841e440c13611c00b67d3bf82698314928d0b676362546724", size = 7372270 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6d/3a/dbf4fb970c1019a57b5e492e1e0eae745d32e59ba4d6161ab5422b08eefe/Brotli-1.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e1140c64812cb9b06c922e77f1c26a75ec5e3f0fb2bf92cc8c58720dec276752", size = 873045 }, + { url = "https://files.pythonhosted.org/packages/dd/11/afc14026ea7f44bd6eb9316d800d439d092c8d508752055ce8d03086079a/Brotli-1.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c8fd5270e906eef71d4a8d19b7c6a43760c6abcfcc10c9101d14eb2357418de9", size = 446218 }, + { url = "https://files.pythonhosted.org/packages/36/83/7545a6e7729db43cb36c4287ae388d6885c85a86dd251768a47015dfde32/Brotli-1.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ae56aca0402a0f9a3431cddda62ad71666ca9d4dc3a10a142b9dce2e3c0cda3", size = 2903872 }, + { url = "https://files.pythonhosted.org/packages/32/23/35331c4d9391fcc0f29fd9bec2c76e4b4eeab769afbc4b11dd2e1098fb13/Brotli-1.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:43ce1b9935bfa1ede40028054d7f48b5469cd02733a365eec8a329ffd342915d", size = 2941254 }, + { url = "https://files.pythonhosted.org/packages/3b/24/1671acb450c902edb64bd765d73603797c6c7280a9ada85a195f6b78c6e5/Brotli-1.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:7c4855522edb2e6ae7fdb58e07c3ba9111e7621a8956f481c68d5d979c93032e", size = 2857293 }, + { url = "https://files.pythonhosted.org/packages/d5/00/40f760cc27007912b327fe15bf6bfd8eaecbe451687f72a8abc587d503b3/Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:38025d9f30cf4634f8309c6874ef871b841eb3c347e90b0851f63d1ded5212da", size = 3002385 }, + { url = "https://files.pythonhosted.org/packages/b8/cb/8aaa83f7a4caa131757668c0fb0c4b6384b09ffa77f2fba9570d87ab587d/Brotli-1.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e6a904cb26bfefc2f0a6f240bdf5233be78cd2488900a2f846f3c3ac8489ab80", size = 2911104 }, + { url = "https://files.pythonhosted.org/packages/bc/c4/65456561d89d3c49f46b7fbeb8fe6e449f13bdc8ea7791832c5d476b2faf/Brotli-1.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a37b8f0391212d29b3a91a799c8e4a2855e0576911cdfb2515487e30e322253d", size = 2809981 }, + { url = "https://files.pythonhosted.org/packages/05/1b/cf49528437bae28abce5f6e059f0d0be6fecdcc1d3e33e7c54b3ca498425/Brotli-1.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e84799f09591700a4154154cab9787452925578841a94321d5ee8fb9a9a328f0", size = 2935297 }, + { url = 
"https://files.pythonhosted.org/packages/81/ff/190d4af610680bf0c5a09eb5d1eac6e99c7c8e216440f9c7cfd42b7adab5/Brotli-1.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f66b5337fa213f1da0d9000bc8dc0cb5b896b726eefd9c6046f699b169c41b9e", size = 2930735 }, + { url = "https://files.pythonhosted.org/packages/80/7d/f1abbc0c98f6e09abd3cad63ec34af17abc4c44f308a7a539010f79aae7a/Brotli-1.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5dab0844f2cf82be357a0eb11a9087f70c5430b2c241493fc122bb6f2bb0917c", size = 2933107 }, + { url = "https://files.pythonhosted.org/packages/34/ce/5a5020ba48f2b5a4ad1c0522d095ad5847a0be508e7d7569c8630ce25062/Brotli-1.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e4fe605b917c70283db7dfe5ada75e04561479075761a0b3866c081d035b01c1", size = 2845400 }, + { url = "https://files.pythonhosted.org/packages/44/89/fa2c4355ab1eecf3994e5a0a7f5492c6ff81dfcb5f9ba7859bd534bb5c1a/Brotli-1.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1e9a65b5736232e7a7f91ff3d02277f11d339bf34099a56cdab6a8b3410a02b2", size = 3031985 }, + { url = "https://files.pythonhosted.org/packages/af/a4/79196b4a1674143d19dca400866b1a4d1a089040df7b93b88ebae81f3447/Brotli-1.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:58d4b711689366d4a03ac7957ab8c28890415e267f9b6589969e74b6e42225ec", size = 2927099 }, + { url = "https://files.pythonhosted.org/packages/e9/54/1c0278556a097f9651e657b873ab08f01b9a9ae4cac128ceb66427d7cd20/Brotli-1.1.0-cp310-cp310-win32.whl", hash = "sha256:be36e3d172dc816333f33520154d708a2657ea63762ec16b62ece02ab5e4daf2", size = 333172 }, + { url = "https://files.pythonhosted.org/packages/f7/65/b785722e941193fd8b571afd9edbec2a9b838ddec4375d8af33a50b8dab9/Brotli-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:0c6244521dda65ea562d5a69b9a26120769b7a9fb3db2fe9545935ed6735b128", size = 357255 }, + { url = "https://files.pythonhosted.org/packages/96/12/ad41e7fadd5db55459c4c401842b47f7fee51068f86dd2894dd0dcfc2d2a/Brotli-1.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a3daabb76a78f829cafc365531c972016e4aa8d5b4bf60660ad8ecee19df7ccc", size = 873068 }, + { url = "https://files.pythonhosted.org/packages/95/4e/5afab7b2b4b61a84e9c75b17814198ce515343a44e2ed4488fac314cd0a9/Brotli-1.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c8146669223164fc87a7e3de9f81e9423c67a79d6b3447994dfb9c95da16e2d6", size = 446244 }, + { url = "https://files.pythonhosted.org/packages/9d/e6/f305eb61fb9a8580c525478a4a34c5ae1a9bcb12c3aee619114940bc513d/Brotli-1.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30924eb4c57903d5a7526b08ef4a584acc22ab1ffa085faceb521521d2de32dd", size = 2906500 }, + { url = "https://files.pythonhosted.org/packages/3e/4f/af6846cfbc1550a3024e5d3775ede1e00474c40882c7bf5b37a43ca35e91/Brotli-1.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ceb64bbc6eac5a140ca649003756940f8d6a7c444a68af170b3187623b43bebf", size = 2943950 }, + { url = "https://files.pythonhosted.org/packages/b3/e7/ca2993c7682d8629b62630ebf0d1f3bb3d579e667ce8e7ca03a0a0576a2d/Brotli-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a469274ad18dc0e4d316eefa616d1d0c2ff9da369af19fa6f3daa4f09671fd61", size = 2918527 }, + { url = "https://files.pythonhosted.org/packages/b3/96/da98e7bedc4c51104d29cc61e5f449a502dd3dbc211944546a4cc65500d3/Brotli-1.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:524f35912131cc2cabb00edfd8d573b07f2d9f21fa824bd3fb19725a9cf06327", size = 2845489 }, + { url = "https://files.pythonhosted.org/packages/e8/ef/ccbc16947d6ce943a7f57e1a40596c75859eeb6d279c6994eddd69615265/Brotli-1.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:5b3cc074004d968722f51e550b41a27be656ec48f8afaeeb45ebf65b561481dd", size = 2914080 }, + { url = "https://files.pythonhosted.org/packages/80/d6/0bd38d758d1afa62a5524172f0b18626bb2392d717ff94806f741fcd5ee9/Brotli-1.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:19c116e796420b0cee3da1ccec3b764ed2952ccfcc298b55a10e5610ad7885f9", size = 2813051 }, + { url = "https://files.pythonhosted.org/packages/14/56/48859dd5d129d7519e001f06dcfbb6e2cf6db92b2702c0c2ce7d97e086c1/Brotli-1.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:510b5b1bfbe20e1a7b3baf5fed9e9451873559a976c1a78eebaa3b86c57b4265", size = 2938172 }, + { url = "https://files.pythonhosted.org/packages/3d/77/a236d5f8cd9e9f4348da5acc75ab032ab1ab2c03cc8f430d24eea2672888/Brotli-1.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a1fd8a29719ccce974d523580987b7f8229aeace506952fa9ce1d53a033873c8", size = 2933023 }, + { url = "https://files.pythonhosted.org/packages/f1/87/3b283efc0f5cb35f7f84c0c240b1e1a1003a5e47141a4881bf87c86d0ce2/Brotli-1.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c247dd99d39e0338a604f8c2b3bc7061d5c2e9e2ac7ba9cc1be5a69cb6cd832f", size = 2935871 }, + { url = "https://files.pythonhosted.org/packages/f3/eb/2be4cc3e2141dc1a43ad4ca1875a72088229de38c68e842746b342667b2a/Brotli-1.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1b2c248cd517c222d89e74669a4adfa5577e06ab68771a529060cf5a156e9757", size = 2847784 }, + { url = "https://files.pythonhosted.org/packages/66/13/b58ddebfd35edde572ccefe6890cf7c493f0c319aad2a5badee134b4d8ec/Brotli-1.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:2a24c50840d89ded6c9a8fdc7b6ed3692ed4e86f1c4a4a938e1e92def92933e0", size = 3034905 }, + { url = "https://files.pythonhosted.org/packages/84/9c/bc96b6c7db824998a49ed3b38e441a2cae9234da6fa11f6ed17e8cf4f147/Brotli-1.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f31859074d57b4639318523d6ffdca586ace54271a73ad23ad021acd807eb14b", size = 2929467 }, + { url = "https://files.pythonhosted.org/packages/e7/71/8f161dee223c7ff7fea9d44893fba953ce97cf2c3c33f78ba260a91bcff5/Brotli-1.1.0-cp311-cp311-win32.whl", hash = "sha256:39da8adedf6942d76dc3e46653e52df937a3c4d6d18fdc94a7c29d263b1f5b50", size = 333169 }, + { url = "https://files.pythonhosted.org/packages/02/8a/fece0ee1057643cb2a5bbf59682de13f1725f8482b2c057d4e799d7ade75/Brotli-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:aac0411d20e345dc0920bdec5548e438e999ff68d77564d5e9463a7ca9d3e7b1", size = 357253 }, + { url = "https://files.pythonhosted.org/packages/5c/d0/5373ae13b93fe00095a58efcbce837fd470ca39f703a235d2a999baadfbc/Brotli-1.1.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:32d95b80260d79926f5fab3c41701dbb818fde1c9da590e77e571eefd14abe28", size = 815693 }, + { url = "https://files.pythonhosted.org/packages/8e/48/f6e1cdf86751300c288c1459724bfa6917a80e30dbfc326f92cea5d3683a/Brotli-1.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b760c65308ff1e462f65d69c12e4ae085cff3b332d894637f6273a12a482d09f", size = 422489 }, + { url = "https://files.pythonhosted.org/packages/06/88/564958cedce636d0f1bed313381dfc4b4e3d3f6015a63dae6146e1b8c65c/Brotli-1.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = 
"sha256:316cc9b17edf613ac76b1f1f305d2a748f1b976b033b049a6ecdfd5612c70409", size = 873081 }, + { url = "https://files.pythonhosted.org/packages/58/79/b7026a8bb65da9a6bb7d14329fd2bd48d2b7f86d7329d5cc8ddc6a90526f/Brotli-1.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:caf9ee9a5775f3111642d33b86237b05808dafcd6268faa492250e9b78046eb2", size = 446244 }, + { url = "https://files.pythonhosted.org/packages/e5/18/c18c32ecea41b6c0004e15606e274006366fe19436b6adccc1ae7b2e50c2/Brotli-1.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70051525001750221daa10907c77830bc889cb6d865cc0b813d9db7fefc21451", size = 2906505 }, + { url = "https://files.pythonhosted.org/packages/08/c8/69ec0496b1ada7569b62d85893d928e865df29b90736558d6c98c2031208/Brotli-1.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7f4bf76817c14aa98cc6697ac02f3972cb8c3da93e9ef16b9c66573a68014f91", size = 2944152 }, + { url = "https://files.pythonhosted.org/packages/ab/fb/0517cea182219d6768113a38167ef6d4eb157a033178cc938033a552ed6d/Brotli-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0c5516f0aed654134a2fc936325cc2e642f8a0e096d075209672eb321cff408", size = 2919252 }, + { url = "https://files.pythonhosted.org/packages/c7/53/73a3431662e33ae61a5c80b1b9d2d18f58dfa910ae8dd696e57d39f1a2f5/Brotli-1.1.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6c3020404e0b5eefd7c9485ccf8393cfb75ec38ce75586e046573c9dc29967a0", size = 2845955 }, + { url = "https://files.pythonhosted.org/packages/55/ac/bd280708d9c5ebdbf9de01459e625a3e3803cce0784f47d633562cf40e83/Brotli-1.1.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4ed11165dd45ce798d99a136808a794a748d5dc38511303239d4e2363c0695dc", size = 2914304 }, + { url = "https://files.pythonhosted.org/packages/76/58/5c391b41ecfc4527d2cc3350719b02e87cb424ef8ba2023fb662f9bf743c/Brotli-1.1.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:4093c631e96fdd49e0377a9c167bfd75b6d0bad2ace734c6eb20b348bc3ea180", size = 2814452 }, + { url = "https://files.pythonhosted.org/packages/c7/4e/91b8256dfe99c407f174924b65a01f5305e303f486cc7a2e8a5d43c8bec3/Brotli-1.1.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7e4c4629ddad63006efa0ef968c8e4751c5868ff0b1c5c40f76524e894c50248", size = 2938751 }, + { url = "https://files.pythonhosted.org/packages/5a/a6/e2a39a5d3b412938362bbbeba5af904092bf3f95b867b4a3eb856104074e/Brotli-1.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:861bf317735688269936f755fa136a99d1ed526883859f86e41a5d43c61d8966", size = 2933757 }, + { url = "https://files.pythonhosted.org/packages/13/f0/358354786280a509482e0e77c1a5459e439766597d280f28cb097642fc26/Brotli-1.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:87a3044c3a35055527ac75e419dfa9f4f3667a1e887ee80360589eb8c90aabb9", size = 2936146 }, + { url = "https://files.pythonhosted.org/packages/80/f7/daf538c1060d3a88266b80ecc1d1c98b79553b3f117a485653f17070ea2a/Brotli-1.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c5529b34c1c9d937168297f2c1fde7ebe9ebdd5e121297ff9c043bdb2ae3d6fb", size = 2848055 }, + { url = "https://files.pythonhosted.org/packages/ad/cf/0eaa0585c4077d3c2d1edf322d8e97aabf317941d3a72d7b3ad8bce004b0/Brotli-1.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ca63e1890ede90b2e4454f9a65135a4d387a4585ff8282bb72964fab893f2111", size = 3035102 }, + { url = 
"https://files.pythonhosted.org/packages/d8/63/1c1585b2aa554fe6dbce30f0c18bdbc877fa9a1bf5ff17677d9cca0ac122/Brotli-1.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e79e6520141d792237c70bcd7a3b122d00f2613769ae0cb61c52e89fd3443839", size = 2930029 }, + { url = "https://files.pythonhosted.org/packages/5f/3b/4e3fd1893eb3bbfef8e5a80d4508bec17a57bb92d586c85c12d28666bb13/Brotli-1.1.0-cp312-cp312-win32.whl", hash = "sha256:5f4d5ea15c9382135076d2fb28dde923352fe02951e66935a9efaac8f10e81b0", size = 333276 }, + { url = "https://files.pythonhosted.org/packages/3d/d5/942051b45a9e883b5b6e98c041698b1eb2012d25e5948c58d6bf85b1bb43/Brotli-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:906bc3a79de8c4ae5b86d3d75a8b77e44404b0f4261714306e3ad248d8ab0951", size = 357255 }, + { url = "https://files.pythonhosted.org/packages/0a/9f/fb37bb8ffc52a8da37b1c03c459a8cd55df7a57bdccd8831d500e994a0ca/Brotli-1.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8bf32b98b75c13ec7cf774164172683d6e7891088f6316e54425fde1efc276d5", size = 815681 }, + { url = "https://files.pythonhosted.org/packages/06/b3/dbd332a988586fefb0aa49c779f59f47cae76855c2d00f450364bb574cac/Brotli-1.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7bc37c4d6b87fb1017ea28c9508b36bbcb0c3d18b4260fcdf08b200c74a6aee8", size = 422475 }, + { url = "https://files.pythonhosted.org/packages/bb/80/6aaddc2f63dbcf2d93c2d204e49c11a9ec93a8c7c63261e2b4bd35198283/Brotli-1.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c0ef38c7a7014ffac184db9e04debe495d317cc9c6fb10071f7fefd93100a4f", size = 2906173 }, + { url = "https://files.pythonhosted.org/packages/ea/1d/e6ca79c96ff5b641df6097d299347507d39a9604bde8915e76bf026d6c77/Brotli-1.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91d7cc2a76b5567591d12c01f019dd7afce6ba8cba6571187e21e2fc418ae648", size = 2943803 }, + { url = "https://files.pythonhosted.org/packages/ac/a3/d98d2472e0130b7dd3acdbb7f390d478123dbf62b7d32bda5c830a96116d/Brotli-1.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a93dde851926f4f2678e704fadeb39e16c35d8baebd5252c9fd94ce8ce68c4a0", size = 2918946 }, + { url = "https://files.pythonhosted.org/packages/c4/a5/c69e6d272aee3e1423ed005d8915a7eaa0384c7de503da987f2d224d0721/Brotli-1.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0db75f47be8b8abc8d9e31bc7aad0547ca26f24a54e6fd10231d623f183d089", size = 2845707 }, + { url = "https://files.pythonhosted.org/packages/58/9f/4149d38b52725afa39067350696c09526de0125ebfbaab5acc5af28b42ea/Brotli-1.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6967ced6730aed543b8673008b5a391c3b1076d834ca438bbd70635c73775368", size = 2936231 }, + { url = "https://files.pythonhosted.org/packages/5a/5a/145de884285611838a16bebfdb060c231c52b8f84dfbe52b852a15780386/Brotli-1.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7eedaa5d036d9336c95915035fb57422054014ebdeb6f3b42eac809928e40d0c", size = 2848157 }, + { url = "https://files.pythonhosted.org/packages/50/ae/408b6bfb8525dadebd3b3dd5b19d631da4f7d46420321db44cd99dcf2f2c/Brotli-1.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:d487f5432bf35b60ed625d7e1b448e2dc855422e87469e3f450aa5552b0eb284", size = 3035122 }, + { url = "https://files.pythonhosted.org/packages/af/85/a94e5cfaa0ca449d8f91c3d6f78313ebf919a0dbd55a100c711c6e9655bc/Brotli-1.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:832436e59afb93e1836081a20f324cb185836c617659b07b129141a8426973c7", size = 2930206 }, + { url = "https://files.pythonhosted.org/packages/c2/f0/a61d9262cd01351df22e57ad7c34f66794709acab13f34be2675f45bf89d/Brotli-1.1.0-cp313-cp313-win32.whl", hash = "sha256:43395e90523f9c23a3d5bdf004733246fba087f2948f87ab28015f12359ca6a0", size = 333804 }, + { url = "https://files.pythonhosted.org/packages/7e/c1/ec214e9c94000d1c1974ec67ced1c970c148aa6b8d8373066123fc3dbf06/Brotli-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:9011560a466d2eb3f5a6e4929cf4a09be405c64154e12df0dd72713f6500e32b", size = 358517 }, +] + +[[package]] +name = "brotlicffi" +version = "1.1.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/95/9d/70caa61192f570fcf0352766331b735afa931b4c6bc9a348a0925cc13288/brotlicffi-1.1.0.0.tar.gz", hash = "sha256:b77827a689905143f87915310b93b273ab17888fd43ef350d4832c4a71083c13", size = 465192 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/11/7b96009d3dcc2c931e828ce1e157f03824a69fb728d06bfd7b2fc6f93718/brotlicffi-1.1.0.0-cp37-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9b7ae6bd1a3f0df532b6d67ff674099a96d22bc0948955cb338488c31bfb8851", size = 453786 }, + { url = "https://files.pythonhosted.org/packages/d6/e6/a8f46f4a4ee7856fbd6ac0c6fb0dc65ed181ba46cd77875b8d9bbe494d9e/brotlicffi-1.1.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19ffc919fa4fc6ace69286e0a23b3789b4219058313cf9b45625016bf7ff996b", size = 2911165 }, + { url = "https://files.pythonhosted.org/packages/be/20/201559dff14e83ba345a5ec03335607e47467b6633c210607e693aefac40/brotlicffi-1.1.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9feb210d932ffe7798ee62e6145d3a757eb6233aa9a4e7db78dd3690d7755814", size = 2927895 }, + { url = "https://files.pythonhosted.org/packages/cd/15/695b1409264143be3c933f708a3f81d53c4a1e1ebbc06f46331decbf6563/brotlicffi-1.1.0.0-cp37-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84763dbdef5dd5c24b75597a77e1b30c66604725707565188ba54bab4f114820", size = 2851834 }, + { url = "https://files.pythonhosted.org/packages/b4/40/b961a702463b6005baf952794c2e9e0099bde657d0d7e007f923883b907f/brotlicffi-1.1.0.0-cp37-abi3-win32.whl", hash = "sha256:1b12b50e07c3911e1efa3a8971543e7648100713d4e0971b13631cce22c587eb", size = 341731 }, + { url = "https://files.pythonhosted.org/packages/1c/fa/5408a03c041114ceab628ce21766a4ea882aa6f6f0a800e04ee3a30ec6b9/brotlicffi-1.1.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:994a4f0681bb6c6c3b0925530a1926b7a189d878e6e5e38fae8efa47c5d9c613", size = 366783 }, + { url = "https://files.pythonhosted.org/packages/e5/3b/bd4f3d2bcf2306ae66b0346f5b42af1962480b200096ffc7abc3bd130eca/brotlicffi-1.1.0.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:2e4aeb0bd2540cb91b069dbdd54d458da8c4334ceaf2d25df2f4af576d6766ca", size = 397397 }, + { url = "https://files.pythonhosted.org/packages/54/10/1fd57864449360852c535c2381ee7120ba8f390aa3869df967c44ca7eba1/brotlicffi-1.1.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b7b0033b0d37bb33009fb2fef73310e432e76f688af76c156b3594389d81391", size = 379698 }, + { url = "https://files.pythonhosted.org/packages/e5/95/15aa422aa6450e6556e54a5fd1650ff59f470aed77ac739aa90ab63dc611/brotlicffi-1.1.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:54a07bb2374a1eba8ebb52b6fafffa2afd3c4df85ddd38fcc0511f2bb387c2a8", size = 378635 }, + { url = "https://files.pythonhosted.org/packages/6c/a7/f254e13b2cb43337d6d99a4ec10394c134e41bfda8a2eff15b75627f4a3d/brotlicffi-1.1.0.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7901a7dc4b88f1c1475de59ae9be59799db1007b7d059817948d8e4f12e24e35", size = 385719 }, + { url = "https://files.pythonhosted.org/packages/72/a9/0971251c4427c14b2a827dba3d910d4d3330dabf23d4278bf6d06a978847/brotlicffi-1.1.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce01c7316aebc7fce59da734286148b1d1b9455f89cf2c8a4dfce7d41db55c2d", size = 361760 }, +] + [[package]] name = "bugsnag" version = "4.7.1" @@ -650,16 +841,15 @@ wheels = [ [[package]] name = "dataclasses-json" -version = "0.5.8" +version = "0.6.7" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "marshmallow" }, - { name = "marshmallow-enum" }, { name = "typing-inspect" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ed/63/10cdc14908f3c9fdb9c21631275d2805853f6156b761a391e6f8918377e1/dataclasses-json-0.5.8.tar.gz", hash = "sha256:6572ac08ad9340abcb74fd8c4c8e9752db2a182a402c8e871d0a8aa119e3804e", size = 44113 } +sdist = { url = "https://files.pythonhosted.org/packages/64/a4/f71d9cf3a5ac257c993b5ca3f93df5f7fb395c725e7f1e6479d2514173c3/dataclasses_json-0.6.7.tar.gz", hash = "sha256:b6b3e528266ea45b9535223bc53ca645f5208833c29229e847b3f26a1cc55fc0", size = 32227 } wheels = [ - { url = "https://files.pythonhosted.org/packages/ae/bc/892e03650133583d5babbb7c7e5c1be6b68df86829c91fdc30cca996630d/dataclasses_json-0.5.8-py3-none-any.whl", hash = "sha256:65b167c15fdf9bde27569c09ac18dd39bf1cc5b7998525024cb4678d2653946c", size = 26211 }, + { url = "https://files.pythonhosted.org/packages/c3/be/d0d44e092656fe7a06b55e6103cbce807cdbdee17884a5367c68c9860853/dataclasses_json-0.6.7-py3-none-any.whl", hash = "sha256:0dbf33f26c8d5305befd61b39d2b3414e8a407bedc2834dea9b8d642666fb40a", size = 28686 }, ] [[package]] @@ -783,6 +973,7 @@ dependencies = [ { name = "frictionless", extra = ["pandas"] }, { name = "fsspec" }, { name = "geopandas" }, + { name = "geopy" }, { name = "gitpython" }, { name = "h5netcdf" }, { name = "html2text" }, @@ -797,6 +988,7 @@ dependencies = [ { name = "pandas" }, { name = "papermill" }, { name = "pdfplumber" }, + { name = "py7zr" }, { name = "pydantic" }, { name = "pygithub" }, { name = "pyhumps" }, @@ -841,8 +1033,10 @@ api = [ ] wizard = [ { name = "geographiclib" }, + { name = "moviepy" }, { name = "plotly" }, { name = "pyproj" }, + { name = "sentence-transformers" }, { name = "statsmodels" }, { name = "streamlit" }, { name = "streamlit-ace" }, @@ -850,6 +1044,7 @@ wizard = [ { name = "streamlit-agraph" }, { name = "streamlit-extras" }, { name = "streamlit-feedback" }, + { name = "torch" }, ] [package.dev-dependencies] @@ -873,6 +1068,7 @@ dev = [ { name = "mkdocs-git-revision-date-localized-plugin" }, { name = "mkdocs-glightbox" }, { name = "mkdocs-jupyter" }, + { name = "mkdocs-material" }, { name = "pandas-stubs" }, { name = "plotly" }, { name = "pyright" }, @@ -893,15 +1089,17 @@ requires-dist = [ { name = "earthengine-api", specifier = ">=0.1.411" }, { name = "fastapi", marker = "extra == 'api'", specifier = ">=0.109.0" }, { name = "fasteners", specifier = ">=0.19" }, - { name = "frictionless", extras = ["pandas"], specifier = ">=4.40.8,<5.0.0" }, - { name = "fsspec", specifier = "==2022.11.0" }, + { name = "frictionless", extras = 
["pandas"], specifier = ">=5.0.3" }, + { name = "fsspec", specifier = ">=2022.11.0" }, { name = "geographiclib", marker = "extra == 'wizard'", specifier = ">=2.0" }, { name = "geopandas", specifier = ">=0.14.1" }, + { name = "geopy", specifier = ">=2.4.1" }, { name = "gitpython", specifier = ">=3.1.30" }, { name = "h5netcdf", specifier = ">=1.3.0" }, { name = "html2text", specifier = ">=2020.1.16" }, { name = "joblib", marker = "extra == 'api'", specifier = ">=1.3.2" }, { name = "jupyterlab", specifier = ">=3.1.13" }, + { name = "moviepy", marker = "extra == 'wizard'", specifier = ">=2.1.1" }, { name = "numpy", specifier = ">=1.22.1" }, { name = "odfpy", specifier = ">=1.4.1" }, { name = "openai", specifier = ">=1.3.6" }, @@ -909,10 +1107,11 @@ requires-dist = [ { name = "owid-catalog", editable = "lib/catalog" }, { name = "owid-datautils", editable = "lib/datautils" }, { name = "owid-repack", editable = "lib/repack" }, - { name = "pandas", specifier = "==2.2.2" }, + { name = "pandas", specifier = "==2.2.3" }, { name = "papermill", specifier = ">=2.3.3" }, { name = "pdfplumber", specifier = ">=0.9.0" }, { name = "plotly", marker = "extra == 'wizard'", specifier = ">=5.23.0" }, + { name = "py7zr", specifier = ">=0.22.0" }, { name = "pydantic", specifier = ">=1.9.0" }, { name = "pygithub", specifier = ">=2.3.0" }, { name = "pyhumps", specifier = ">=3.8.0" }, @@ -932,6 +1131,7 @@ requires-dist = [ { name = "scikit-learn", specifier = ">=1.5.2" }, { name = "scipy", specifier = ">=1.11.2" }, { name = "selenium", specifier = ">=4.15.1" }, + { name = "sentence-transformers", marker = "extra == 'wizard'", specifier = ">=2.2.2" }, { name = "sh", specifier = "==1.14.3" }, { name = "shapely", specifier = ">=2.0.3" }, { name = "simplejson", specifier = ">=3.17.6" }, @@ -939,7 +1139,7 @@ requires-dist = [ { name = "sparqlwrapper", specifier = ">=1.8.5" }, { name = "sqlalchemy", specifier = ">=2.0.30" }, { name = "statsmodels", marker = "extra == 'wizard'", specifier = ">=0.14.4" }, - { name = "streamlit", marker = "extra == 'wizard'", specifier = ">=1.39.0" }, + { name = "streamlit", marker = "extra == 'wizard'", specifier = ">=1.40.0" }, { name = "streamlit-ace", marker = "extra == 'wizard'", specifier = ">=0.1.1" }, { name = "streamlit-aggrid", marker = "extra == 'wizard'", specifier = ">=0.3.4.post3" }, { name = "streamlit-agraph", marker = "extra == 'wizard'", specifier = ">=0.0.45" }, @@ -948,6 +1148,7 @@ requires-dist = [ { name = "structlog", specifier = ">=21.5.0" }, { name = "tenacity", specifier = ">=8.0.1" }, { name = "tiktoken", specifier = ">=0.7.0" }, + { name = "torch", marker = "extra == 'wizard'", specifier = "<2.3.0" }, { name = "typing-extensions", specifier = ">=4.7.1" }, { name = "unidecode", specifier = ">=1.3.2" }, { name = "uvicorn", extras = ["standard"], marker = "extra == 'api'", specifier = ">=0.25.0" }, @@ -975,10 +1176,11 @@ dev = [ { name = "mkdocs-click", specifier = ">=0.8.1" }, { name = "mkdocs-exclude", specifier = ">=1.0.2" }, { name = "mkdocs-gen-files", specifier = ">=0.5.0" }, - { name = "mkdocs-git-authors-plugin", specifier = ">=0.7.2" }, + { name = "mkdocs-git-authors-plugin", specifier = ">=0.9.2" }, { name = "mkdocs-git-revision-date-localized-plugin", specifier = ">=1.2.6" }, { name = "mkdocs-glightbox", specifier = ">=0.3.7" }, { name = "mkdocs-jupyter", specifier = ">=0.24.8" }, + { name = "mkdocs-material", specifier = ">=9.5.34" }, { name = "pandas-stubs", specifier = "==1.2.0.62" }, { name = "plotly", specifier = ">=5.23.0" }, { name = "pyright", 
specifier = "==1.1.373" }, @@ -1149,15 +1351,18 @@ wheels = [ [[package]] name = "frictionless" -version = "4.40.11" +version = "5.18.0" source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "attrs" }, { name = "chardet" }, + { name = "humanize" }, { name = "isodate" }, { name = "jinja2" }, { name = "jsonschema" }, { name = "marko" }, { name = "petl" }, + { name = "pydantic" }, { name = "python-dateutil" }, { name = "python-slugify" }, { name = "pyyaml" }, @@ -1167,16 +1372,87 @@ dependencies = [ { name = "stringcase" }, { name = "tabulate" }, { name = "typer" }, + { name = "typing-extensions" }, { name = "validators" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/56/90/6e4126b50d4edeffcd5f39657b4648fb6f3d17d43088ab09d62f49e1cad0/frictionless-4.40.11.tar.gz", hash = "sha256:e7d83d82cd3273820c74ac715e8d78285697f1eceda49a2417a72f839420d42e", size = 258035 } +sdist = { url = "https://files.pythonhosted.org/packages/26/b4/ded94e51965f95100893adcf78ef9307553414a0bb56217adf68450bd7e7/frictionless-5.18.0.tar.gz", hash = "sha256:4b21a10d3ac67e46a4a58a1e8a8a27c6882af4d1608eadfb6ccbfde0b5eef6b9", size = 74371639 } wheels = [ - { url = "https://files.pythonhosted.org/packages/4f/cb/13b97bcf9c2ed6a4dc3b7d6fe99f7d7a1f395a2847ca3d951afbf82d6787/frictionless-4.40.11-py2.py3-none-any.whl", hash = "sha256:5b2bbb3779d5e2ecfe99add2458a7b2bcb61eae6173696ea57ef0b28c085d976", size = 419905 }, + { url = "https://files.pythonhosted.org/packages/fb/e5/c7ff55b81286f24ddfaff45c9d46614c3e40c72a8ebd036c2cc18d902243/frictionless-5.18.0-py3-none-any.whl", hash = "sha256:a82433b81cfcfae21328aad6b93854feb86d5d054b22ac147672eb9c254b6a3d", size = 535385 }, ] [package.optional-dependencies] pandas = [ { name = "pandas" }, + { name = "pyarrow" }, +] + +[[package]] +name = "frozenlist" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8f/ed/0f4cec13a93c02c47ec32d81d11c0c1efbadf4a471e3f3ce7cad366cbbd3/frozenlist-1.5.0.tar.gz", hash = "sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817", size = 39930 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/79/29d44c4af36b2b240725dce566b20f63f9b36ef267aaaa64ee7466f4f2f8/frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a", size = 94451 }, + { url = "https://files.pythonhosted.org/packages/47/47/0c999aeace6ead8a44441b4f4173e2261b18219e4ad1fe9a479871ca02fc/frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb", size = 54301 }, + { url = "https://files.pythonhosted.org/packages/8d/60/107a38c1e54176d12e06e9d4b5d755b677d71d1219217cee063911b1384f/frozenlist-1.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:15538c0cbf0e4fa11d1e3a71f823524b0c46299aed6e10ebb4c2089abd8c3bec", size = 52213 }, + { url = "https://files.pythonhosted.org/packages/17/62/594a6829ac5679c25755362a9dc93486a8a45241394564309641425d3ff6/frozenlist-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e79225373c317ff1e35f210dd5f1344ff31066ba8067c307ab60254cd3a78ad5", size = 240946 }, + { url = "https://files.pythonhosted.org/packages/7e/75/6c8419d8f92c80dd0ee3f63bdde2702ce6398b0ac8410ff459f9b6f2f9cb/frozenlist-1.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9272fa73ca71266702c4c3e2d4a28553ea03418e591e377a03b8e3659d94fa76", size = 264608 
}, + { url = "https://files.pythonhosted.org/packages/88/3e/82a6f0b84bc6fb7e0be240e52863c6d4ab6098cd62e4f5b972cd31e002e8/frozenlist-1.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:498524025a5b8ba81695761d78c8dd7382ac0b052f34e66939c42df860b8ff17", size = 261361 }, + { url = "https://files.pythonhosted.org/packages/fd/85/14e5f9ccac1b64ff2f10c927b3ffdf88772aea875882406f9ba0cec8ad84/frozenlist-1.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:92b5278ed9d50fe610185ecd23c55d8b307d75ca18e94c0e7de328089ac5dcba", size = 231649 }, + { url = "https://files.pythonhosted.org/packages/ee/59/928322800306f6529d1852323014ee9008551e9bb027cc38d276cbc0b0e7/frozenlist-1.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f3c8c1dacd037df16e85227bac13cca58c30da836c6f936ba1df0c05d046d8d", size = 241853 }, + { url = "https://files.pythonhosted.org/packages/7d/bd/e01fa4f146a6f6c18c5d34cab8abdc4013774a26c4ff851128cd1bd3008e/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f2ac49a9bedb996086057b75bf93538240538c6d9b38e57c82d51f75a73409d2", size = 243652 }, + { url = "https://files.pythonhosted.org/packages/a5/bd/e4771fd18a8ec6757033f0fa903e447aecc3fbba54e3630397b61596acf0/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e66cc454f97053b79c2ab09c17fbe3c825ea6b4de20baf1be28919460dd7877f", size = 241734 }, + { url = "https://files.pythonhosted.org/packages/21/13/c83821fa5544af4f60c5d3a65d054af3213c26b14d3f5f48e43e5fb48556/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:5a3ba5f9a0dfed20337d3e966dc359784c9f96503674c2faf015f7fe8e96798c", size = 260959 }, + { url = "https://files.pythonhosted.org/packages/71/f3/1f91c9a9bf7ed0e8edcf52698d23f3c211d8d00291a53c9f115ceb977ab1/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:6321899477db90bdeb9299ac3627a6a53c7399c8cd58d25da094007402b039ab", size = 262706 }, + { url = "https://files.pythonhosted.org/packages/4c/22/4a256fdf5d9bcb3ae32622c796ee5ff9451b3a13a68cfe3f68e2c95588ce/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:76e4753701248476e6286f2ef492af900ea67d9706a0155335a40ea21bf3b2f5", size = 250401 }, + { url = "https://files.pythonhosted.org/packages/af/89/c48ebe1f7991bd2be6d5f4ed202d94960c01b3017a03d6954dd5fa9ea1e8/frozenlist-1.5.0-cp310-cp310-win32.whl", hash = "sha256:977701c081c0241d0955c9586ffdd9ce44f7a7795df39b9151cd9a6fd0ce4cfb", size = 45498 }, + { url = "https://files.pythonhosted.org/packages/28/2f/cc27d5f43e023d21fe5c19538e08894db3d7e081cbf582ad5ed366c24446/frozenlist-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:189f03b53e64144f90990d29a27ec4f7997d91ed3d01b51fa39d2dbe77540fd4", size = 51622 }, + { url = "https://files.pythonhosted.org/packages/79/43/0bed28bf5eb1c9e4301003b74453b8e7aa85fb293b31dde352aac528dafc/frozenlist-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:fd74520371c3c4175142d02a976aee0b4cb4a7cc912a60586ffd8d5929979b30", size = 94987 }, + { url = "https://files.pythonhosted.org/packages/bb/bf/b74e38f09a246e8abbe1e90eb65787ed745ccab6eaa58b9c9308e052323d/frozenlist-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2f3f7a0fbc219fb4455264cae4d9f01ad41ae6ee8524500f381de64ffaa077d5", size = 54584 }, + { url = "https://files.pythonhosted.org/packages/2c/31/ab01375682f14f7613a1ade30149f684c84f9b8823a4391ed950c8285656/frozenlist-1.5.0-cp311-cp311-macosx_11_0_arm64.whl", 
hash = "sha256:f47c9c9028f55a04ac254346e92977bf0f166c483c74b4232bee19a6697e4778", size = 52499 }, + { url = "https://files.pythonhosted.org/packages/98/a8/d0ac0b9276e1404f58fec3ab6e90a4f76b778a49373ccaf6a563f100dfbc/frozenlist-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0996c66760924da6e88922756d99b47512a71cfd45215f3570bf1e0b694c206a", size = 276357 }, + { url = "https://files.pythonhosted.org/packages/ad/c9/c7761084fa822f07dac38ac29f841d4587570dd211e2262544aa0b791d21/frozenlist-1.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a2fe128eb4edeabe11896cb6af88fca5346059f6c8d807e3b910069f39157869", size = 287516 }, + { url = "https://files.pythonhosted.org/packages/a1/ff/cd7479e703c39df7bdab431798cef89dc75010d8aa0ca2514c5b9321db27/frozenlist-1.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1a8ea951bbb6cacd492e3948b8da8c502a3f814f5d20935aae74b5df2b19cf3d", size = 283131 }, + { url = "https://files.pythonhosted.org/packages/59/a0/370941beb47d237eca4fbf27e4e91389fd68699e6f4b0ebcc95da463835b/frozenlist-1.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de537c11e4aa01d37db0d403b57bd6f0546e71a82347a97c6a9f0dcc532b3a45", size = 261320 }, + { url = "https://files.pythonhosted.org/packages/b8/5f/c10123e8d64867bc9b4f2f510a32042a306ff5fcd7e2e09e5ae5100ee333/frozenlist-1.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c2623347b933fcb9095841f1cc5d4ff0b278addd743e0e966cb3d460278840d", size = 274877 }, + { url = "https://files.pythonhosted.org/packages/fa/79/38c505601ae29d4348f21706c5d89755ceded02a745016ba2f58bd5f1ea6/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cee6798eaf8b1416ef6909b06f7dc04b60755206bddc599f52232606e18179d3", size = 269592 }, + { url = "https://files.pythonhosted.org/packages/19/e2/39f3a53191b8204ba9f0bb574b926b73dd2efba2a2b9d2d730517e8f7622/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f5f9da7f5dbc00a604fe74aa02ae7c98bcede8a3b8b9666f9f86fc13993bc71a", size = 265934 }, + { url = "https://files.pythonhosted.org/packages/d5/c9/3075eb7f7f3a91f1a6b00284af4de0a65a9ae47084930916f5528144c9dd/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:90646abbc7a5d5c7c19461d2e3eeb76eb0b204919e6ece342feb6032c9325ae9", size = 283859 }, + { url = "https://files.pythonhosted.org/packages/05/f5/549f44d314c29408b962fa2b0e69a1a67c59379fb143b92a0a065ffd1f0f/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:bdac3c7d9b705d253b2ce370fde941836a5f8b3c5c2b8fd70940a3ea3af7f4f2", size = 287560 }, + { url = "https://files.pythonhosted.org/packages/9d/f8/cb09b3c24a3eac02c4c07a9558e11e9e244fb02bf62c85ac2106d1eb0c0b/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03d33c2ddbc1816237a67f66336616416e2bbb6beb306e5f890f2eb22b959cdf", size = 277150 }, + { url = "https://files.pythonhosted.org/packages/37/48/38c2db3f54d1501e692d6fe058f45b6ad1b358d82cd19436efab80cfc965/frozenlist-1.5.0-cp311-cp311-win32.whl", hash = "sha256:237f6b23ee0f44066219dae14c70ae38a63f0440ce6750f868ee08775073f942", size = 45244 }, + { url = "https://files.pythonhosted.org/packages/ca/8c/2ddffeb8b60a4bce3b196c32fcc30d8830d4615e7b492ec2071da801b8ad/frozenlist-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:0cc974cc93d32c42e7b0f6cf242a6bd941c57c61b618e78b6c0a96cb72788c1d", size = 51634 }, + { url = 
"https://files.pythonhosted.org/packages/79/73/fa6d1a96ab7fd6e6d1c3500700963eab46813847f01ef0ccbaa726181dd5/frozenlist-1.5.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:31115ba75889723431aa9a4e77d5f398f5cf976eea3bdf61749731f62d4a4a21", size = 94026 }, + { url = "https://files.pythonhosted.org/packages/ab/04/ea8bf62c8868b8eada363f20ff1b647cf2e93377a7b284d36062d21d81d1/frozenlist-1.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7437601c4d89d070eac8323f121fcf25f88674627505334654fd027b091db09d", size = 54150 }, + { url = "https://files.pythonhosted.org/packages/d0/9a/8e479b482a6f2070b26bda572c5e6889bb3ba48977e81beea35b5ae13ece/frozenlist-1.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7948140d9f8ece1745be806f2bfdf390127cf1a763b925c4a805c603df5e697e", size = 51927 }, + { url = "https://files.pythonhosted.org/packages/e3/12/2aad87deb08a4e7ccfb33600871bbe8f0e08cb6d8224371387f3303654d7/frozenlist-1.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feeb64bc9bcc6b45c6311c9e9b99406660a9c05ca8a5b30d14a78555088b0b3a", size = 282647 }, + { url = "https://files.pythonhosted.org/packages/77/f2/07f06b05d8a427ea0060a9cef6e63405ea9e0d761846b95ef3fb3be57111/frozenlist-1.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:683173d371daad49cffb8309779e886e59c2f369430ad28fe715f66d08d4ab1a", size = 289052 }, + { url = "https://files.pythonhosted.org/packages/bd/9f/8bf45a2f1cd4aa401acd271b077989c9267ae8463e7c8b1eb0d3f561b65e/frozenlist-1.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7d57d8f702221405a9d9b40f9da8ac2e4a1a8b5285aac6100f3393675f0a85ee", size = 291719 }, + { url = "https://files.pythonhosted.org/packages/41/d1/1f20fd05a6c42d3868709b7604c9f15538a29e4f734c694c6bcfc3d3b935/frozenlist-1.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30c72000fbcc35b129cb09956836c7d7abf78ab5416595e4857d1cae8d6251a6", size = 267433 }, + { url = "https://files.pythonhosted.org/packages/af/f2/64b73a9bb86f5a89fb55450e97cd5c1f84a862d4ff90d9fd1a73ab0f64a5/frozenlist-1.5.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000a77d6034fbad9b6bb880f7ec073027908f1b40254b5d6f26210d2dab1240e", size = 283591 }, + { url = "https://files.pythonhosted.org/packages/29/e2/ffbb1fae55a791fd6c2938dd9ea779509c977435ba3940b9f2e8dc9d5316/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5d7f5a50342475962eb18b740f3beecc685a15b52c91f7d975257e13e029eca9", size = 273249 }, + { url = "https://files.pythonhosted.org/packages/2e/6e/008136a30798bb63618a114b9321b5971172a5abddff44a100c7edc5ad4f/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:87f724d055eb4785d9be84e9ebf0f24e392ddfad00b3fe036e43f489fafc9039", size = 271075 }, + { url = "https://files.pythonhosted.org/packages/ae/f0/4e71e54a026b06724cec9b6c54f0b13a4e9e298cc8db0f82ec70e151f5ce/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:6e9080bb2fb195a046e5177f10d9d82b8a204c0736a97a153c2466127de87784", size = 285398 }, + { url = "https://files.pythonhosted.org/packages/4d/36/70ec246851478b1c0b59f11ef8ade9c482ff447c1363c2bd5fad45098b12/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9b93d7aaa36c966fa42efcaf716e6b3900438632a626fb09c049f6a2f09fc631", size = 294445 }, + { url = 
"https://files.pythonhosted.org/packages/37/e0/47f87544055b3349b633a03c4d94b405956cf2437f4ab46d0928b74b7526/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:52ef692a4bc60a6dd57f507429636c2af8b6046db8b31b18dac02cbc8f507f7f", size = 280569 }, + { url = "https://files.pythonhosted.org/packages/f9/7c/490133c160fb6b84ed374c266f42800e33b50c3bbab1652764e6e1fc498a/frozenlist-1.5.0-cp312-cp312-win32.whl", hash = "sha256:29d94c256679247b33a3dc96cce0f93cbc69c23bf75ff715919332fdbb6a32b8", size = 44721 }, + { url = "https://files.pythonhosted.org/packages/b1/56/4e45136ffc6bdbfa68c29ca56ef53783ef4c2fd395f7cbf99a2624aa9aaa/frozenlist-1.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:8969190d709e7c48ea386db202d708eb94bdb29207a1f269bab1196ce0dcca1f", size = 51329 }, + { url = "https://files.pythonhosted.org/packages/da/3b/915f0bca8a7ea04483622e84a9bd90033bab54bdf485479556c74fd5eaf5/frozenlist-1.5.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7a1a048f9215c90973402e26c01d1cff8a209e1f1b53f72b95c13db61b00f953", size = 91538 }, + { url = "https://files.pythonhosted.org/packages/c7/d1/a7c98aad7e44afe5306a2b068434a5830f1470675f0e715abb86eb15f15b/frozenlist-1.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dd47a5181ce5fcb463b5d9e17ecfdb02b678cca31280639255ce9d0e5aa67af0", size = 52849 }, + { url = "https://files.pythonhosted.org/packages/3a/c8/76f23bf9ab15d5f760eb48701909645f686f9c64fbb8982674c241fbef14/frozenlist-1.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1431d60b36d15cda188ea222033eec8e0eab488f39a272461f2e6d9e1a8e63c2", size = 50583 }, + { url = "https://files.pythonhosted.org/packages/1f/22/462a3dd093d11df623179d7754a3b3269de3b42de2808cddef50ee0f4f48/frozenlist-1.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6482a5851f5d72767fbd0e507e80737f9c8646ae7fd303def99bfe813f76cf7f", size = 265636 }, + { url = "https://files.pythonhosted.org/packages/80/cf/e075e407fc2ae7328155a1cd7e22f932773c8073c1fc78016607d19cc3e5/frozenlist-1.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44c49271a937625619e862baacbd037a7ef86dd1ee215afc298a417ff3270608", size = 270214 }, + { url = "https://files.pythonhosted.org/packages/a1/58/0642d061d5de779f39c50cbb00df49682832923f3d2ebfb0fedf02d05f7f/frozenlist-1.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:12f78f98c2f1c2429d42e6a485f433722b0061d5c0b0139efa64f396efb5886b", size = 273905 }, + { url = "https://files.pythonhosted.org/packages/ab/66/3fe0f5f8f2add5b4ab7aa4e199f767fd3b55da26e3ca4ce2cc36698e50c4/frozenlist-1.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce3aa154c452d2467487765e3adc730a8c153af77ad84096bc19ce19a2400840", size = 250542 }, + { url = "https://files.pythonhosted.org/packages/f6/b8/260791bde9198c87a465224e0e2bb62c4e716f5d198fc3a1dacc4895dbd1/frozenlist-1.5.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b7dc0c4338e6b8b091e8faf0db3168a37101943e687f373dce00959583f7439", size = 267026 }, + { url = "https://files.pythonhosted.org/packages/2e/a4/3d24f88c527f08f8d44ade24eaee83b2627793fa62fa07cbb7ff7a2f7d42/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:45e0896250900b5aa25180f9aec243e84e92ac84bd4a74d9ad4138ef3f5c97de", size = 257690 }, + { url = 
"https://files.pythonhosted.org/packages/de/9a/d311d660420b2beeff3459b6626f2ab4fb236d07afbdac034a4371fe696e/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:561eb1c9579d495fddb6da8959fd2a1fca2c6d060d4113f5844b433fc02f2641", size = 253893 }, + { url = "https://files.pythonhosted.org/packages/c6/23/e491aadc25b56eabd0f18c53bb19f3cdc6de30b2129ee0bc39cd387cd560/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:df6e2f325bfee1f49f81aaac97d2aa757c7646534a06f8f577ce184afe2f0a9e", size = 267006 }, + { url = "https://files.pythonhosted.org/packages/08/c4/ab918ce636a35fb974d13d666dcbe03969592aeca6c3ab3835acff01f79c/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:140228863501b44b809fb39ec56b5d4071f4d0aa6d216c19cbb08b8c5a7eadb9", size = 276157 }, + { url = "https://files.pythonhosted.org/packages/c0/29/3b7a0bbbbe5a34833ba26f686aabfe982924adbdcafdc294a7a129c31688/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7707a25d6a77f5d27ea7dc7d1fc608aa0a478193823f88511ef5e6b8a48f9d03", size = 264642 }, + { url = "https://files.pythonhosted.org/packages/ab/42/0595b3dbffc2e82d7fe658c12d5a5bafcd7516c6bf2d1d1feb5387caa9c1/frozenlist-1.5.0-cp313-cp313-win32.whl", hash = "sha256:31a9ac2b38ab9b5a8933b693db4939764ad3f299fcaa931a3e605bc3460e693c", size = 44914 }, + { url = "https://files.pythonhosted.org/packages/17/c4/b7db1206a3fea44bf3b838ca61deb6f74424a8a5db1dd53ecb21da669be6/frozenlist-1.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:11aabdd62b8b9c4b84081a3c246506d1cddd2dd93ff0ad53ede5defec7886b28", size = 51167 }, + { url = "https://files.pythonhosted.org/packages/c6/c8/a5be5b7550c10858fcf9b0ea054baccab474da77d37f1e828ce043a3a5d4/frozenlist-1.5.0-py3-none-any.whl", hash = "sha256:d994863bba198a4a518b467bb971c56e1db3f180a25c6cf7bb1949c267f748c3", size = 11901 }, ] [[package]] @@ -1229,6 +1505,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3e/b0/69fa7a0f55122847506a42fea6988d03b34136938082f142151bc9d9f7e7/geopandas-0.14.4-py3-none-any.whl", hash = "sha256:3bb6473cb59d51e1a7fe2dbc24a1a063fb0ebdeddf3ce08ddbf8c7ddc99689aa", size = 1109913 }, ] +[[package]] +name = "geopy" +version = "2.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "geographiclib" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0e/fd/ef6d53875ceab72c1fad22dbed5ec1ad04eb378c2251a6a8024bad890c3b/geopy-2.4.1.tar.gz", hash = "sha256:50283d8e7ad07d89be5cb027338c6365a32044df3ae2556ad3f52f4840b3d0d1", size = 117625 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/15/cf2a69ade4b194aa524ac75112d5caac37414b20a3a03e6865dfe0bd1539/geopy-2.4.1-py3-none-any.whl", hash = "sha256:ae8b4bc5c1131820f4d75fce9d4aaaca0c85189b3aa5d64c3dcaf5e3b7b882a7", size = 125437 }, +] + [[package]] name = "ghp-import" version = "2.1.0" @@ -1623,6 +1911,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/56/95/9377bcb415797e44274b51d46e3249eba641711cf3348050f76ee7b15ffc/httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0", size = 76395 }, ] +[[package]] +name = "huggingface-hub" +version = "0.17.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/13/d2/e0d36491422425bb882e4a6432a06aee9e56348aeefd9aab648a995d173b/huggingface_hub-0.17.3.tar.gz", hash = "sha256:40439632b211311f788964602bf8b0d9d6b7a2314fba4e8d67b2ce3ecea0e3fd", size = 254782 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/f3/3fc97336a0e90516901befd4f500f08d691034d387406fdbde85bea827cc/huggingface_hub-0.17.3-py3-none-any.whl", hash = "sha256:545eb3665f6ac587add946e73984148f2ea5c7877eac2e845549730570c1933a", size = 295010 }, +] + +[[package]] +name = "humanize" +version = "4.11.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6a/40/64a912b9330786df25e58127194d4a5a7441f818b400b155e748a270f924/humanize-4.11.0.tar.gz", hash = "sha256:e66f36020a2d5a974c504bd2555cf770621dbdbb6d82f94a6857c0b1ea2608be", size = 80374 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/92/75/4bc3e242ad13f2e6c12e0b0401ab2c5e5c6f0d7da37ec69bc808e24e0ccb/humanize-4.11.0-py3-none-any.whl", hash = "sha256:b53caaec8532bcb2fff70c8826f904c35943f8cecaca29d272d9df38092736c0", size = 128055 }, +] + [[package]] name = "hydra-core" version = "1.3.2" @@ -1646,6 +1961,71 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/7e/d71db821f177828df9dea8c42ac46473366f191be53080e552e628aad991/idna-3.8-py3-none-any.whl", hash = "sha256:050b4e5baadcd44d760cedbd2b8e639f2ff89bbc7a5730fcc662954303377aac", size = 66894 }, ] +[[package]] +name = "imageio" +version = "2.36.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4f/34/a714fd354f5f7fe650477072d4da21446849b20c02045dcf7ac827495121/imageio-2.36.0.tar.gz", hash = "sha256:1c8f294db862c256e9562354d65aa54725b8dafed7f10f02bb3ec20ec1678850", size = 389492 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/e7/26045404a30c8a200e960fb54fbaf4b73d12e58cd28e03b306b084253f4f/imageio-2.36.0-py3-none-any.whl", hash = "sha256:471f1eda55618ee44a3c9960911c35e647d9284c68f077e868df633398f137f0", size = 315414 }, +] + +[[package]] +name = "imageio-ffmpeg" +version = "0.5.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "setuptools" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/73/22/b0a0d96ecdbd4a8493c6cd7914a8b2bfbc39f8660f81c20e3bde847182e0/imageio-ffmpeg-0.5.1.tar.gz", hash = "sha256:0ed7a9b31f560b0c9d929c5291cd430edeb9bed3ce9a497480e536dd4326484c", size = 17704 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/3d/df5dc571520f495ba2152215cd26deebd46e1530eae0261f503bfd137e99/imageio_ffmpeg-0.5.1-py3-none-macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:1460e84712b9d06910c1f7bb524096b0341d4b7844cea6c20e099d0a24e795b1", size = 22532925 }, + { url = "https://files.pythonhosted.org/packages/fe/44/c9e18a73dfd939b5b0ec843870ac72f1f6c17e31a03149b687a85465bff7/imageio_ffmpeg-0.5.1-py3-none-manylinux2010_x86_64.whl", hash = "sha256:5289f75c7f755b499653f3209fea4efd1430cba0e39831c381aad2d458f7a316", size = 26900394 }, + { url = "https://files.pythonhosted.org/packages/cd/ca/8537cdbf1a6852912cb293fa23dc7adf256cec793113485447f0cbf0fe79/imageio_ffmpeg-0.5.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7fa9132a291d5eb28c44553550deb40cbdab831f2a614e55360301a6582eb205", size = 22880461 }, + { url = 
"https://files.pythonhosted.org/packages/a9/97/ff7de8ace4425fffc6e8d7646017500c9d5435df608b13cc34de4835ad4f/imageio_ffmpeg-0.5.1-py3-none-win32.whl", hash = "sha256:89efe2c79979d8174ba8476deb7f74d74c331caee3fb2b65ba2883bec0737625", size = 19652102 }, + { url = "https://files.pythonhosted.org/packages/a9/1c/1b9c72bf839def47626436ea5ebaf643404f7850482c5fafd71a3deeaa94/imageio_ffmpeg-0.5.1-py3-none-win_amd64.whl", hash = "sha256:1521e79e253bedbdd36a547e0cbd94a025ba0b558e17f08fea687d805a0e4698", size = 22619891 }, +] + +[[package]] +name = "inflate64" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8c/99/18f9940d4a3f2cabc4396a587ddf1bd93236bdb372d9e78e2b0365e40990/inflate64-1.0.0.tar.gz", hash = "sha256:3278827b803cf006a1df251f3e13374c7d26db779e5a33329cc11789b804bc2d", size = 895853 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/71/cf/06af80e81dd4bbb7e883291cf1726035d526f066a37c4ed4d4cd88a7a49d/inflate64-1.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a90c0bdf4a7ecddd8a64cc977181810036e35807f56b0bcacee9abb0fcfd18dc", size = 59418 }, + { url = "https://files.pythonhosted.org/packages/c9/4b/6f18918220b1a8e935121cece1dc917e62fa593fc637a621470f9b9a601a/inflate64-1.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:57fe7c14aebf1c5a74fc3b70d355be1280a011521a76aa3895486e62454f4242", size = 36231 }, + { url = "https://files.pythonhosted.org/packages/aa/f4/f4b5dbd78dd5af66b6ca32778ebaa9c14d67b68ea84e96592ccf40786a41/inflate64-1.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d90730165f471d61a1a694a5e354f3ffa938227e8dcecb62d5d728e8069cee94", size = 35738 }, + { url = "https://files.pythonhosted.org/packages/10/23/26289a700550767cf5eb7550f78ad826529706287393f224bbaee3c1b1e2/inflate64-1.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:543f400201f5c101141af3c79c82059e1aa6ef4f1584a7f1fa035fb2e465097f", size = 92855 }, + { url = "https://files.pythonhosted.org/packages/b8/f4/e387a50f5027194eac4f9712d57b97e3e1a012402eaae98bcf1ebe8a97d1/inflate64-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ceca14f7ec19fb44b047f56c50efb7521b389d222bba2b0a10286a0caeb03fa", size = 93141 }, + { url = "https://files.pythonhosted.org/packages/33/c8/e516aecd9ed0dc75d8df041ed4ef80f2e2be39d0e516c7269b7f274e760a/inflate64-1.0.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b559937a42f0c175b4d2dfc7eb53b97bdc87efa9add15ed5549c6abc1e89d02f", size = 95262 }, + { url = "https://files.pythonhosted.org/packages/0b/aa/ed3ab5f8c13afc432fb382edf97cede7a6f9be73ecf98bfe64b686c8d223/inflate64-1.0.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5ff8bd2a562343fcbc4eea26fdc368904a3b5f6bb8262344274d3d74a1de15bb", size = 95912 }, + { url = "https://files.pythonhosted.org/packages/e0/64/5637c4f67ed15518c0765b85b528ed79536caaf8ba167a9f7173e334d4a8/inflate64-1.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:0fe481f31695d35a433c3044ac8fd5d9f5069aaad03a0c04b570eb258ce655aa", size = 35166 }, + { url = "https://files.pythonhosted.org/packages/af/92/701b3c76b1cf244026c3e78dff8487955cf6960c1d9f350e2820a0d1a5d9/inflate64-1.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:35a45f6979ad5874d4d4898c2fc770b136e61b96b850118fdaec5a5af1b9123a", size = 59450 }, + { url = "https://files.pythonhosted.org/packages/bb/1d/af0253fafc27cadd29e3b111ebb3011b8c913a3554b403c90c7595f5933e/inflate64-1.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:022ca1cc928e7365a05f7371ff06af143c6c667144965e2cf9a9236a2ae1c291", size = 36267 }, + { url = "https://files.pythonhosted.org/packages/b6/22/7949030be11f4754bd6ed7067e9bebdf614013b89ccd4638330a85821b51/inflate64-1.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:46792ecf3565d64fd2c519b0a780c03a57e195613c9954ef94e739a057b3fd06", size = 35740 }, + { url = "https://files.pythonhosted.org/packages/e4/87/c6ce0093a345c04811f6171a367665dec17dcc4617ca150dd37e9ae7bd33/inflate64-1.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a70ea2e456c15f7aa7c74b8ab8f20b4f8940ec657604c9f0a9de3342f280fff", size = 95896 }, + { url = "https://files.pythonhosted.org/packages/62/d6/fe113b12773cad2c093d381c2b1629f9cfa240c9ad86a7f9f9079e7a51b5/inflate64-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e243ea9bd36a035059f2365bd6d156ff59717fbafb0255cb0c75bf151bf6904", size = 96007 }, + { url = "https://files.pythonhosted.org/packages/f0/a6/9165bee4b7fc5af949fec12a2cea7ad73bf9ee97dfb96a0276274c48e709/inflate64-1.0.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4dc392dec1cd11cacda3d2637214ca45e38202e8a4f31d4a4e566d6e90625fc4", size = 98297 }, + { url = "https://files.pythonhosted.org/packages/ee/72/0aeb360101eeed32696fc6c623bc1780fac895a9fc2e93b582cb1e22ca54/inflate64-1.0.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8b402a50eda7ee75f342fc346d33a41bca58edc222a4b17f9be0db1daed459fa", size = 98858 }, + { url = "https://files.pythonhosted.org/packages/94/4a/8301ad59b57d9de504b0fdce22bf980dfb231753e6d7aed12af938f7f9fd/inflate64-1.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:f5924499dc8800928c0ee4580fa8eb4ffa880b2cce4431537d0390e503a9c9ee", size = 35167 }, + { url = "https://files.pythonhosted.org/packages/18/82/47021b8919c1dc276d0502296f15ffac1cd648b94b35cadb14cb812b6199/inflate64-1.0.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0c644bf7208e20825ca3bbb5fb1f7f495cfcb49eb01a5f67338796d44a42f2bf", size = 59509 }, + { url = "https://files.pythonhosted.org/packages/e0/c9/00701be8e48dc9c9b9488001d9c66d6cb6f6bb0c48af9abf33a69726d130/inflate64-1.0.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9964a4eaf26a9d36f82a1d9b12c28e35800dd3d99eb340453ed12ac90c2976a8", size = 36305 }, + { url = "https://files.pythonhosted.org/packages/25/c0/11dea5e298b2e7d61f0fbd1005553e8796e35536751980b676547fcc57ef/inflate64-1.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2cccded63865640d03253897be7232b2bbac295fe43914c61f86a57aa23bb61d", size = 35756 }, + { url = "https://files.pythonhosted.org/packages/86/ba/4debdaaafdc21853621caf463a498a754ee4352893454c596dbd65294e9f/inflate64-1.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d491f104fb3701926ebd82b8c9250dfba0ddcab584504e26f1e4adb26730378d", size = 96127 }, + { url = "https://files.pythonhosted.org/packages/89/81/8f559c199ec13d0b70d0dc46811490b2976873c96c564941583777e9b343/inflate64-1.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ebad4a6cd2a2c1d81be0b09d4006479f3b258803c49a9224ef8ca0b649072fa", size = 96903 }, + { url = "https://files.pythonhosted.org/packages/46/41/39ac4c7e17d0690578b716a0ff34e00600616994795b0645fd61fc600c0f/inflate64-1.0.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:6823b2c0cff3a8159140f3b17ec64fb8ec0e663b45a6593618ecdde8aeecb5b2", size = 98855 }, + { url = 
"https://files.pythonhosted.org/packages/44/dd/be5d69492c180f94a6af8a15564ce365bdcb84bd1a6fb32949d6913959aa/inflate64-1.0.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:228d504239d27958e71fc77e3119a6ac4528127df38468a0c95a5bd3927204b8", size = 99884 }, + { url = "https://files.pythonhosted.org/packages/8c/0d/a5266bd4f2cdb7fad1eae3ffe4dcc16f9769323660a0a6cfbe9cc1d2cf03/inflate64-1.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:ae2572e06bcfe15e3bbf77d4e4a6d6c55e2a70d6abceaaf60c5c3653ddb96dfd", size = 35334 }, + { url = "https://files.pythonhosted.org/packages/53/91/43238dd8a7e5bab71abae872c09931db4b31aebf672afccb305f79aacb3e/inflate64-1.0.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f79542478e49e471e8b23556700e6f688a40dc93e9a746f77a546c13251b59b1", size = 34648 }, + { url = "https://files.pythonhosted.org/packages/ef/6f/ce090934a80c1fd0b5b07c125ed6eb2845f11a78af344d69c0f051dcab97/inflate64-1.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a270be6b10cde01258c0097a663a307c62d12c78eb8f62f8e29f205335942c9", size = 36473 }, + { url = "https://files.pythonhosted.org/packages/b4/fe/2cd4bf78696213b807860002c182dd1751ba52c1559143b1b8daa7904733/inflate64-1.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1616a87ff04f583e9558cc247ec0b72a30d540ee0c17cc77823be175c0ec92f0", size = 36478 }, + { url = "https://files.pythonhosted.org/packages/43/dd/e62444c0ef7d1228b622e6d3dacf9ea237d8807a78619a83832a3b4a5adf/inflate64-1.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:137ca6b315f0157a786c3a755a09395ca69aed8bcf42ad3437cb349f5ebc86d2", size = 35630 }, +] + [[package]] name = "inflect" version = "7.4.0" @@ -2428,18 +2808,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3c/78/c1de55eb3311f2c200a8b91724414b8d6f5ae78891c15d9d936ea43c3dba/marshmallow-3.22.0-py3-none-any.whl", hash = "sha256:71a2dce49ef901c3f97ed296ae5051135fd3febd2bf43afe0ae9a82143a494d9", size = 49334 }, ] -[[package]] -name = "marshmallow-enum" -version = "1.5.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "marshmallow" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/8e/8c/ceecdce57dfd37913143087fffd15f38562a94f0d22823e3c66eac0dca31/marshmallow-enum-1.5.1.tar.gz", hash = "sha256:38e697e11f45a8e64b4a1e664000897c659b60aa57bfa18d44e226a9920b6e58", size = 4013 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c6/59/ef3a3dc499be447098d4a89399beb869f813fee1b5a57d5d79dee2c1bf51/marshmallow_enum-1.5.1-py2.py3-none-any.whl", hash = "sha256:57161ab3dbfde4f57adeb12090f39592e992b9c86d206d02f6bd03ebec60f072", size = 4186 }, -] - [[package]] name = "matplotlib" version = "3.9.1.post1" @@ -2614,14 +2982,14 @@ wheels = [ [[package]] name = "mkdocs-git-authors-plugin" -version = "0.7.2" +version = "0.9.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "mkdocs" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/43/ff/a759124da74b0874b8db4988af5f60f594917316ce82baaef36abae94073/mkdocs-git-authors-plugin-0.7.2.tar.gz", hash = "sha256:f541730e4cabdafa0ac758c94d28ba5e8ddca4c859e5de4c89f1226cb6ccd0ad", size = 15785 } +sdist = { url = "https://files.pythonhosted.org/packages/80/ef/09ab7178d580e342cb3ba279c48eaf3abf55795a2ae6e5426fe2c725143c/mkdocs_git_authors_plugin-0.9.2.tar.gz", hash = "sha256:77f97c321e08a8757beb866293eb257070b11cd5a080976bc6696b249cbade4f", size = 21403 } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/55/7c/c4b6d71921dd0cf33f87bfd69d7c72774bf4ece57b6aa23221d1ac31d9fb/mkdocs_git_authors_plugin-0.7.2-py3-none-any.whl", hash = "sha256:c8a2784a867db79ad3b477a96ee96875d17b09192b6d3be71f08df25afff76c4", size = 18860 }, + { url = "https://files.pythonhosted.org/packages/48/08/57d0fea1cc30096fcc94ec9cd4ccdee625be89fd710626f78d90fc13738e/mkdocs_git_authors_plugin-0.9.2-py3-none-any.whl", hash = "sha256:f6cefc4dc832865d26f7f9f944c0a8c7dc852742d79320f3800e0d97814e2a84", size = 20332 }, ] [[package]] @@ -2705,6 +3073,102 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/48/7e/3a64597054a70f7c86eb0a7d4fc315b8c1ab932f64883a297bdffeb5f967/more_itertools-10.5.0-py3-none-any.whl", hash = "sha256:037b0d3203ce90cca8ab1defbbdac29d5f993fc20131f3664dc8d6acfa872aef", size = 60952 }, ] +[[package]] +name = "moviepy" +version = "2.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "decorator" }, + { name = "imageio" }, + { name = "imageio-ffmpeg" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "proglog" }, + { name = "python-dotenv" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1c/98/0ed6566a9d297dc41850d4bd5a2af4619f28abbfd9bdae76f5961410e2e2/moviepy-2.1.1.tar.gz", hash = "sha256:0210336944fcc88a1841fe7ca3701f8f449eead222f9d1665d11d3b8fa753454", size = 58421147 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/7b/edcb99095b403906becbeb61539123e391b22ec418eb0954a34d39a0bc83/moviepy-2.1.1-py3-none-any.whl", hash = "sha256:b236a794c1bc00162613f08b5b5a862531ac39666b6943161c1f46a4dc8d3064", size = 123485 }, +] + +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198 }, +] + +[[package]] +name = "multidict" +version = "6.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/be/504b89a5e9ca731cd47487e91c469064f8ae5af93b7259758dcfc2b9c848/multidict-6.1.0.tar.gz", hash = "sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a", size = 64002 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/29/68/259dee7fd14cf56a17c554125e534f6274c2860159692a414d0b402b9a6d/multidict-6.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3380252550e372e8511d49481bd836264c009adb826b23fefcc5dd3c69692f60", size = 48628 }, + { url = "https://files.pythonhosted.org/packages/50/79/53ba256069fe5386a4a9e80d4e12857ced9de295baf3e20c68cdda746e04/multidict-6.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:99f826cbf970077383d7de805c0681799491cb939c25450b9b5b3ced03ca99f1", size = 29327 }, + { url = "https://files.pythonhosted.org/packages/ff/10/71f1379b05b196dae749b5ac062e87273e3f11634f447ebac12a571d90ae/multidict-6.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a114d03b938376557927ab23f1e950827c3b893ccb94b62fd95d430fd0e5cf53", size = 29689 }, + { url = 
"https://files.pythonhosted.org/packages/71/45/70bac4f87438ded36ad4793793c0095de6572d433d98575a5752629ef549/multidict-6.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1c416351ee6271b2f49b56ad7f308072f6f44b37118d69c2cad94f3fa8a40d5", size = 126639 }, + { url = "https://files.pythonhosted.org/packages/80/cf/17f35b3b9509b4959303c05379c4bfb0d7dd05c3306039fc79cf035bbac0/multidict-6.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b5d83030255983181005e6cfbac1617ce9746b219bc2aad52201ad121226581", size = 134315 }, + { url = "https://files.pythonhosted.org/packages/ef/1f/652d70ab5effb33c031510a3503d4d6efc5ec93153562f1ee0acdc895a57/multidict-6.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3e97b5e938051226dc025ec80980c285b053ffb1e25a3db2a3aa3bc046bf7f56", size = 129471 }, + { url = "https://files.pythonhosted.org/packages/a6/64/2dd6c4c681688c0165dea3975a6a4eab4944ea30f35000f8b8af1df3148c/multidict-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d618649d4e70ac6efcbba75be98b26ef5078faad23592f9b51ca492953012429", size = 124585 }, + { url = "https://files.pythonhosted.org/packages/87/56/e6ee5459894c7e554b57ba88f7257dc3c3d2d379cb15baaa1e265b8c6165/multidict-6.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10524ebd769727ac77ef2278390fb0068d83f3acb7773792a5080f2b0abf7748", size = 116957 }, + { url = "https://files.pythonhosted.org/packages/36/9e/616ce5e8d375c24b84f14fc263c7ef1d8d5e8ef529dbc0f1df8ce71bb5b8/multidict-6.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ff3827aef427c89a25cc96ded1759271a93603aba9fb977a6d264648ebf989db", size = 128609 }, + { url = "https://files.pythonhosted.org/packages/8c/4f/4783e48a38495d000f2124020dc96bacc806a4340345211b1ab6175a6cb4/multidict-6.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:06809f4f0f7ab7ea2cabf9caca7d79c22c0758b58a71f9d32943ae13c7ace056", size = 123016 }, + { url = "https://files.pythonhosted.org/packages/3e/b3/4950551ab8fc39862ba5e9907dc821f896aa829b4524b4deefd3e12945ab/multidict-6.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:f179dee3b863ab1c59580ff60f9d99f632f34ccb38bf67a33ec6b3ecadd0fd76", size = 133542 }, + { url = "https://files.pythonhosted.org/packages/96/4d/f0ce6ac9914168a2a71df117935bb1f1781916acdecbb43285e225b484b8/multidict-6.1.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:aaed8b0562be4a0876ee3b6946f6869b7bcdb571a5d1496683505944e268b160", size = 130163 }, + { url = "https://files.pythonhosted.org/packages/be/72/17c9f67e7542a49dd252c5ae50248607dfb780bcc03035907dafefb067e3/multidict-6.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3c8b88a2ccf5493b6c8da9076fb151ba106960a2df90c2633f342f120751a9e7", size = 126832 }, + { url = "https://files.pythonhosted.org/packages/71/9f/72d719e248cbd755c8736c6d14780533a1606ffb3fbb0fbd77da9f0372da/multidict-6.1.0-cp310-cp310-win32.whl", hash = "sha256:4a9cb68166a34117d6646c0023c7b759bf197bee5ad4272f420a0141d7eb03a0", size = 26402 }, + { url = "https://files.pythonhosted.org/packages/04/5a/d88cd5d00a184e1ddffc82aa2e6e915164a6d2641ed3606e766b5d2f275a/multidict-6.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:20b9b5fbe0b88d0bdef2012ef7dee867f874b72528cf1d08f1d59b0e3850129d", size = 28800 }, + { url = "https://files.pythonhosted.org/packages/93/13/df3505a46d0cd08428e4c8169a196131d1b0c4b515c3649829258843dde6/multidict-6.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:3efe2c2cb5763f2f1b275ad2bf7a287d3f7ebbef35648a9726e3b69284a4f3d6", size = 48570 }, + { url = "https://files.pythonhosted.org/packages/f0/e1/a215908bfae1343cdb72f805366592bdd60487b4232d039c437fe8f5013d/multidict-6.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c7053d3b0353a8b9de430a4f4b4268ac9a4fb3481af37dfe49825bf45ca24156", size = 29316 }, + { url = "https://files.pythonhosted.org/packages/70/0f/6dc70ddf5d442702ed74f298d69977f904960b82368532c88e854b79f72b/multidict-6.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:27e5fc84ccef8dfaabb09d82b7d179c7cf1a3fbc8a966f8274fcb4ab2eb4cadb", size = 29640 }, + { url = "https://files.pythonhosted.org/packages/d8/6d/9c87b73a13d1cdea30b321ef4b3824449866bd7f7127eceed066ccb9b9ff/multidict-6.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e2b90b43e696f25c62656389d32236e049568b39320e2735d51f08fd362761b", size = 131067 }, + { url = "https://files.pythonhosted.org/packages/cc/1e/1b34154fef373371fd6c65125b3d42ff5f56c7ccc6bfff91b9b3c60ae9e0/multidict-6.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d83a047959d38a7ff552ff94be767b7fd79b831ad1cd9920662db05fec24fe72", size = 138507 }, + { url = "https://files.pythonhosted.org/packages/fb/e0/0bc6b2bac6e461822b5f575eae85da6aae76d0e2a79b6665d6206b8e2e48/multidict-6.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1a9dd711d0877a1ece3d2e4fea11a8e75741ca21954c919406b44e7cf971304", size = 133905 }, + { url = "https://files.pythonhosted.org/packages/ba/af/73d13b918071ff9b2205fcf773d316e0f8fefb4ec65354bbcf0b10908cc6/multidict-6.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec2abea24d98246b94913b76a125e855eb5c434f7c46546046372fe60f666351", size = 129004 }, + { url = "https://files.pythonhosted.org/packages/74/21/23960627b00ed39643302d81bcda44c9444ebcdc04ee5bedd0757513f259/multidict-6.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4867cafcbc6585e4b678876c489b9273b13e9fff9f6d6d66add5e15d11d926cb", size = 121308 }, + { url = "https://files.pythonhosted.org/packages/8b/5c/cf282263ffce4a596ed0bb2aa1a1dddfe1996d6a62d08842a8d4b33dca13/multidict-6.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5b48204e8d955c47c55b72779802b219a39acc3ee3d0116d5080c388970b76e3", size = 132608 }, + { url = "https://files.pythonhosted.org/packages/d7/3e/97e778c041c72063f42b290888daff008d3ab1427f5b09b714f5a8eff294/multidict-6.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:d8fff389528cad1618fb4b26b95550327495462cd745d879a8c7c2115248e399", size = 127029 }, + { url = "https://files.pythonhosted.org/packages/47/ac/3efb7bfe2f3aefcf8d103e9a7162572f01936155ab2f7ebcc7c255a23212/multidict-6.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a7a9541cd308eed5e30318430a9c74d2132e9a8cb46b901326272d780bf2d423", size = 137594 }, + { url = "https://files.pythonhosted.org/packages/42/9b/6c6e9e8dc4f915fc90a9b7798c44a30773dea2995fdcb619870e705afe2b/multidict-6.1.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:da1758c76f50c39a2efd5e9859ce7d776317eb1dd34317c8152ac9251fc574a3", size = 134556 }, + { url = "https://files.pythonhosted.org/packages/1d/10/8e881743b26aaf718379a14ac58572a240e8293a1c9d68e1418fb11c0f90/multidict-6.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c943a53e9186688b45b323602298ab727d8865d8c9ee0b17f8d62d14b56f0753", size = 130993 }, + { url = 
"https://files.pythonhosted.org/packages/45/84/3eb91b4b557442802d058a7579e864b329968c8d0ea57d907e7023c677f2/multidict-6.1.0-cp311-cp311-win32.whl", hash = "sha256:90f8717cb649eea3504091e640a1b8568faad18bd4b9fcd692853a04475a4b80", size = 26405 }, + { url = "https://files.pythonhosted.org/packages/9f/0b/ad879847ecbf6d27e90a6eabb7eff6b62c129eefe617ea45eae7c1f0aead/multidict-6.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:82176036e65644a6cc5bd619f65f6f19781e8ec2e5330f51aa9ada7504cc1926", size = 28795 }, + { url = "https://files.pythonhosted.org/packages/fd/16/92057c74ba3b96d5e211b553895cd6dc7cc4d1e43d9ab8fafc727681ef71/multidict-6.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b04772ed465fa3cc947db808fa306d79b43e896beb677a56fb2347ca1a49c1fa", size = 48713 }, + { url = "https://files.pythonhosted.org/packages/94/3d/37d1b8893ae79716179540b89fc6a0ee56b4a65fcc0d63535c6f5d96f217/multidict-6.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6180c0ae073bddeb5a97a38c03f30c233e0a4d39cd86166251617d1bbd0af436", size = 29516 }, + { url = "https://files.pythonhosted.org/packages/a2/12/adb6b3200c363062f805275b4c1e656be2b3681aada66c80129932ff0bae/multidict-6.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:071120490b47aa997cca00666923a83f02c7fbb44f71cf7f136df753f7fa8761", size = 29557 }, + { url = "https://files.pythonhosted.org/packages/47/e9/604bb05e6e5bce1e6a5cf80a474e0f072e80d8ac105f1b994a53e0b28c42/multidict-6.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50b3a2710631848991d0bf7de077502e8994c804bb805aeb2925a981de58ec2e", size = 130170 }, + { url = "https://files.pythonhosted.org/packages/7e/13/9efa50801785eccbf7086b3c83b71a4fb501a4d43549c2f2f80b8787d69f/multidict-6.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b58c621844d55e71c1b7f7c498ce5aa6985d743a1a59034c57a905b3f153c1ef", size = 134836 }, + { url = "https://files.pythonhosted.org/packages/bf/0f/93808b765192780d117814a6dfcc2e75de6dcc610009ad408b8814dca3ba/multidict-6.1.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55b6d90641869892caa9ca42ff913f7ff1c5ece06474fbd32fb2cf6834726c95", size = 133475 }, + { url = "https://files.pythonhosted.org/packages/d3/c8/529101d7176fe7dfe1d99604e48d69c5dfdcadb4f06561f465c8ef12b4df/multidict-6.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b820514bfc0b98a30e3d85462084779900347e4d49267f747ff54060cc33925", size = 131049 }, + { url = "https://files.pythonhosted.org/packages/ca/0c/fc85b439014d5a58063e19c3a158a889deec399d47b5269a0f3b6a2e28bc/multidict-6.1.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10a9b09aba0c5b48c53761b7c720aaaf7cf236d5fe394cd399c7ba662d5f9966", size = 120370 }, + { url = "https://files.pythonhosted.org/packages/db/46/d4416eb20176492d2258fbd47b4abe729ff3b6e9c829ea4236f93c865089/multidict-6.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1e16bf3e5fc9f44632affb159d30a437bfe286ce9e02754759be5536b169b305", size = 125178 }, + { url = "https://files.pythonhosted.org/packages/5b/46/73697ad7ec521df7de5531a32780bbfd908ded0643cbe457f981a701457c/multidict-6.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:76f364861c3bfc98cbbcbd402d83454ed9e01a5224bb3a28bf70002a230f73e2", size = 119567 }, + { url = "https://files.pythonhosted.org/packages/cd/ed/51f060e2cb0e7635329fa6ff930aa5cffa17f4c7f5c6c3ddc3500708e2f2/multidict-6.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = 
"sha256:820c661588bd01a0aa62a1283f20d2be4281b086f80dad9e955e690c75fb54a2", size = 129822 }, + { url = "https://files.pythonhosted.org/packages/df/9e/ee7d1954b1331da3eddea0c4e08d9142da5f14b1321c7301f5014f49d492/multidict-6.1.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:0e5f362e895bc5b9e67fe6e4ded2492d8124bdf817827f33c5b46c2fe3ffaca6", size = 128656 }, + { url = "https://files.pythonhosted.org/packages/77/00/8538f11e3356b5d95fa4b024aa566cde7a38aa7a5f08f4912b32a037c5dc/multidict-6.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ec660d19bbc671e3a6443325f07263be452c453ac9e512f5eb935e7d4ac28b3", size = 125360 }, + { url = "https://files.pythonhosted.org/packages/be/05/5d334c1f2462d43fec2363cd00b1c44c93a78c3925d952e9a71caf662e96/multidict-6.1.0-cp312-cp312-win32.whl", hash = "sha256:58130ecf8f7b8112cdb841486404f1282b9c86ccb30d3519faf301b2e5659133", size = 26382 }, + { url = "https://files.pythonhosted.org/packages/a3/bf/f332a13486b1ed0496d624bcc7e8357bb8053823e8cd4b9a18edc1d97e73/multidict-6.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:188215fc0aafb8e03341995e7c4797860181562380f81ed0a87ff455b70bf1f1", size = 28529 }, + { url = "https://files.pythonhosted.org/packages/22/67/1c7c0f39fe069aa4e5d794f323be24bf4d33d62d2a348acdb7991f8f30db/multidict-6.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d569388c381b24671589335a3be6e1d45546c2988c2ebe30fdcada8457a31008", size = 48771 }, + { url = "https://files.pythonhosted.org/packages/3c/25/c186ee7b212bdf0df2519eacfb1981a017bda34392c67542c274651daf23/multidict-6.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:052e10d2d37810b99cc170b785945421141bf7bb7d2f8799d431e7db229c385f", size = 29533 }, + { url = "https://files.pythonhosted.org/packages/67/5e/04575fd837e0958e324ca035b339cea174554f6f641d3fb2b4f2e7ff44a2/multidict-6.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f90c822a402cb865e396a504f9fc8173ef34212a342d92e362ca498cad308e28", size = 29595 }, + { url = "https://files.pythonhosted.org/packages/d3/b2/e56388f86663810c07cfe4a3c3d87227f3811eeb2d08450b9e5d19d78876/multidict-6.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b225d95519a5bf73860323e633a664b0d85ad3d5bede6d30d95b35d4dfe8805b", size = 130094 }, + { url = "https://files.pythonhosted.org/packages/6c/ee/30ae9b4186a644d284543d55d491fbd4239b015d36b23fea43b4c94f7052/multidict-6.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:23bfd518810af7de1116313ebd9092cb9aa629beb12f6ed631ad53356ed6b86c", size = 134876 }, + { url = "https://files.pythonhosted.org/packages/84/c7/70461c13ba8ce3c779503c70ec9d0345ae84de04521c1f45a04d5f48943d/multidict-6.1.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c09fcfdccdd0b57867577b719c69e347a436b86cd83747f179dbf0cc0d4c1f3", size = 133500 }, + { url = "https://files.pythonhosted.org/packages/4a/9f/002af221253f10f99959561123fae676148dd730e2daa2cd053846a58507/multidict-6.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf6bea52ec97e95560af5ae576bdac3aa3aae0b6758c6efa115236d9e07dae44", size = 131099 }, + { url = "https://files.pythonhosted.org/packages/82/42/d1c7a7301d52af79d88548a97e297f9d99c961ad76bbe6f67442bb77f097/multidict-6.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57feec87371dbb3520da6192213c7d6fc892d5589a93db548331954de8248fd2", size = 120403 }, + { url = 
"https://files.pythonhosted.org/packages/68/f3/471985c2c7ac707547553e8f37cff5158030d36bdec4414cb825fbaa5327/multidict-6.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0c3f390dc53279cbc8ba976e5f8035eab997829066756d811616b652b00a23a3", size = 125348 }, + { url = "https://files.pythonhosted.org/packages/67/2c/e6df05c77e0e433c214ec1d21ddd203d9a4770a1f2866a8ca40a545869a0/multidict-6.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:59bfeae4b25ec05b34f1956eaa1cb38032282cd4dfabc5056d0a1ec4d696d3aa", size = 119673 }, + { url = "https://files.pythonhosted.org/packages/c5/cd/bc8608fff06239c9fb333f9db7743a1b2eafe98c2666c9a196e867a3a0a4/multidict-6.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b2f59caeaf7632cc633b5cf6fc449372b83bbdf0da4ae04d5be36118e46cc0aa", size = 129927 }, + { url = "https://files.pythonhosted.org/packages/44/8e/281b69b7bc84fc963a44dc6e0bbcc7150e517b91df368a27834299a526ac/multidict-6.1.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:37bb93b2178e02b7b618893990941900fd25b6b9ac0fa49931a40aecdf083fe4", size = 128711 }, + { url = "https://files.pythonhosted.org/packages/12/a4/63e7cd38ed29dd9f1881d5119f272c898ca92536cdb53ffe0843197f6c85/multidict-6.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4e9f48f58c2c523d5a06faea47866cd35b32655c46b443f163d08c6d0ddb17d6", size = 125519 }, + { url = "https://files.pythonhosted.org/packages/38/e0/4f5855037a72cd8a7a2f60a3952d9aa45feedb37ae7831642102604e8a37/multidict-6.1.0-cp313-cp313-win32.whl", hash = "sha256:3a37ffb35399029b45c6cc33640a92bef403c9fd388acce75cdc88f58bd19a81", size = 26426 }, + { url = "https://files.pythonhosted.org/packages/7e/a5/17ee3a4db1e310b7405f5d25834460073a8ccd86198ce044dfaf69eac073/multidict-6.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:e9aa71e15d9d9beaad2c6b9319edcdc0a49a43ef5c0a4c8265ca9ee7d6c67774", size = 28531 }, + { url = "https://files.pythonhosted.org/packages/99/b7/b9e70fde2c0f0c9af4cc5277782a89b66d35948ea3369ec9f598358c3ac5/multidict-6.1.0-py3-none-any.whl", hash = "sha256:48e171e52d1c4d33888e529b999e5900356b9ae588c2f09a52dcefb158b27506", size = 10051 }, +] + [[package]] name = "multiurl" version = "0.3.1" @@ -2717,12 +3181,21 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/cc/12/4e979f71d90ca5625647f93cd484c733a7e8ae4fd9f6d15369613d727301/multiurl-0.3.1.tar.gz", hash = "sha256:c7001437b59d56d4c310d725c3dcfff98c97c4b652893d88989853827465d442", size = 18161 } +[[package]] +name = "multivolumefile" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/50/f0/a7786212b5a4cb9ba05ae84a2bbd11d1d0279523aea0424b6d981d652a14/multivolumefile-0.2.3.tar.gz", hash = "sha256:a0648d0aafbc96e59198d5c17e9acad7eb531abea51035d08ce8060dcad709d6", size = 77984 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/31/ec5f46fd4c83185b806aa9c736e228cb780f13990a9cf4da0beb70025fcc/multivolumefile-0.2.3-py3-none-any.whl", hash = "sha256:237f4353b60af1703087cf7725755a1f6fcaeeea48421e1896940cd1c920d678", size = 17037 }, +] + [[package]] name = "mypy-boto3-s3" version = "1.35.16" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions" }, + { name = "typing-extensions", marker = "python_full_version < '3.12'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/bc/79/9237eadd3dca9ffeccbd9bf0a3dba780fd6efe5be853fc820e5ebf43b759/mypy_boto3_s3-1.35.16.tar.gz", hash = 
"sha256:599567e327eaabe4cdd0c226c07cac850431d048166aba49c2a162031ec48934", size = 71757 } wheels = [ @@ -2890,6 +3363,114 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/16/2e/86f24451c2d530c88daf997cb8d6ac622c1d40d19f5a031ed68a4b73a374/numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818", size = 15517754 }, ] +[[package]] +name = "nvidia-cublas-cu12" +version = "12.1.3.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/37/6d/121efd7382d5b0284239f4ab1fc1590d86d34ed4a4a2fdb13b30ca8e5740/nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728", size = 410594774 }, +] + +[[package]] +name = "nvidia-cuda-cupti-cu12" +version = "12.1.105" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/00/6b218edd739ecfc60524e585ba8e6b00554dd908de2c9c66c1af3e44e18d/nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:e54fde3983165c624cb79254ae9818a456eb6e87a7fd4d56a2352c24ee542d7e", size = 14109015 }, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu12" +version = "12.1.105" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b6/9f/c64c03f49d6fbc56196664d05dba14e3a561038a81a638eeb47f4d4cfd48/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:339b385f50c309763ca65456ec75e17bbefcbbf2893f462cb8b90584cd27a1c2", size = 23671734 }, +] + +[[package]] +name = "nvidia-cuda-runtime-cu12" +version = "12.1.105" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/d5/c68b1d2cdfcc59e72e8a5949a37ddb22ae6cade80cd4a57a84d4c8b55472/nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:6e258468ddf5796e25f1dc591a31029fa317d97a0a94ed93468fc86301d61e40", size = 823596 }, +] + +[[package]] +name = "nvidia-cudnn-cu12" +version = "8.9.2.26" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/74/a2e2be7fb83aaedec84f391f082cf765dfb635e7caa9b49065f73e4835d8/nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl", hash = "sha256:5ccb288774fdfb07a7e7025ffec286971c06d8d7b4fb162525334616d7629ff9", size = 731725872 }, +] + +[[package]] +name = "nvidia-cufft-cu12" +version = "11.0.2.54" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/86/94/eb540db023ce1d162e7bea9f8f5aa781d57c65aed513c33ee9a5123ead4d/nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl", hash = "sha256:794e3948a1aa71fd817c3775866943936774d1c14e7628c74f6f7417224cdf56", size = 121635161 }, +] + +[[package]] +name = "nvidia-curand-cu12" +version = "10.3.2.106" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/44/31/4890b1c9abc496303412947fc7dcea3d14861720642b49e8ceed89636705/nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:9d264c5036dde4e64f1de8c50ae753237c12e0b1348738169cd0f8a536c0e1e0", size = 56467784 }, +] + +[[package]] +name = "nvidia-cusolver-cu12" +version = "11.4.5.107" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12" }, + { name = 
"nvidia-cusparse-cu12" }, + { name = "nvidia-nvjitlink-cu12" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/1d/8de1e5c67099015c834315e333911273a8c6aaba78923dd1d1e25fc5f217/nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl", hash = "sha256:8a7ec542f0412294b15072fa7dab71d31334014a69f953004ea7a118206fe0dd", size = 124161928 }, +] + +[[package]] +name = "nvidia-cusparse-cu12" +version = "12.1.0.106" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/65/5b/cfaeebf25cd9fdec14338ccb16f6b2c4c7fa9163aefcf057d86b9cc248bb/nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:f3b50f42cf363f86ab21f720998517a659a48131e8d538dc02f8768237bd884c", size = 195958278 }, +] + +[[package]] +name = "nvidia-nccl-cu12" +version = "2.19.3" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/00/d0d4e48aef772ad5aebcf70b73028f88db6e5640b36c38e90445b7a57c45/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl", hash = "sha256:a9734707a2c96443331c1e48c717024aa6678a0e2a4cb66b2c364d18cee6b48d", size = 165987969 }, +] + +[[package]] +name = "nvidia-nvjitlink-cu12" +version = "12.4.127" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/45/239d52c05074898a80a900f49b1615d81c07fceadd5ad6c4f86a987c0bc4/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83", size = 20552510 }, + { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810 }, +] + +[[package]] +name = "nvidia-nvtx-cu12" +version = "12.1.105" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/d3/8057f0587683ed2fcd4dbfbdfdfa807b9160b809976099d36b8f60d08f03/nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:dc21cf308ca5691e7c04d962e213f8a4aa9bbfa23d95412f452254c2caeb09e5", size = 99138 }, +] + [[package]] name = "oauth2client" version = "4.1.3" @@ -3014,13 +3595,13 @@ dependencies = [ [package.metadata] requires-dist = [ { name = "boto3", specifier = ">=1.21.13" }, - { name = "dataclasses-json", specifier = "==0.5.8" }, + { name = "dataclasses-json", specifier = ">=0.6.7" }, { name = "dynamic-yaml", specifier = ">=1.3.5" }, { name = "ipdb", specifier = ">=0.13.9" }, { name = "jsonschema", specifier = ">=3.2.0" }, { name = "mistune", specifier = ">=3.0.1" }, { name = "owid-datautils", editable = "lib/datautils" }, - { name = "owid-repack", specifier = ">=0.1.1" }, + { name = "owid-repack", editable = "lib/repack" }, { name = "pandas", specifier = ">=2.2.1" }, { name = "pyarrow", specifier = ">=10.0.1" }, { name = "pyyaml", specifier = ">=6.0.1" }, @@ -3052,6 +3633,7 @@ dependencies = [ { name = "gdown" }, { name = "gsheets" }, { name = "pandas" }, + { name = "py7zr" }, { name = "pyarrow" }, { name = "pydrive2" }, { name = "structlog" }, @@ -3066,6 +3648,7 @@ requires-dist = [ { name = "gdown", specifier = ">=4.5.2" }, { name = "gsheets", specifier = ">=0.6.1" }, { name = "pandas", specifier = ">=2.2.1" }, + { name = "py7zr", specifier = ">=0.22.0" }, { name = 
"pyarrow", specifier = ">=10.0.1" }, { name = "pydrive2", specifier = ">=1.15.0" }, { name = "structlog", specifier = ">=21.5.0" }, @@ -3091,21 +3674,24 @@ dev = [ [[package]] name = "owid-repack" -version = "0.1.3" +version = "0.1.4" source = { editable = "lib/repack" } dependencies = [ { name = "numpy" }, { name = "pandas" }, + { name = "pyarrow" }, ] [package.metadata] requires-dist = [ { name = "numpy", specifier = ">=1.24.0" }, - { name = "pandas", specifier = ">=2.2.1" }, + { name = "pandas", specifier = ">=2.2.3" }, + { name = "pyarrow", specifier = ">=10.0.1,<18.0.0" }, ] [package.metadata.requires-dev] dev = [ + { name = "ipdb", specifier = ">=0.13.13" }, { name = "pyright", specifier = "==1.1.373" }, { name = "pytest", specifier = ">=7.2.0" }, { name = "ruff", specifier = "==0.1.6" }, @@ -3131,7 +3717,7 @@ wheels = [ [[package]] name = "pandas" -version = "2.2.2" +version = "2.2.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy" }, @@ -3139,29 +3725,42 @@ dependencies = [ { name = "pytz" }, { name = "tzdata" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/88/d9/ecf715f34c73ccb1d8ceb82fc01cd1028a65a5f6dbc57bfa6ea155119058/pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54", size = 4398391 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/2d/39600d073ea70b9cafdc51fab91d69c72b49dd92810f24cb5ac6631f387f/pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce", size = 12551798 }, - { url = "https://files.pythonhosted.org/packages/fd/4b/0cd38e68ab690b9df8ef90cba625bf3f93b82d1c719703b8e1b333b2c72d/pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238", size = 11287392 }, - { url = "https://files.pythonhosted.org/packages/01/c6/d3d2612aea9b9f28e79a30b864835dad8f542dcf474eee09afeee5d15d75/pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08", size = 15634823 }, - { url = "https://files.pythonhosted.org/packages/89/1b/12521efcbc6058e2673583bb096c2b5046a9df39bd73eca392c1efed24e5/pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0", size = 13032214 }, - { url = "https://files.pythonhosted.org/packages/e4/d7/303dba73f1c3a9ef067d23e5afbb6175aa25e8121be79be354dcc740921a/pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51", size = 16278302 }, - { url = "https://files.pythonhosted.org/packages/ba/df/8ff7c5ed1cc4da8c6ab674dc8e4860a4310c3880df1283e01bac27a4333d/pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99", size = 13892866 }, - { url = "https://files.pythonhosted.org/packages/69/a6/81d5dc9a612cf0c1810c2ebc4f2afddb900382276522b18d128213faeae3/pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772", size = 11621592 }, - { url = "https://files.pythonhosted.org/packages/1b/70/61704497903d43043e288017cb2b82155c0d41e15f5c17807920877b45c2/pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288", size = 12574808 }, - { url = 
"https://files.pythonhosted.org/packages/16/c6/75231fd47afd6b3f89011e7077f1a3958441264aca7ae9ff596e3276a5d0/pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151", size = 11304876 }, - { url = "https://files.pythonhosted.org/packages/97/2d/7b54f80b93379ff94afb3bd9b0cd1d17b48183a0d6f98045bc01ce1e06a7/pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b", size = 15602548 }, - { url = "https://files.pythonhosted.org/packages/fc/a5/4d82be566f069d7a9a702dcdf6f9106df0e0b042e738043c0cc7ddd7e3f6/pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee", size = 13031332 }, - { url = "https://files.pythonhosted.org/packages/92/a2/b79c48f530673567805e607712b29814b47dcaf0d167e87145eb4b0118c6/pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db", size = 16286054 }, - { url = "https://files.pythonhosted.org/packages/40/c7/47e94907f1d8fdb4868d61bd6c93d57b3784a964d52691b77ebfdb062842/pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1", size = 13879507 }, - { url = "https://files.pythonhosted.org/packages/ab/63/966db1321a0ad55df1d1fe51505d2cdae191b84c907974873817b0a6e849/pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24", size = 11634249 }, - { url = "https://files.pythonhosted.org/packages/dd/49/de869130028fb8d90e25da3b7d8fb13e40f5afa4c4af1781583eb1ff3839/pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef", size = 12500886 }, - { url = "https://files.pythonhosted.org/packages/db/7c/9a60add21b96140e22465d9adf09832feade45235cd22f4cb1668a25e443/pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce", size = 11340320 }, - { url = "https://files.pythonhosted.org/packages/b0/85/f95b5f322e1ae13b7ed7e97bd999160fa003424711ab4dc8344b8772c270/pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad", size = 15204346 }, - { url = "https://files.pythonhosted.org/packages/40/10/79e52ef01dfeb1c1ca47a109a01a248754ebe990e159a844ece12914de83/pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad", size = 12733396 }, - { url = "https://files.pythonhosted.org/packages/35/9d/208febf8c4eb5c1d9ea3314d52d8bd415fd0ef0dd66bb24cc5bdbc8fa71a/pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76", size = 15858913 }, - { url = "https://files.pythonhosted.org/packages/99/d1/2d9bd05def7a9e08a92ec929b5a4c8d5556ec76fae22b0fa486cbf33ea63/pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32", size = 13417786 }, - { url = "https://files.pythonhosted.org/packages/22/a5/a0b255295406ed54269814bc93723cfd1a0da63fb9aaf99e1364f07923e5/pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = 
"sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23", size = 11498828 }, +sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/70/c853aec59839bceed032d52010ff5f1b8d87dc3114b762e4ba2727661a3b/pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5", size = 12580827 }, + { url = "https://files.pythonhosted.org/packages/99/f2/c4527768739ffa4469b2b4fff05aa3768a478aed89a2f271a79a40eee984/pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348", size = 11303897 }, + { url = "https://files.pythonhosted.org/packages/ed/12/86c1747ea27989d7a4064f806ce2bae2c6d575b950be087837bdfcabacc9/pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed", size = 66480908 }, + { url = "https://files.pythonhosted.org/packages/44/50/7db2cd5e6373ae796f0ddad3675268c8d59fb6076e66f0c339d61cea886b/pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57", size = 13064210 }, + { url = "https://files.pythonhosted.org/packages/61/61/a89015a6d5536cb0d6c3ba02cebed51a95538cf83472975275e28ebf7d0c/pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42", size = 16754292 }, + { url = "https://files.pythonhosted.org/packages/ce/0d/4cc7b69ce37fac07645a94e1d4b0880b15999494372c1523508511b09e40/pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f", size = 14416379 }, + { url = "https://files.pythonhosted.org/packages/31/9e/6ebb433de864a6cd45716af52a4d7a8c3c9aaf3a98368e61db9e69e69a9c/pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645", size = 11598471 }, + { url = "https://files.pythonhosted.org/packages/a8/44/d9502bf0ed197ba9bf1103c9867d5904ddcaf869e52329787fc54ed70cc8/pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039", size = 12602222 }, + { url = "https://files.pythonhosted.org/packages/52/11/9eac327a38834f162b8250aab32a6781339c69afe7574368fffe46387edf/pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd", size = 11321274 }, + { url = "https://files.pythonhosted.org/packages/45/fb/c4beeb084718598ba19aa9f5abbc8aed8b42f90930da861fcb1acdb54c3a/pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698", size = 15579836 }, + { url = "https://files.pythonhosted.org/packages/cd/5f/4dba1d39bb9c38d574a9a22548c540177f78ea47b32f99c0ff2ec499fac5/pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc", size = 13058505 }, + { url = 
"https://files.pythonhosted.org/packages/b9/57/708135b90391995361636634df1f1130d03ba456e95bcf576fada459115a/pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3", size = 16744420 }, + { url = "https://files.pythonhosted.org/packages/86/4a/03ed6b7ee323cf30404265c284cee9c65c56a212e0a08d9ee06984ba2240/pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32", size = 14440457 }, + { url = "https://files.pythonhosted.org/packages/ed/8c/87ddf1fcb55d11f9f847e3c69bb1c6f8e46e2f40ab1a2d2abadb2401b007/pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5", size = 11617166 }, + { url = "https://files.pythonhosted.org/packages/17/a3/fb2734118db0af37ea7433f57f722c0a56687e14b14690edff0cdb4b7e58/pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9", size = 12529893 }, + { url = "https://files.pythonhosted.org/packages/e1/0c/ad295fd74bfac85358fd579e271cded3ac969de81f62dd0142c426b9da91/pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4", size = 11363475 }, + { url = "https://files.pythonhosted.org/packages/c6/2a/4bba3f03f7d07207481fed47f5b35f556c7441acddc368ec43d6643c5777/pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3", size = 15188645 }, + { url = "https://files.pythonhosted.org/packages/38/f8/d8fddee9ed0d0c0f4a2132c1dfcf0e3e53265055da8df952a53e7eaf178c/pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319", size = 12739445 }, + { url = "https://files.pythonhosted.org/packages/20/e8/45a05d9c39d2cea61ab175dbe6a2de1d05b679e8de2011da4ee190d7e748/pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8", size = 16359235 }, + { url = "https://files.pythonhosted.org/packages/1d/99/617d07a6a5e429ff90c90da64d428516605a1ec7d7bea494235e1c3882de/pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a", size = 14056756 }, + { url = "https://files.pythonhosted.org/packages/29/d4/1244ab8edf173a10fd601f7e13b9566c1b525c4f365d6bee918e68381889/pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13", size = 11504248 }, + { url = "https://files.pythonhosted.org/packages/64/22/3b8f4e0ed70644e85cfdcd57454686b9057c6c38d2f74fe4b8bc2527214a/pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015", size = 12477643 }, + { url = "https://files.pythonhosted.org/packages/e4/93/b3f5d1838500e22c8d793625da672f3eec046b1a99257666c94446969282/pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28", size = 11281573 }, + { url = "https://files.pythonhosted.org/packages/f5/94/6c79b07f0e5aab1dcfa35a75f4817f5c4f677931d4234afcd75f0e6a66ca/pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0", size = 15196085 }, 
+ { url = "https://files.pythonhosted.org/packages/e8/31/aa8da88ca0eadbabd0a639788a6da13bb2ff6edbbb9f29aa786450a30a91/pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24", size = 12711809 }, + { url = "https://files.pythonhosted.org/packages/ee/7c/c6dbdb0cb2a4344cacfb8de1c5808ca885b2e4dcfde8008266608f9372af/pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659", size = 16356316 }, + { url = "https://files.pythonhosted.org/packages/57/b7/8b757e7d92023b832869fa8881a992696a0bfe2e26f72c9ae9f255988d42/pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb", size = 14022055 }, + { url = "https://files.pythonhosted.org/packages/3b/bc/4b18e2b8c002572c5a441a64826252ce5da2aa738855747247a971988043/pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d", size = 11481175 }, + { url = "https://files.pythonhosted.org/packages/76/a3/a5d88146815e972d40d19247b2c162e88213ef51c7c25993942c39dbf41d/pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468", size = 12615650 }, + { url = "https://files.pythonhosted.org/packages/9c/8c/f0fd18f6140ddafc0c24122c8a964e48294acc579d47def376fef12bcb4a/pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18", size = 11290177 }, + { url = "https://files.pythonhosted.org/packages/ed/f9/e995754eab9c0f14c6777401f7eece0943840b7a9fc932221c19d1abee9f/pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2", size = 14651526 }, + { url = "https://files.pythonhosted.org/packages/25/b0/98d6ae2e1abac4f35230aa756005e8654649d305df9a28b16b9ae4353bff/pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4", size = 11871013 }, + { url = "https://files.pythonhosted.org/packages/cc/57/0f72a10f9db6a4628744c8e8f0df4e6e21de01212c7c981d31e50ffc8328/pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d", size = 15711620 }, + { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436 }, ] [[package]] @@ -3187,6 +3786,7 @@ name = "papermill" version = "2.6.0" source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "aiohttp", marker = "python_full_version >= '3.12'" }, { name = "ansicolors" }, { name = "click" }, { name = "entrypoints" }, @@ -3377,6 +3977,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, ] +[[package]] +name = "proglog" +version = "0.1.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "tqdm" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/bd/b9/1cfff8f6a797ea99bf7ffc8c5fd811c135837948fd90f3ea86fd5166ee4f/proglog-0.1.10.tar.gz", hash = "sha256:658c28c9c82e4caeb2f25f488fff9ceace22f8d69b15d0c1c86d64275e4ddab4", size = 11088 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8b/f5/cab5cf6a540c31f5099043de0ae43990fd9cf66f75ecb5e9f254a4e4d4ee/proglog-0.1.10-py3-none-any.whl", hash = "sha256:19d5da037e8c813da480b741e3fa71fb1ac0a5b02bf21c41577c7f327485ec50", size = 6114 }, +] + [[package]] name = "prometheus-client" version = "0.20.0" @@ -3398,6 +4010,79 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/eb/37/791f1a6edd13c61cac85282368aa68cb0f3f164440fdf60032f2cc6ca34e/prompt_toolkit-3.0.36-py3-none-any.whl", hash = "sha256:aa64ad242a462c5ff0363a7b9cfe696c20d55d9fc60c11fd8e632d064804d305", size = 386414 }, ] +[[package]] +name = "propcache" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a9/4d/5e5a60b78dbc1d464f8a7bbaeb30957257afdc8512cbb9dfd5659304f5cd/propcache-0.2.0.tar.gz", hash = "sha256:df81779732feb9d01e5d513fad0122efb3d53bbc75f61b2a4f29a020bc985e70", size = 40951 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/08/1963dfb932b8d74d5b09098507b37e9b96c835ba89ab8aad35aa330f4ff3/propcache-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5869b8fd70b81835a6f187c5fdbe67917a04d7e52b6e7cc4e5fe39d55c39d58", size = 80712 }, + { url = "https://files.pythonhosted.org/packages/e6/59/49072aba9bf8a8ed958e576182d46f038e595b17ff7408bc7e8807e721e1/propcache-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:952e0d9d07609d9c5be361f33b0d6d650cd2bae393aabb11d9b719364521984b", size = 46301 }, + { url = "https://files.pythonhosted.org/packages/33/a2/6b1978c2e0d80a678e2c483f45e5443c15fe5d32c483902e92a073314ef1/propcache-0.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:33ac8f098df0585c0b53009f039dfd913b38c1d2edafed0cedcc0c32a05aa110", size = 45581 }, + { url = "https://files.pythonhosted.org/packages/43/95/55acc9adff8f997c7572f23d41993042290dfb29e404cdadb07039a4386f/propcache-0.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97e48e8875e6c13909c800fa344cd54cc4b2b0db1d5f911f840458a500fde2c2", size = 208659 }, + { url = "https://files.pythonhosted.org/packages/bd/2c/ef7371ff715e6cd19ea03fdd5637ecefbaa0752fee5b0f2fe8ea8407ee01/propcache-0.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388f3217649d6d59292b722d940d4d2e1e6a7003259eb835724092a1cca0203a", size = 222613 }, + { url = "https://files.pythonhosted.org/packages/5e/1c/fef251f79fd4971a413fa4b1ae369ee07727b4cc2c71e2d90dfcde664fbb/propcache-0.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f571aea50ba5623c308aa146eb650eebf7dbe0fd8c5d946e28343cb3b5aad577", size = 221067 }, + { url = "https://files.pythonhosted.org/packages/8d/e7/22e76ae6fc5a1708bdce92bdb49de5ebe89a173db87e4ef597d6bbe9145a/propcache-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3dfafb44f7bb35c0c06eda6b2ab4bfd58f02729e7c4045e179f9a861b07c9850", size = 208920 }, + { url = "https://files.pythonhosted.org/packages/04/3e/f10aa562781bcd8a1e0b37683a23bef32bdbe501d9cc7e76969becaac30d/propcache-0.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3ebe9a75be7ab0b7da2464a77bb27febcb4fab46a34f9288f39d74833db7f61", size = 200050 }, + { url = 
"https://files.pythonhosted.org/packages/d0/98/8ac69f638358c5f2a0043809c917802f96f86026e86726b65006830f3dc6/propcache-0.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d2f0d0f976985f85dfb5f3d685697ef769faa6b71993b46b295cdbbd6be8cc37", size = 202346 }, + { url = "https://files.pythonhosted.org/packages/ee/78/4acfc5544a5075d8e660af4d4e468d60c418bba93203d1363848444511ad/propcache-0.2.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:a3dc1a4b165283bd865e8f8cb5f0c64c05001e0718ed06250d8cac9bec115b48", size = 199750 }, + { url = "https://files.pythonhosted.org/packages/a2/8f/90ada38448ca2e9cf25adc2fe05d08358bda1b9446f54a606ea38f41798b/propcache-0.2.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9e0f07b42d2a50c7dd2d8675d50f7343d998c64008f1da5fef888396b7f84630", size = 201279 }, + { url = "https://files.pythonhosted.org/packages/08/31/0e299f650f73903da851f50f576ef09bfffc8e1519e6a2f1e5ed2d19c591/propcache-0.2.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e63e3e1e0271f374ed489ff5ee73d4b6e7c60710e1f76af5f0e1a6117cd26394", size = 211035 }, + { url = "https://files.pythonhosted.org/packages/85/3e/e356cc6b09064bff1c06d0b2413593e7c925726f0139bc7acef8a21e87a8/propcache-0.2.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:56bb5c98f058a41bb58eead194b4db8c05b088c93d94d5161728515bd52b052b", size = 215565 }, + { url = "https://files.pythonhosted.org/packages/8b/54/4ef7236cd657e53098bd05aa59cbc3cbf7018fba37b40eaed112c3921e51/propcache-0.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7665f04d0c7f26ff8bb534e1c65068409bf4687aa2534faf7104d7182debb336", size = 207604 }, + { url = "https://files.pythonhosted.org/packages/1f/27/d01d7799c068443ee64002f0655d82fb067496897bf74b632e28ee6a32cf/propcache-0.2.0-cp310-cp310-win32.whl", hash = "sha256:7cf18abf9764746b9c8704774d8b06714bcb0a63641518a3a89c7f85cc02c2ad", size = 40526 }, + { url = "https://files.pythonhosted.org/packages/bb/44/6c2add5eeafb7f31ff0d25fbc005d930bea040a1364cf0f5768750ddf4d1/propcache-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:cfac69017ef97db2438efb854edf24f5a29fd09a536ff3a992b75990720cdc99", size = 44958 }, + { url = "https://files.pythonhosted.org/packages/e0/1c/71eec730e12aec6511e702ad0cd73c2872eccb7cad39de8ba3ba9de693ef/propcache-0.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:63f13bf09cc3336eb04a837490b8f332e0db41da66995c9fd1ba04552e516354", size = 80811 }, + { url = "https://files.pythonhosted.org/packages/89/c3/7e94009f9a4934c48a371632197406a8860b9f08e3f7f7d922ab69e57a41/propcache-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:608cce1da6f2672a56b24a015b42db4ac612ee709f3d29f27a00c943d9e851de", size = 46365 }, + { url = "https://files.pythonhosted.org/packages/c0/1d/c700d16d1d6903aeab28372fe9999762f074b80b96a0ccc953175b858743/propcache-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:466c219deee4536fbc83c08d09115249db301550625c7fef1c5563a584c9bc87", size = 45602 }, + { url = "https://files.pythonhosted.org/packages/2e/5e/4a3e96380805bf742712e39a4534689f4cddf5fa2d3a93f22e9fd8001b23/propcache-0.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc2db02409338bf36590aa985a461b2c96fce91f8e7e0f14c50c5fcc4f229016", size = 236161 }, + { url = "https://files.pythonhosted.org/packages/a5/85/90132481183d1436dff6e29f4fa81b891afb6cb89a7306f32ac500a25932/propcache-0.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a6ed8db0a556343d566a5c124ee483ae113acc9a557a807d439bcecc44e7dfbb", size = 244938 }, + { url = 
"https://files.pythonhosted.org/packages/4a/89/c893533cb45c79c970834274e2d0f6d64383ec740be631b6a0a1d2b4ddc0/propcache-0.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:91997d9cb4a325b60d4e3f20967f8eb08dfcb32b22554d5ef78e6fd1dda743a2", size = 243576 }, + { url = "https://files.pythonhosted.org/packages/8c/56/98c2054c8526331a05f205bf45cbb2cda4e58e56df70e76d6a509e5d6ec6/propcache-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c7dde9e533c0a49d802b4f3f218fa9ad0a1ce21f2c2eb80d5216565202acab4", size = 236011 }, + { url = "https://files.pythonhosted.org/packages/2d/0c/8b8b9f8a6e1abd869c0fa79b907228e7abb966919047d294ef5df0d136cf/propcache-0.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffcad6c564fe6b9b8916c1aefbb37a362deebf9394bd2974e9d84232e3e08504", size = 224834 }, + { url = "https://files.pythonhosted.org/packages/18/bb/397d05a7298b7711b90e13108db697732325cafdcd8484c894885c1bf109/propcache-0.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:97a58a28bcf63284e8b4d7b460cbee1edaab24634e82059c7b8c09e65284f178", size = 224946 }, + { url = "https://files.pythonhosted.org/packages/25/19/4fc08dac19297ac58135c03770b42377be211622fd0147f015f78d47cd31/propcache-0.2.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:945db8ee295d3af9dbdbb698cce9bbc5c59b5c3fe328bbc4387f59a8a35f998d", size = 217280 }, + { url = "https://files.pythonhosted.org/packages/7e/76/c79276a43df2096ce2aba07ce47576832b1174c0c480fe6b04bd70120e59/propcache-0.2.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:39e104da444a34830751715f45ef9fc537475ba21b7f1f5b0f4d71a3b60d7fe2", size = 220088 }, + { url = "https://files.pythonhosted.org/packages/c3/9a/8a8cf428a91b1336b883f09c8b884e1734c87f724d74b917129a24fe2093/propcache-0.2.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c5ecca8f9bab618340c8e848d340baf68bcd8ad90a8ecd7a4524a81c1764b3db", size = 233008 }, + { url = "https://files.pythonhosted.org/packages/25/7b/768a8969abd447d5f0f3333df85c6a5d94982a1bc9a89c53c154bf7a8b11/propcache-0.2.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:c436130cc779806bdf5d5fae0d848713105472b8566b75ff70048c47d3961c5b", size = 237719 }, + { url = "https://files.pythonhosted.org/packages/ed/0d/e5d68ccc7976ef8b57d80613ac07bbaf0614d43f4750cf953f0168ef114f/propcache-0.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:191db28dc6dcd29d1a3e063c3be0b40688ed76434622c53a284e5427565bbd9b", size = 227729 }, + { url = "https://files.pythonhosted.org/packages/05/64/17eb2796e2d1c3d0c431dc5f40078d7282f4645af0bb4da9097fbb628c6c/propcache-0.2.0-cp311-cp311-win32.whl", hash = "sha256:5f2564ec89058ee7c7989a7b719115bdfe2a2fb8e7a4543b8d1c0cc4cf6478c1", size = 40473 }, + { url = "https://files.pythonhosted.org/packages/83/c5/e89fc428ccdc897ade08cd7605f174c69390147526627a7650fb883e0cd0/propcache-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:6e2e54267980349b723cff366d1e29b138b9a60fa376664a157a342689553f71", size = 44921 }, + { url = "https://files.pythonhosted.org/packages/7c/46/a41ca1097769fc548fc9216ec4c1471b772cc39720eb47ed7e38ef0006a9/propcache-0.2.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2ee7606193fb267be4b2e3b32714f2d58cad27217638db98a60f9efb5efeccc2", size = 80800 }, + { url = "https://files.pythonhosted.org/packages/75/4f/93df46aab9cc473498ff56be39b5f6ee1e33529223d7a4d8c0a6101a9ba2/propcache-0.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:91ee8fc02ca52e24bcb77b234f22afc03288e1dafbb1f88fe24db308910c4ac7", size = 46443 }, + { url = "https://files.pythonhosted.org/packages/0b/17/308acc6aee65d0f9a8375e36c4807ac6605d1f38074b1581bd4042b9fb37/propcache-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e900bad2a8456d00a113cad8c13343f3b1f327534e3589acc2219729237a2e8", size = 45676 }, + { url = "https://files.pythonhosted.org/packages/65/44/626599d2854d6c1d4530b9a05e7ff2ee22b790358334b475ed7c89f7d625/propcache-0.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f52a68c21363c45297aca15561812d542f8fc683c85201df0bebe209e349f793", size = 246191 }, + { url = "https://files.pythonhosted.org/packages/f2/df/5d996d7cb18df076debae7d76ac3da085c0575a9f2be6b1f707fe227b54c/propcache-0.2.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e41d67757ff4fbc8ef2af99b338bfb955010444b92929e9e55a6d4dcc3c4f09", size = 251791 }, + { url = "https://files.pythonhosted.org/packages/2e/6d/9f91e5dde8b1f662f6dd4dff36098ed22a1ef4e08e1316f05f4758f1576c/propcache-0.2.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a64e32f8bd94c105cc27f42d3b658902b5bcc947ece3c8fe7bc1b05982f60e89", size = 253434 }, + { url = "https://files.pythonhosted.org/packages/3c/e9/1b54b7e26f50b3e0497cd13d3483d781d284452c2c50dd2a615a92a087a3/propcache-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55346705687dbd7ef0d77883ab4f6fabc48232f587925bdaf95219bae072491e", size = 248150 }, + { url = "https://files.pythonhosted.org/packages/a7/ef/a35bf191c8038fe3ce9a414b907371c81d102384eda5dbafe6f4dce0cf9b/propcache-0.2.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00181262b17e517df2cd85656fcd6b4e70946fe62cd625b9d74ac9977b64d8d9", size = 233568 }, + { url = "https://files.pythonhosted.org/packages/97/d9/d00bb9277a9165a5e6d60f2142cd1a38a750045c9c12e47ae087f686d781/propcache-0.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6994984550eaf25dd7fc7bd1b700ff45c894149341725bb4edc67f0ffa94efa4", size = 229874 }, + { url = "https://files.pythonhosted.org/packages/8e/78/c123cf22469bdc4b18efb78893e69c70a8b16de88e6160b69ca6bdd88b5d/propcache-0.2.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:56295eb1e5f3aecd516d91b00cfd8bf3a13991de5a479df9e27dd569ea23959c", size = 225857 }, + { url = "https://files.pythonhosted.org/packages/31/1b/fd6b2f1f36d028820d35475be78859d8c89c8f091ad30e377ac49fd66359/propcache-0.2.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:439e76255daa0f8151d3cb325f6dd4a3e93043e6403e6491813bcaaaa8733887", size = 227604 }, + { url = "https://files.pythonhosted.org/packages/99/36/b07be976edf77a07233ba712e53262937625af02154353171716894a86a6/propcache-0.2.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f6475a1b2ecb310c98c28d271a30df74f9dd436ee46d09236a6b750a7599ce57", size = 238430 }, + { url = "https://files.pythonhosted.org/packages/0d/64/5822f496c9010e3966e934a011ac08cac8734561842bc7c1f65586e0683c/propcache-0.2.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3444cdba6628accf384e349014084b1cacd866fbb88433cd9d279d90a54e0b23", size = 244814 }, + { url = "https://files.pythonhosted.org/packages/fd/bd/8657918a35d50b18a9e4d78a5df7b6c82a637a311ab20851eef4326305c1/propcache-0.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4a9d9b4d0a9b38d1c391bb4ad24aa65f306c6f01b512e10a8a34a2dc5675d348", size = 235922 }, + { url = 
"https://files.pythonhosted.org/packages/a8/6f/ec0095e1647b4727db945213a9f395b1103c442ef65e54c62e92a72a3f75/propcache-0.2.0-cp312-cp312-win32.whl", hash = "sha256:69d3a98eebae99a420d4b28756c8ce6ea5a29291baf2dc9ff9414b42676f61d5", size = 40177 }, + { url = "https://files.pythonhosted.org/packages/20/a2/bd0896fdc4f4c1db46d9bc361c8c79a9bf08ccc08ba054a98e38e7ba1557/propcache-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:ad9c9b99b05f163109466638bd30ada1722abb01bbb85c739c50b6dc11f92dc3", size = 44446 }, + { url = "https://files.pythonhosted.org/packages/a8/a7/5f37b69197d4f558bfef5b4bceaff7c43cc9b51adf5bd75e9081d7ea80e4/propcache-0.2.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ecddc221a077a8132cf7c747d5352a15ed763b674c0448d811f408bf803d9ad7", size = 78120 }, + { url = "https://files.pythonhosted.org/packages/c8/cd/48ab2b30a6b353ecb95a244915f85756d74f815862eb2ecc7a518d565b48/propcache-0.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0e53cb83fdd61cbd67202735e6a6687a7b491c8742dfc39c9e01e80354956763", size = 45127 }, + { url = "https://files.pythonhosted.org/packages/a5/ba/0a1ef94a3412aab057bd996ed5f0ac7458be5bf469e85c70fa9ceb43290b/propcache-0.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92fe151145a990c22cbccf9ae15cae8ae9eddabfc949a219c9f667877e40853d", size = 44419 }, + { url = "https://files.pythonhosted.org/packages/b4/6c/ca70bee4f22fa99eacd04f4d2f1699be9d13538ccf22b3169a61c60a27fa/propcache-0.2.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6a21ef516d36909931a2967621eecb256018aeb11fc48656e3257e73e2e247a", size = 229611 }, + { url = "https://files.pythonhosted.org/packages/19/70/47b872a263e8511ca33718d96a10c17d3c853aefadeb86dc26e8421184b9/propcache-0.2.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f88a4095e913f98988f5b338c1d4d5d07dbb0b6bad19892fd447484e483ba6b", size = 234005 }, + { url = "https://files.pythonhosted.org/packages/4f/be/3b0ab8c84a22e4a3224719099c1229ddfdd8a6a1558cf75cb55ee1e35c25/propcache-0.2.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a5b3bb545ead161be780ee85a2b54fdf7092815995661947812dde94a40f6fb", size = 237270 }, + { url = "https://files.pythonhosted.org/packages/04/d8/f071bb000d4b8f851d312c3c75701e586b3f643fe14a2e3409b1b9ab3936/propcache-0.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67aeb72e0f482709991aa91345a831d0b707d16b0257e8ef88a2ad246a7280bf", size = 231877 }, + { url = "https://files.pythonhosted.org/packages/93/e7/57a035a1359e542bbb0a7df95aad6b9871ebee6dce2840cb157a415bd1f3/propcache-0.2.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c997f8c44ec9b9b0bcbf2d422cc00a1d9b9c681f56efa6ca149a941e5560da2", size = 217848 }, + { url = "https://files.pythonhosted.org/packages/f0/93/d1dea40f112ec183398fb6c42fde340edd7bab202411c4aa1a8289f461b6/propcache-0.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2a66df3d4992bc1d725b9aa803e8c5a66c010c65c741ad901e260ece77f58d2f", size = 216987 }, + { url = "https://files.pythonhosted.org/packages/62/4c/877340871251145d3522c2b5d25c16a1690ad655fbab7bb9ece6b117e39f/propcache-0.2.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:3ebbcf2a07621f29638799828b8d8668c421bfb94c6cb04269130d8de4fb7136", size = 212451 }, + { url = "https://files.pythonhosted.org/packages/7c/bb/a91b72efeeb42906ef58ccf0cdb87947b54d7475fee3c93425d732f16a61/propcache-0.2.0-cp313-cp313-musllinux_1_2_i686.whl", hash = 
"sha256:1235c01ddaa80da8235741e80815ce381c5267f96cc49b1477fdcf8c047ef325", size = 212879 }, + { url = "https://files.pythonhosted.org/packages/9b/7f/ee7fea8faac57b3ec5d91ff47470c6c5d40d7f15d0b1fccac806348fa59e/propcache-0.2.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3947483a381259c06921612550867b37d22e1df6d6d7e8361264b6d037595f44", size = 222288 }, + { url = "https://files.pythonhosted.org/packages/ff/d7/acd67901c43d2e6b20a7a973d9d5fd543c6e277af29b1eb0e1f7bd7ca7d2/propcache-0.2.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d5bed7f9805cc29c780f3aee05de3262ee7ce1f47083cfe9f77471e9d6777e83", size = 228257 }, + { url = "https://files.pythonhosted.org/packages/8d/6f/6272ecc7a8daad1d0754cfc6c8846076a8cb13f810005c79b15ce0ef0cf2/propcache-0.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e4a91d44379f45f5e540971d41e4626dacd7f01004826a18cb048e7da7e96544", size = 221075 }, + { url = "https://files.pythonhosted.org/packages/7c/bd/c7a6a719a6b3dd8b3aeadb3675b5783983529e4a3185946aa444d3e078f6/propcache-0.2.0-cp313-cp313-win32.whl", hash = "sha256:f902804113e032e2cdf8c71015651c97af6418363bea8d78dc0911d56c335032", size = 39654 }, + { url = "https://files.pythonhosted.org/packages/88/e7/0eef39eff84fa3e001b44de0bd41c7c0e3432e7648ffd3d64955910f002d/propcache-0.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:8f188cfcc64fb1266f4684206c9de0e80f54622c3f22a910cbd200478aeae61e", size = 43705 }, + { url = "https://files.pythonhosted.org/packages/3d/b6/e6d98278f2d49b22b4d033c9f792eda783b9ab2094b041f013fc69bcde87/propcache-0.2.0-py3-none-any.whl", hash = "sha256:2ccc28197af5313706511fab3a8b66dcd6da067a1331372c82ea1cb74285e036", size = 11603 }, +] + [[package]] name = "proto-plus" version = "1.24.0" @@ -3459,6 +4144,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842 }, ] +[[package]] +name = "py7zr" +version = "0.22.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "brotli", marker = "platform_python_implementation == 'CPython'" }, + { name = "brotlicffi", marker = "platform_python_implementation == 'PyPy'" }, + { name = "inflate64" }, + { name = "multivolumefile" }, + { name = "psutil", marker = "sys_platform != 'cygwin'" }, + { name = "pybcj" }, + { name = "pycryptodomex" }, + { name = "pyppmd" }, + { name = "pyzstd" }, + { name = "texttable" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/26/c3/0e05c711c16af0b9c47f3f77323303b338b9a871ba020d95d2b8dd6605ae/py7zr-0.22.0.tar.gz", hash = "sha256:c6c7aea5913535184003b73938490f9a4d8418598e533f9ca991d3b8e45a139e", size = 4992926 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/59/dd1750002c0f46099281116f8165247bc62dc85edad41cdd26e7b26de19d/py7zr-0.22.0-py3-none-any.whl", hash = "sha256:993b951b313500697d71113da2681386589b7b74f12e48ba13cc12beca79d078", size = 67906 }, +] + [[package]] name = "pyarrow" version = "17.0.0" @@ -3512,6 +4218,38 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/77/89/bc88a6711935ba795a679ea6ebee07e128050d6382eaa35a0a47c8032bdc/pyasn1_modules-0.4.1-py3-none-any.whl", hash = "sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd", size = 181537 }, ] +[[package]] +name = "pybcj" +version = "1.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/37/d2/22e808b9d25ce3b43f5c8a9e22d873d403485ba55d84a4d6d5d044881762/pybcj-1.0.2.tar.gz", hash = "sha256:c7f5bef7f47723c53420e377bc64d2553843bee8bcac5f0ad076ab1524780018", size = 2111002 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/93/4735636b5905b7597068a2c7a10a8df0f668f28659207c274d64a4468b97/pybcj-1.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7bff28d97e47047d69a4ac6bf59adda738cf1d00adde8819117fdb65d966bdbc", size = 32556 }, + { url = "https://files.pythonhosted.org/packages/a6/37/443cd704397b6df54ff0822032e4815aca4e9badabc5ce1faac34235a40c/pybcj-1.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:198e0b4768b4025eb3309273d7e81dc53834b9a50092be6e0d9b3983cfd35c35", size = 23751 }, + { url = "https://files.pythonhosted.org/packages/9a/aa/5a19ed8661e979a4d3237a11706f9a16a474a2227fdd99ccb284be100a98/pybcj-1.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fa26415b4a118ea790de9d38f244312f2510a9bb5c65e560184d241a6f391a2d", size = 23980 }, + { url = "https://files.pythonhosted.org/packages/fe/5f/638ce03948905d267c8c0ccab81b8b4943a0324f63d8bdb0a0e2a85d4503/pybcj-1.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fabb2be57e4ca28ea36c13146cdf97d73abd27c51741923fc6ba1e8cd33e255c", size = 50155 }, + { url = "https://files.pythonhosted.org/packages/09/70/8b6a6cc2a5721f67f629bdc17875c0d603d57f360a19b099a7b4de19383d/pybcj-1.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75d6d613bae6f27678d5e44e89d61018779726aa6aa950c516d33a04b8af8c59", size = 49729 }, + { url = "https://files.pythonhosted.org/packages/89/06/2e41e34da0bb2adb3644cbf4366c344e5804a10f1153da7b3a23333f7db8/pybcj-1.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3ffae79ef8a1ea81ea2748ad7b7ad9b882aa88ddf65ce90f9e944df639eccc61", size = 54310 }, + { url = "https://files.pythonhosted.org/packages/b5/0f/de9e76c305d4dcd9d428a90ccac030f06c780bc30549fc449a944a6321bc/pybcj-1.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bdb4d8ff5cba3e0bd1adee7d20dbb2b4d80cb31ac04d6ea1cd06cfc02d2ecd0d", size = 53679 }, + { url = "https://files.pythonhosted.org/packages/1a/41/a807ff6b77ec8e49c749ed1d0db5649fbb1150c6fb5fb391115f4f1d743a/pybcj-1.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a29be917fbc99eca204b08407e0971e0205bfdad4b74ec915930675f352b669d", size = 24690 }, + { url = "https://files.pythonhosted.org/packages/27/0a/20bf70a7eb7c6b2668ff2af798254033c32a09d6c58ec9a87cd6aa843df5/pybcj-1.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a2562ebe5a0abec4da0229f8abb5e90ee97b178f19762eb925c1159be36828b3", size = 32581 }, + { url = "https://files.pythonhosted.org/packages/a9/b6/43977fe4296d2778c6dc67b596bb6a851eaea80f3dd4ff454e5fca8142c2/pybcj-1.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:af19bc61ded933001cd68f004ae2042bf1a78eb498a3c685ebd655fa1be90dbe", size = 23767 }, + { url = "https://files.pythonhosted.org/packages/89/c7/a61010f59406b8a45bb4865faa4b61d6b177dcfac04247fb56c7538d997d/pybcj-1.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f3f4a447800850aba7724a2274ea0a4800724520c1caf38f7d0dabf2f89a5e15", size = 23976 }, + { url = "https://files.pythonhosted.org/packages/10/7a/78848edbb6f12d9b86e375fc46135d9a204ededbf96682b05cb4b4fbd942/pybcj-1.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce1c8af7a4761d2b1b531864d84113948daa0c4245775c63bd9874cb955f4662", size = 51246 }, + { url = 
"https://files.pythonhosted.org/packages/9e/13/af86c86cdfb293e82dd0b6c4bbdf08645cd8993456ee3fb911c3eeed1b22/pybcj-1.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8007371f6f2b462f5aa05d5c2135d0a1bcf5b7bdd9bd15d86c730f588d10b7d3", size = 50754 }, + { url = "https://files.pythonhosted.org/packages/39/52/88600aa374b100612a1d82fca4b03eb4315e0084a05ee314ba1b771f7190/pybcj-1.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1079ca63ff8da5c936b76863690e0bd2489e8d4e0a3a340e032095dae805dd91", size = 55334 }, + { url = "https://files.pythonhosted.org/packages/56/67/3cf9747ef5b53e16a844217c6c9840be6289d05ec785500da2cc55cc25f2/pybcj-1.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e9a785eb26884429d9b9f6326e68c3638828c83bf6d42d2463c97ad5385caff2", size = 54714 }, + { url = "https://files.pythonhosted.org/packages/78/81/a71197903b503f54b85f4d352f909e701e9d26953577bd34d3fbe0520d5d/pybcj-1.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:9ea46e2d45469d13b7f25b08efcdb140220bab1ac5a850db0954591715b8caaa", size = 24693 }, + { url = "https://files.pythonhosted.org/packages/83/60/a3b43836895654aa93b5a8422adc3717359db98da9147abfabffef79f1e7/pybcj-1.0.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:21b5f2460629167340403d359289a173e0729ce8e84e3ce99462009d5d5e01a4", size = 32677 }, + { url = "https://files.pythonhosted.org/packages/50/b9/96c8d9577b0f5a701e4497408e6a331a08eb902aca8dfd4c5bb1eaab4779/pybcj-1.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:2940fb85730b9869254559c491cd83cf777e56c76a8a60df60e4be4f2a4248d7", size = 23813 }, + { url = "https://files.pythonhosted.org/packages/b7/1a/c80132feb084ec4098c0315a132799bddda8878113b5f956e21c4377f5f1/pybcj-1.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f40f3243139d675f43793a4e35c410c370f7b91ccae74e70c8b2f4877869f90e", size = 24019 }, + { url = "https://files.pythonhosted.org/packages/b1/94/62c3bf8a60b4787b46e21f43277d9cb8b6037c8ee183450f035a19a2bc4b/pybcj-1.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c2b3e60b65c7ac73e44335934e1e122da8d56db87840984601b3c5dc0ae4c19", size = 51927 }, + { url = "https://files.pythonhosted.org/packages/8b/9e/4ebd092251ef8d15408388be508617d5949cbba4baa2a6cfbb7e0a9b62c0/pybcj-1.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:746550dc7b5af4d04bb5fa4d065f18d39c925bcb5dee30db75747cd9a58bb6e8", size = 51665 }, + { url = "https://files.pythonhosted.org/packages/24/ea/da4637563468854bd361a69cd883946015f54fa119a5d9c655d26f151954/pybcj-1.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:8ce9b62b6aaa5b08773be8a919ecc4e865396c969f982b685eeca6e80c82abb7", size = 56041 }, + { url = "https://files.pythonhosted.org/packages/cf/b2/9b9e670818af925ed9a0168a5c021ccfcc089637d0e6651d16fd05896425/pybcj-1.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:493eab2b1f6f546730a6de0c5ceb75ce16f3767154e8ae30e2b70d41b928b7d2", size = 55606 }, + { url = "https://files.pythonhosted.org/packages/72/e9/d6b1bdf3a5aca8f3981145a5228ad51d72e2477a55927604a4768765e915/pybcj-1.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:ef55b96b7f2ed823e0b924de902065ec42ade856366c287dbb073fabd6b90ec1", size = 24719 }, +] + [[package]] name = "pycparser" version = "2.22" @@ -3521,6 +4259,30 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = 
"sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552 }, ] +[[package]] +name = "pycryptodomex" +version = "3.21.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/11/dc/e66551683ade663b5f07d7b3bc46434bf703491dbd22ee12d1f979ca828f/pycryptodomex-3.21.0.tar.gz", hash = "sha256:222d0bd05381dd25c32dd6065c071ebf084212ab79bab4599ba9e6a3e0009e6c", size = 4818543 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/5e/99f217d9881eead69607a2248dd7bbdf610837d7f5ad53f45a6cb71bbbfb/pycryptodomex-3.21.0-cp36-abi3-macosx_10_9_universal2.whl", hash = "sha256:34325b84c8b380675fd2320d0649cdcbc9cf1e0d1526edbe8fce43ed858cdc7e", size = 2499490 }, + { url = "https://files.pythonhosted.org/packages/ce/8f/4d0e2a859a6470289d64e39b419f01d2494dfa2e4995342d50f6c2834237/pycryptodomex-3.21.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:103c133d6cd832ae7266feb0a65b69e3a5e4dbbd6f3a3ae3211a557fd653f516", size = 1638037 }, + { url = "https://files.pythonhosted.org/packages/0c/9e/6e748c1fa814c956d356f93cf7192b19487ca56fc9e2a0bcde2bbc057601/pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77ac2ea80bcb4b4e1c6a596734c775a1615d23e31794967416afc14852a639d3", size = 2172279 }, + { url = "https://files.pythonhosted.org/packages/46/3f/f5bef92b11750af9e3516d4e69736eeeff20a2818d34611508bef5a7b381/pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9aa0cf13a1a1128b3e964dc667e5fe5c6235f7d7cfb0277213f0e2a783837cc2", size = 2258130 }, + { url = "https://files.pythonhosted.org/packages/de/4d/f0c65afd64ce435fd0547187ce6f99dfb37cdde16b05b57bca9f5c06966e/pycryptodomex-3.21.0-cp36-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:46eb1f0c8d309da63a2064c28de54e5e614ad17b7e2f88df0faef58ce192fc7b", size = 2297719 }, + { url = "https://files.pythonhosted.org/packages/1c/6a/2a1a101b0345ee70376ba93df8de6c8c01aac8341fda02970800873456a7/pycryptodomex-3.21.0-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:cc7e111e66c274b0df5f4efa679eb31e23c7545d702333dfd2df10ab02c2a2ce", size = 2164079 }, + { url = "https://files.pythonhosted.org/packages/3d/00/90a15f16c234815b660303c2d7266b41b401ea2605f3a90373e9d425e39f/pycryptodomex-3.21.0-cp36-abi3-musllinux_1_2_i686.whl", hash = "sha256:770d630a5c46605ec83393feaa73a9635a60e55b112e1fb0c3cea84c2897aa0a", size = 2333060 }, + { url = "https://files.pythonhosted.org/packages/61/74/49f5d20c514ccc631b940cc9dfec45dcce418dc84a98463a2e2ebec33904/pycryptodomex-3.21.0-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:52e23a0a6e61691134aa8c8beba89de420602541afaae70f66e16060fdcd677e", size = 2257982 }, + { url = "https://files.pythonhosted.org/packages/92/4b/d33ef74e2cc0025a259936661bb53432c5bbbadc561c5f2e023bcd73ce4c/pycryptodomex-3.21.0-cp36-abi3-win32.whl", hash = "sha256:a3d77919e6ff56d89aada1bd009b727b874d464cb0e2e3f00a49f7d2e709d76e", size = 1779052 }, + { url = "https://files.pythonhosted.org/packages/5b/be/7c991840af1184009fc86267160948350d1bf875f153c97bb471ad944e40/pycryptodomex-3.21.0-cp36-abi3-win_amd64.whl", hash = "sha256:b0e9765f93fe4890f39875e6c90c96cb341767833cfa767f41b490b506fa9ec0", size = 1816307 }, + { url = "https://files.pythonhosted.org/packages/af/ac/24125ad36778914a36f08d61ba5338cb9159382c638d9761ee19c8de822c/pycryptodomex-3.21.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = 
"sha256:feaecdce4e5c0045e7a287de0c4351284391fe170729aa9182f6bd967631b3a8", size = 1694999 }, + { url = "https://files.pythonhosted.org/packages/93/73/be7a54a5903508070e5508925ba94493a1f326cfeecfff750e3eb250ea28/pycryptodomex-3.21.0-pp27-pypy_73-win32.whl", hash = "sha256:365aa5a66d52fd1f9e0530ea97f392c48c409c2f01ff8b9a39c73ed6f527d36c", size = 1769437 }, + { url = "https://files.pythonhosted.org/packages/e5/9f/39a6187f3986841fa6a9f35c6fdca5030ef73ff708b45a993813a51d7d10/pycryptodomex-3.21.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3efddfc50ac0ca143364042324046800c126a1d63816d532f2e19e6f2d8c0c31", size = 1619607 }, + { url = "https://files.pythonhosted.org/packages/f8/70/60bb08e9e9841b18d4669fb69d84b64ce900aacd7eb0ebebd4c7b9bdecd3/pycryptodomex-3.21.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df2608682db8279a9ebbaf05a72f62a321433522ed0e499bc486a6889b96bf3", size = 1653571 }, + { url = "https://files.pythonhosted.org/packages/c9/6f/191b73509291c5ff0dddec9cc54797b1d73303c12b2e4017b24678e57099/pycryptodomex-3.21.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5823d03e904ea3e53aebd6799d6b8ec63b7675b5d2f4a4bd5e3adcb512d03b37", size = 1691548 }, + { url = "https://files.pythonhosted.org/packages/2d/c7/a0d3356f3074ac548afefa515ff46f3bea011deca607faf1c09b26dd5330/pycryptodomex-3.21.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:27e84eeff24250ffec32722334749ac2a57a5fd60332cd6a0680090e7c42877e", size = 1792099 }, +] + [[package]] name = "pydantic" version = "2.8.2" @@ -3751,6 +4513,52 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928", size = 232572 }, ] +[[package]] +name = "pyppmd" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/39/c8/9039c7503577de08a3f4c81e7619583efdc16030da6d1a25268d3dca49c8/pyppmd-1.1.0.tar.gz", hash = "sha256:1d38ce2e4b7eb84b53bc8a52380b94f66ba6c39328b8800b30c2b5bf31693973", size = 1348949 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6e/10/b19621035862e2ae12a1ba14c5b5c0a0befb27906bc00691642d7bdbdce6/pyppmd-1.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5cd428715413fe55abf79dc9fc54924ba7e518053e1fc0cbdf80d0d99cf1442", size = 75756 }, + { url = "https://files.pythonhosted.org/packages/85/4a/a7c172cd431c4e1ddf9be349dc4bcfea81c2a236d2fe51bbfdcd697af55a/pyppmd-1.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0e96cc43f44b7658be2ea764e7fa99c94cb89164dbb7cdf209178effc2168319", size = 47347 }, + { url = "https://files.pythonhosted.org/packages/0d/32/f7357e0412e977ede4d63ba8bf55d014e5ea5b311818b2b0a1fee6d91baa/pyppmd-1.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dd20142869094bceef5ab0b160f4fff790ad1f612313a1e3393a51fc3ba5d57e", size = 46640 }, + { url = "https://files.pythonhosted.org/packages/b5/8e/1f416819f0aab17de47b15b72d0e9b05e2bf795c6e28d9f403ac01398b74/pyppmd-1.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4f9b51e45c11e805e74ea6f6355e98a6423b5bbd92f45aceee24761bdc3d3b8", size = 135666 }, + { url = "https://files.pythonhosted.org/packages/73/ac/7d07d3ac6874f235554de392de08e6a369001db43cd6a619af4fbe02fb55/pyppmd-1.1.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:459f85e928fb968d0e34fb6191fd8c4e710012d7d884fa2b317b2e11faac7c59", size = 132892 }, + { url = "https://files.pythonhosted.org/packages/09/76/61db4268a439cfba8736b14130d928d199633fab2360a2c5043332a427d2/pyppmd-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f73cf2aaf60477eef17f5497d14b6099d8be9748390ad2b83d1c88214d050c05", size = 138901 }, + { url = "https://files.pythonhosted.org/packages/8b/9c/546729489ae07c0d7c2bfe37c69ae1cd3ce35a18ab000480ea4e8f12754f/pyppmd-1.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:2ea3ae0e92c0b5345cd3a4e145e01bbd79c2d95355481ea5d833b5c0cb202a2d", size = 139725 }, + { url = "https://files.pythonhosted.org/packages/f7/db/4e734e97541554a389e7adb2a2a5c86ad8ae35c4dafe817b12fdc317de1a/pyppmd-1.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:775172c740133c0162a01c1a5443d0e312246881cdd6834421b644d89a634b91", size = 131598 }, + { url = "https://files.pythonhosted.org/packages/b1/8f/530e47290e07d2fdedfd345fc72af08226ccdd4cc913c2b895a8396c17b6/pyppmd-1.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:14421030f1d46f69829698bdd960698a3b3df0925e3c470e82cfcdd4446b7bc1", size = 142767 }, + { url = "https://files.pythonhosted.org/packages/a5/f9/16e0adfef500b171a96ed3c95f4a4d999f99cc79de3e415146808b19c2fb/pyppmd-1.1.0-cp310-cp310-win32.whl", hash = "sha256:b691264f9962532aca3bba5be848b6370e596d0a2ca722c86df388be08d0568a", size = 41283 }, + { url = "https://files.pythonhosted.org/packages/37/8d/c4846ab632e13ead87189f31bcc51fc825c75078d162a4a9dc8aed0a5b97/pyppmd-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:216b0d969a3f06e35fbfef979706d987d105fcb1e37b0b1324f01ee143719c4a", size = 46078 }, + { url = "https://files.pythonhosted.org/packages/27/0e/9db5d7c6ca3159aa0f07c0f1d5c59079176e7c57740a61aca62a39661178/pyppmd-1.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:1f8c51044ee4df1b004b10bf6b3c92f95ea86cfe1111210d303dca44a56e4282", size = 75781 }, + { url = "https://files.pythonhosted.org/packages/f0/1b/4894b5c71feee76d3dfccf4383b59841f9bfd27aecf912b6542a2ab1e073/pyppmd-1.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ac25b3a13d1ac9b8f0bde46952e10848adc79d932f2b548a6491ef8825ae0045", size = 47370 }, + { url = "https://files.pythonhosted.org/packages/50/98/57b2c281e546f682279bd4a2577045d1f6d527c8fa2151a990b2a9bc48c2/pyppmd-1.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c8d3003eebe6aabe22ba744a38a146ed58a25633420d5da882b049342b7c8036", size = 46633 }, + { url = "https://files.pythonhosted.org/packages/06/72/b7e37aa69b7a105bcc119bc171437fbcb104aef2568b68ec8ed21a3fcdd1/pyppmd-1.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c520656bc12100aa6388df27dd7ac738577f38bf43f4a4bea78e1861e579ea5", size = 138233 }, + { url = "https://files.pythonhosted.org/packages/60/73/4f53a3c7730e1cba3f210b35ed6779e0fe302739196f43452664e079c0b5/pyppmd-1.1.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8c2a3e807028159a705951f5cb5d005f94caed11d0984e59cc50506de543e22d", size = 135486 }, + { url = "https://files.pythonhosted.org/packages/31/7c/956ebf1f07506bb59e6f13ef068d91f1bec828758d399b455b175b668f6c/pyppmd-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec8a2447e69444703e2b273247bfcd4b540ec601780eff07da16344c62d2993d", size = 141183 }, + { url = "https://files.pythonhosted.org/packages/73/b4/4863499e012c555f4619dbebc5b83d79818e0161d9b6fb8b1e709fb1d6c7/pyppmd-1.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = 
"sha256:b9e0c8053e69cad6a92a0889b3324f567afc75475b4f54727de553ac4fc85780", size = 141752 }, + { url = "https://files.pythonhosted.org/packages/b4/cc/44e175222b31f86d0192d1d0d2c46c4bf0e933c9a06a65ff39596ad05666/pyppmd-1.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:5938d256e8d2a2853dc3af8bb58ae6b4a775c46fc891dbe1826a0b3ceb624031", size = 133921 }, + { url = "https://files.pythonhosted.org/packages/f1/d9/2f2e222d43ab274909e8dcd16d25cd4cc0245a8d59f93f8d6397cd4dc49f/pyppmd-1.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1ce5822d8bea920856232ccfb3c26b56b28b6846ea1b0eb3d5cb9592a026649e", size = 145191 }, + { url = "https://files.pythonhosted.org/packages/6d/e7/1214571442624e2314ed1ed5ba0081358335fc760fb455c3d8df83b118c6/pyppmd-1.1.0-cp311-cp311-win32.whl", hash = "sha256:2a9e894750f2a52b03e3bc0d7cf004d96c3475a59b1af7e797d808d7d29c9ffe", size = 41286 }, + { url = "https://files.pythonhosted.org/packages/8e/7f/d3cc8443bd2b56bc54ea205dcf73d70ef8d4342096ff33fc8719956f45e9/pyppmd-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:969555c72e72fe2b4dd944127521a8f2211caddb5df452bbc2506b5adfac539e", size = 46087 }, + { url = "https://files.pythonhosted.org/packages/bf/0b/4c8e3a92c4366a9aa2d801ab4bd7ba72bd1d214da890dd91ab4d73e52878/pyppmd-1.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9d6ef8fd818884e914bc209f7961c9400a4da50d178bba25efcef89f09ec9169", size = 76116 }, + { url = "https://files.pythonhosted.org/packages/e1/0b/45fdf5a28c810ed4d3c0cb05ae5346e2972cdbfe89f374b263e07c5b820d/pyppmd-1.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:95f28e2ecf3a9656bd7e766aaa1162b6872b575627f18715f8b046e8617c124a", size = 47633 }, + { url = "https://files.pythonhosted.org/packages/56/a4/4aa1d36d98f3786c8b12ac96ac8234d7dc3c2a9e8f5174a5698f424099ec/pyppmd-1.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:37f3557ea65ee417abcdf5f49d35df00bb9f6f252639cae57aeefcd0dd596133", size = 46704 }, + { url = "https://files.pythonhosted.org/packages/d9/70/a49389a6666f670db5ecc7caa37030c9a9abfeea455c387172584551a271/pyppmd-1.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e84b25d088d7727d50218f57f92127cdb839acd6ec3de670b6680a4cf0b2d2a", size = 139145 }, + { url = "https://files.pythonhosted.org/packages/30/4c/f08cdf618744a3cce0da106ecf6e427b24d27b0bb1484afc40b88ca23a39/pyppmd-1.1.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:99ed42891986dac8c2ecf52bddfb777900233d867aa18849dbba6f3335600466", size = 136618 }, + { url = "https://files.pythonhosted.org/packages/bb/e0/afc0fb971c893e9e72cc8d70df93c50b3f3ebb12b4bdb21f869b775faf7e/pyppmd-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6fe69b82634488ada75ba07efb90cd5866fa3d64a2c12932b6e8ae207a14e5f", size = 142757 }, + { url = "https://files.pythonhosted.org/packages/26/b2/793e92c7a66de0b0b8d777c3c4df3ee5a5bec7fbaf0b69ab7374cefefa43/pyppmd-1.1.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:60981ffde1fe6ade750b690b35318c41a1160a8505597fda2c39a74409671217", size = 142749 }, + { url = "https://files.pythonhosted.org/packages/5e/6e/a1bf750bc7ed025a06600c65917d02e3c6dea7dfa728746c7251d4910d37/pyppmd-1.1.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:46e8240315476f57aac23d71e6de003e122b65feba7c68f4cc46a089a82a7cd4", size = 135033 }, + { url = "https://files.pythonhosted.org/packages/1e/ee/4a12a4b1990f1fabb77f9ef94d2cd6c795690eec79ad135b8236dc59dbd2/pyppmd-1.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:c0308e2e76ecb4c878a18c2d7a7c61dbca89b4ef138f65d5f5ead139154dcdea", size = 146510 }, + { url = "https://files.pythonhosted.org/packages/04/cd/a6571420345315f5340ac10897726303ae07260cb025dc4a60371d1e8b97/pyppmd-1.1.0-cp312-cp312-win32.whl", hash = "sha256:b4fa4c27dc1314d019d921f2aa19e17f99250557e7569eeb70e180558f46af74", size = 41332 }, + { url = "https://files.pythonhosted.org/packages/c0/a4/af77129d671d6adcc6c82e1b0f03f0ad0b70c44ac70ed4c72b5c8952553b/pyppmd-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:c269d21e15f4175df27cf00296476097af76941f948734c642d7fb6e85b9b3b9", size = 46193 }, + { url = "https://files.pythonhosted.org/packages/6a/e2/1d5fbd6dde1234b635000072c8d1d87c7ed3acf01a3c4aa8082504d58bc5/pyppmd-1.1.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ad5da9f7592158e6b6b51d7cd15e536d8b23afbb4d22cba4e5744c7e0a3548b1", size = 41505 }, + { url = "https://files.pythonhosted.org/packages/24/66/9215c5dda61b3aa3259902a586dacd198b4b0793ab99228734091b5e7fa7/pyppmd-1.1.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc6543e7d12ef0a1466d291d655e3d6bca59c7336dbb53b62ccdd407822fb52b", size = 44814 }, + { url = "https://files.pythonhosted.org/packages/1a/87/cc2aa429688f238ae30f26b8334194a21e25643d3257c9e5b14cccdc578e/pyppmd-1.1.0-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a5e4008a45910e3c8c227f6f240de67eb14454c015dc3d8060fc41e230f395d3", size = 43629 }, + { url = "https://files.pythonhosted.org/packages/9f/96/cd3f64f6bdce091ffb6d2c1c23dc91e8b94e312a5d08cd648625555fb69e/pyppmd-1.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9301fa39d1fb0ed09a10b4c5d7f0074113e96a1ead16ba7310bedf95f7ef660c", size = 43911 }, + { url = "https://files.pythonhosted.org/packages/e6/ab/02ab90e2dddf2dd55e30e64fa0509627c6e0c86b26503a6df95ae55b1e45/pyppmd-1.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:59521a3c6028da0cb5780ba16880047b00163432a6b975da2f6123adfc1b0be8", size = 42427 }, +] + [[package]] name = "pyproj" version = "3.6.1" @@ -4073,6 +4881,76 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/d2/3b2ab40f455a256cb6672186bea95cd97b459ce4594050132d71e76f0d6f/pyzmq-26.2.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:90412f2db8c02a3864cbfc67db0e3dcdbda336acf1c469526d3e869394fe001c", size = 550762 }, ] +[[package]] +name = "pyzstd" +version = "0.16.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/62/14/878fee4072cecb1cc6e061c7d0d933e481389c27de939538c9cc3f18894a/pyzstd-0.16.2.tar.gz", hash = "sha256:179c1a2ea1565abf09c5f2fd72f9ce7c54b2764cf7369e05c0bfd8f1f67f63d2", size = 789505 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3d/a9/efad061c5a982f859ba8bf5de565d73567f87ad8bba3364fe28e9a8672b6/pyzstd-0.16.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:637376c8f8cbd0afe1cab613f8c75fd502bd1016bf79d10760a2d5a00905fe62", size = 372191 }, + { url = "https://files.pythonhosted.org/packages/b6/36/eb6dcfacb273ca13dfa20d296f27ffd0a6c53677965f868625edf764b71e/pyzstd-0.16.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3e7a7118cbcfa90ca2ddbf9890c7cb582052a9a8cf2b7e2c1bbaf544bee0f16a", size = 295083 }, + { url = "https://files.pythonhosted.org/packages/fb/76/a7862487402123f221439808ed50915e00cfc8e1df7365af366610176347/pyzstd-0.16.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a74cb1ba05876179525144511eed3bd5a509b0ab2b10632c1215a85db0834dfd", size 
= 390166 }, + { url = "https://files.pythonhosted.org/packages/b8/52/1e1ab63026d67f18b9841285576d59bb799b838a5de4f852ad9e054674a1/pyzstd-0.16.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7c084dde218ffbf112e507e72cbf626b8f58ce9eb23eec129809e31037984662", size = 472043 }, + { url = "https://files.pythonhosted.org/packages/0d/24/14c8948b9d16d399ff80504bc404bb091b0eb5339f6fbdad0481da751c09/pyzstd-0.16.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4646459ebd3d7a59ddbe9312f020bcf7cdd1f059a2ea07051258f7af87a0b31", size = 415258 }, + { url = "https://files.pythonhosted.org/packages/6b/3e/e4c7f449af9d19975ff5d333a58330317cf8b05fe4754106c694a29e7c25/pyzstd-0.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14bfc2833cc16d7657fc93259edeeaa793286e5031b86ca5dc861ba49b435fce", size = 413680 }, + { url = "https://files.pythonhosted.org/packages/10/09/8918853028cf593c141456b9a42d68420beec3f16a8cc4f1aa5d0b8b0c84/pyzstd-0.16.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f27d488f19e5bf27d1e8aa1ae72c6c0a910f1e1ffbdf3c763d02ab781295dd27", size = 412630 }, + { url = "https://files.pythonhosted.org/packages/47/20/5a4c899530571e0e8ecdcb9dc7e3fc38491d4b342fbd7d8413805c88013b/pyzstd-0.16.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:91e134ca968ff7dcfa8b7d433318f01d309b74ee87e0d2bcadc117c08e1c80db", size = 404980 }, + { url = "https://files.pythonhosted.org/packages/0a/1d/aeeeebb702d3500a01b5b1029ba1716aea3afa75e8aacb904806b3f1afe5/pyzstd-0.16.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:6b5f64cd3963c58b8f886eb6139bb8d164b42a74f8a1bb95d49b4804f4592d61", size = 418000 }, + { url = "https://files.pythonhosted.org/packages/fc/0c/66ca36d24ad97af40a8fe8de9e3f316a5f4fd2fb3cab8634a2f7da5571c8/pyzstd-0.16.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:0b4a8266871b9e0407f9fd8e8d077c3558cf124d174e6357b523d14f76971009", size = 485576 }, + { url = "https://files.pythonhosted.org/packages/39/66/6c1de1347de94aa85f60e854cccae0948bda2eda2351e4d47c8bb0a7cf18/pyzstd-0.16.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:1bb19f7acac30727354c25125922aa59f44d82e0e6a751df17d0d93ff6a73853", size = 564542 }, + { url = "https://files.pythonhosted.org/packages/6d/46/75365a3ab279d58e69d410ce0a21527e689fa651837227e23dee294d096f/pyzstd-0.16.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3008325b7368e794d66d4d98f2ee1d867ef5afd09fd388646ae02b25343c420d", size = 430619 }, + { url = "https://files.pythonhosted.org/packages/0d/62/17bf81d42acbd39bffdea559b6fbd7ec331cd74bc52f249e536fefe5480d/pyzstd-0.16.2-cp310-cp310-win32.whl", hash = "sha256:66f2d5c0bbf5bf32c577aa006197b3525b80b59804450e2c32fbcc2d16e850fd", size = 218224 }, + { url = "https://files.pythonhosted.org/packages/f7/b6/281245890df08a567186c6e262c43d68581291cca107c8d7304c37708e46/pyzstd-0.16.2-cp310-cp310-win_amd64.whl", hash = "sha256:5fe5f5459ebe1161095baa7a86d04ab625b35148f6c425df0347ed6c90a2fd58", size = 245012 }, + { url = "https://files.pythonhosted.org/packages/10/5a/19d7aec81853f6dc53eabad388227e3beecfaca4788af23b8807a0ea2112/pyzstd-0.16.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1c1bdbe7f01c7f37d5cd07be70e32a84010d7dfd6677920c0de04cf7d245b60d", size = 372192 }, + { url = "https://files.pythonhosted.org/packages/29/35/2eb025e6a0fff49b5de8bea20e82e4d7d5456e634bf3809123fbe5e5f194/pyzstd-0.16.2-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:1882a3ceaaf9adc12212d587d150ec5e58cfa9a765463d803d739abbd3ac0f7a", size = 295084 }, + { url = "https://files.pythonhosted.org/packages/04/1f/03785d7ff1ce73b9347533f798cb27afa57768e66012f97b18b7b7303158/pyzstd-0.16.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea46a8b9d60f6a6eba29facba54c0f0d70328586f7ef0da6f57edf7e43db0303", size = 390167 }, + { url = "https://files.pythonhosted.org/packages/b7/59/e307622115a2df30075efbd28933dc0ad8f2007c5ba5a3eb49c956de3d56/pyzstd-0.16.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d7865bc06589cdcecdede0deefe3da07809d5b7ad9044c224d7b2a0867256957", size = 472038 }, + { url = "https://files.pythonhosted.org/packages/97/21/870fda5454240089e9c37625320580d392b03beaeae4889c67c0a21c4d34/pyzstd-0.16.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:52f938a65b409c02eb825e8c77fc5ea54508b8fc44b5ce226db03011691ae8cc", size = 415217 }, + { url = "https://files.pythonhosted.org/packages/3c/35/b33faeeb9c96fddd08bf7871c9f5c4638c32ad79227155922fd4a63190c5/pyzstd-0.16.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e97620d3f53a0282947304189deef7ca7f7d0d6dfe15033469dc1c33e779d5e5", size = 413714 }, + { url = "https://files.pythonhosted.org/packages/aa/a3/b9058dd43eb52025a2ca78946dcb9ef9d8984acac172a698bcf12712217c/pyzstd-0.16.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7c40e9983d017108670dc8df68ceef14c7c1cf2d19239213274783041d0e64c", size = 412568 }, + { url = "https://files.pythonhosted.org/packages/12/31/fe7d462c912f2040775bfa2af4327f9fcebb16e8fa9c3bfa058bc1306722/pyzstd-0.16.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7cd4b3b2c6161066e4bde6af1cf78ed3acf5d731884dd13fdf31f1db10830080", size = 404988 }, + { url = "https://files.pythonhosted.org/packages/48/4c/582aca0e5210436499bce1639a8d15da3f76f8d5827da1aa3eeb2c4e271c/pyzstd-0.16.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:454f31fd84175bb203c8c424f2255a343fa9bd103461a38d1bf50487c3b89508", size = 417961 }, + { url = "https://files.pythonhosted.org/packages/39/e9/54f53641ff10b4ea18d3ba159b03bd07e6ae5a5b7ae01f1329b0c35b8ca2/pyzstd-0.16.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:5ef754a93743f08fb0386ce3596780bfba829311b49c8f4107af1a4bcc16935d", size = 485587 }, + { url = "https://files.pythonhosted.org/packages/ce/65/25243b3fea9e52a20bfece1b12e3d3ee3125f17b1735aab08cb9a7a760b4/pyzstd-0.16.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:be81081db9166e10846934f0e3576a263cbe18d81eca06e6a5c23533f8ce0dc6", size = 564543 }, + { url = "https://files.pythonhosted.org/packages/3b/3c/324b8ddca55b4b073b413cea3e0587af3c8153ccf7d6d63ed294831f2095/pyzstd-0.16.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:738bcb2fa1e5f1868986f5030955e64de53157fa1141d01f3a4daf07a1aaf644", size = 430628 }, + { url = "https://files.pythonhosted.org/packages/db/a1/aca18925e23bceb833fc742ebaf87aa9d1ba8b178f0332bd108fc8966482/pyzstd-0.16.2-cp311-cp311-win32.whl", hash = "sha256:0ea214c9b97046867d1657d55979021028d583704b30c481a9c165191b08d707", size = 218215 }, + { url = "https://files.pythonhosted.org/packages/c0/7f/0f5d1d1891e6c6e14d846d2881a06ab7e5e97cabeb5e1e9e53debec4091a/pyzstd-0.16.2-cp311-cp311-win_amd64.whl", hash = "sha256:c17c0fc02f0e75b0c7cd21f8eaf4c6ce4112333b447d93da1773a5f705b2c178", size = 245055 }, + { url = 
"https://files.pythonhosted.org/packages/28/15/20046759d138733e7150afa6aa15f322022d7587968e2dbd5b36fbf8aa86/pyzstd-0.16.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d4081fd841a9efe9ded7290ee7502dbf042c4158b90edfadea3b8a072c8ec4e1", size = 373230 }, + { url = "https://files.pythonhosted.org/packages/51/8d/55b536edaecf19d2f8dbd8fbaefd184f2f9cc6b71d241caa6d86bed96813/pyzstd-0.16.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fd3fa45d2aeb65367dd702806b2e779d13f1a3fa2d13d5ec777cfd09de6822de", size = 295699 }, + { url = "https://files.pythonhosted.org/packages/11/14/086e7f690154c6f3d9bdb46da26a4cd3c9e0b284346ce10943711ca48c32/pyzstd-0.16.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8b5f0d2c07994a5180d8259d51df6227a57098774bb0618423d7eb4a7303467", size = 390556 }, + { url = "https://files.pythonhosted.org/packages/90/d2/c6d854705d6fa0ad876209b4ba796ab31d85b710d1459029f2cb41085a8d/pyzstd-0.16.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:60c9d25b15c7ae06ed5d516d096a0d8254f9bed4368b370a09cccf191eaab5cb", size = 472928 }, + { url = "https://files.pythonhosted.org/packages/aa/38/f97dd871e446adc834349caa605dbaf5bac86763a255f62c809cc2459c85/pyzstd-0.16.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:29acf31ce37254f6cad08deb24b9d9ba954f426fa08f8fae4ab4fdc51a03f4ae", size = 416057 }, + { url = "https://files.pythonhosted.org/packages/53/be/0c5ad7bf29dc890f6a3303760b9802aeeafa4e3ffb598de625f501986bfe/pyzstd-0.16.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ec77612a17697a9f7cf6634ffcee616eba9b997712fdd896e77fd19ab3a0618", size = 414613 }, + { url = "https://files.pythonhosted.org/packages/1f/1a/d3a1edcd59e2f62a35ac6257d2b86a2c872ae9a8e925380620a8db0d9a9a/pyzstd-0.16.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:313ea4974be93be12c9a640ab40f0fc50a023178aae004a8901507b74f190173", size = 413236 }, + { url = "https://files.pythonhosted.org/packages/f2/8d/912430c2310466c14a89a5a529b72eddef7e73fa733806dbe0b030cf3495/pyzstd-0.16.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e91acdefc8c2c6c3b8d5b1b5fe837dce4e591ecb7c0a2a50186f552e57d11203", size = 405536 }, + { url = "https://files.pythonhosted.org/packages/9e/83/4edb419a13b9d1e1debc01e88084eba93a5f7c10ef198da11f6782857c73/pyzstd-0.16.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:929bd91a403539e72b5b5cb97f725ac4acafe692ccf52f075e20cd9bf6e5493d", size = 419145 }, + { url = "https://files.pythonhosted.org/packages/8f/e9/62a169eddc37aefac480ee3b3318c221f6731e1e342dafd9e05b7fdaa7c5/pyzstd-0.16.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:740837a379aa32d110911ebcbbc524f9a9b145355737527543a884bd8777ca4f", size = 487157 }, + { url = "https://files.pythonhosted.org/packages/57/9d/5949f2a0144d1f99fab7914f854b582d2784c73139cc190e603e4d6b7b37/pyzstd-0.16.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:adfc0e80dd157e6d1e0b0112c8ecc4b58a7a23760bd9623d74122ef637cfbdb6", size = 565918 }, + { url = "https://files.pythonhosted.org/packages/de/ce/647b9c7602ac477c9e62cf9399810f72bb5dba8f508e7cdf8be1d260e6f9/pyzstd-0.16.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:79b183beae1c080ad3dca39019e49b7785391947f9aab68893ad85d27828c6e7", size = 431373 }, + { url = "https://files.pythonhosted.org/packages/8b/fb/4141e3d4549eea26e5a59ec723eade271980816cb2ed7613df855baa672f/pyzstd-0.16.2-cp312-cp312-win32.whl", hash = 
"sha256:b8d00631a3c466bc313847fab2a01f6b73b3165de0886fb03210e08567ae3a89", size = 218541 }, + { url = "https://files.pythonhosted.org/packages/51/b9/e1373b179129c2095d70bd1df02a51d388f4c7e4ecb62acb4e5e9570269b/pyzstd-0.16.2-cp312-cp312-win_amd64.whl", hash = "sha256:c0d43764e9a60607f35d8cb3e60df772a678935ab0e02e2804d4147377f4942c", size = 245320 }, + { url = "https://files.pythonhosted.org/packages/66/10/cc7c764c7673f1af1728abdcf58e58f88ef5d44ab4500677a2b7b4c01e7d/pyzstd-0.16.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3ae9ae7ad730562810912d7ecaf1fff5eaf4c726f4b4dfe04784ed5f06d7b91f", size = 373223 }, + { url = "https://files.pythonhosted.org/packages/3f/a7/bcaf7d635ee929dd4d08ae1c35101892db56a11542471eecfbf46b9dd988/pyzstd-0.16.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2ce8d3c213f76a564420f3d0137066ac007ce9fb4e156b989835caef12b367a7", size = 295701 }, + { url = "https://files.pythonhosted.org/packages/93/49/a604113a2f3135b29371a894c0faad22d7ea3f7b58f38d77baad8a817483/pyzstd-0.16.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c2c14dac23c865e2d78cebd9087e148674b7154f633afd4709b4cd1520b99a61", size = 392395 }, + { url = "https://files.pythonhosted.org/packages/b0/38/886ecf3ebb13a4b6e3ee85f448f54eef37a5ae2b453bd9d5d9edc909e119/pyzstd-0.16.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4527969d66a943e36ef374eda847e918077de032d58b5df84d98ffd717b6fa77", size = 474523 }, + { url = "https://files.pythonhosted.org/packages/14/98/121da6ac072c00090c218b4888ef00ead15979f09a657d9a5ff770d6bb17/pyzstd-0.16.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd8256149b88e657e99f31e6d4b114c8ff2935951f1d8bb8e1fe501b224999c0", size = 417974 }, + { url = "https://files.pythonhosted.org/packages/b6/ba/56652a67c0bcfaceb2945e5f07d5aa21af86e07cf33d1ae47bb3529a56c3/pyzstd-0.16.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5bd1f1822d65c9054bf36d35307bf8ed4aa2d2d6827431761a813628ff671b1d", size = 414587 }, + { url = "https://files.pythonhosted.org/packages/cc/30/cab6f45101f0113ced609ef65482aedd276e0f022d9f25a327d4284142f5/pyzstd-0.16.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f6733f4d373ec9ad2c1976cf06f973a3324c1f9abe236d114d6bb91165a397d", size = 415071 }, + { url = "https://files.pythonhosted.org/packages/6d/44/2187fc8a46662926943aeb16d639dd4f3d06267c7e8abb2c6f97700ab11c/pyzstd-0.16.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7bec165ab6524663f00b69bfefd13a46a69fed3015754abaf81b103ec73d92c6", size = 407835 }, + { url = "https://files.pythonhosted.org/packages/de/d5/6edca97d5453cba820d2ad5630e6ec1fcfad66f69af5ad7d6c688ea301be/pyzstd-0.16.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:e4460fa6949aac6528a1ad0de8871079600b12b3ef4db49316306786a3598321", size = 421755 }, + { url = "https://files.pythonhosted.org/packages/54/c1/1a0339e014ed97f4e6fd9166b0409ceda8f32e28e8ecda70fd7bb0915566/pyzstd-0.16.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:75df79ea0315c97d88337953a17daa44023dbf6389f8151903d371513f503e3c", size = 489174 }, + { url = "https://files.pythonhosted.org/packages/07/01/c65f2c9f0b902b33efcb0bdf3cbd07fc828fda6ff6333189eb71cf7acc60/pyzstd-0.16.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:93e1d45f4a196afb6f18682c79bdd5399277ead105b67f30b35c04c207966071", size = 573025 }, + { url = 
"https://files.pythonhosted.org/packages/a7/54/7ab9cc54171b7f8bb97cfd1c1aa7fcb706a4babeb629732529d8111bc4e6/pyzstd-0.16.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:075e18b871f38a503b5d23e40a661adfc750bd4bd0bb8b208c1e290f3ceb8fa2", size = 429582 }, + { url = "https://files.pythonhosted.org/packages/6c/a5/f9c950bb378dd1335bc4cc56444ec2ab40b1dab085c5798c5d16a9bf9d0b/pyzstd-0.16.2-cp313-cp313-win32.whl", hash = "sha256:9e4295eb299f8d87e3487852bca033d30332033272a801ca8130e934475e07a9", size = 218544 }, + { url = "https://files.pythonhosted.org/packages/9a/df/a15b9a8a59cd9908ae2b70bce2cb4ac3e2d7da11414ee0d0ceb46e4d0439/pyzstd-0.16.2-cp313-cp313-win_amd64.whl", hash = "sha256:18deedc70f858f4cf574e59f305d2a0678e54db2751a33dba9f481f91bc71c28", size = 245313 }, + { url = "https://files.pythonhosted.org/packages/f9/ad/c09fb722c12a82b826c97efc50a919e229bfbaf644f5a140adcd71941473/pyzstd-0.16.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4b631117b97a42ff6dfd0ffc885a92fff462d7c34766b28383c57b996f863338", size = 364187 }, + { url = "https://files.pythonhosted.org/packages/57/f9/93175fe72f85fb675fe04abca296fe583112a25d0ec7faa026288d9463c2/pyzstd-0.16.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:56493a3fbe1b651a02102dd0902b0aa2377a732ff3544fb6fb3f114ca18db52f", size = 279825 }, + { url = "https://files.pythonhosted.org/packages/8a/de/0b40acf76d7ed1f7975877535e004de85ec2e869632754b5d4d389258b8a/pyzstd-0.16.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1eae9bdba4a1e5d3181331f403114ff5b8ce0f4b569f48eba2b9beb2deef1e4", size = 321313 }, + { url = "https://files.pythonhosted.org/packages/41/5e/00102bacd1a7c957c88098f3ae2cdac17842ac0f94d2e685ff5b75a05730/pyzstd-0.16.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1be6972391c8aeecc7e61feb96ffc8e77a401bcba6ed994e7171330c45a1948", size = 344376 }, + { url = "https://files.pythonhosted.org/packages/a3/95/27a7da3dbd4460cd9432bdc22d9d5f8ec77c86275d069020fa74ea280f7f/pyzstd-0.16.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:761439d687e3a5687c2ff5c6a1190e1601362a4a3e8c6c82ff89719d51d73e19", size = 328591 }, + { url = "https://files.pythonhosted.org/packages/c2/03/8f4d5fd45f6bfad66d67cdf583492a9f52a21049f60e6b36a7e9f8aa7adc/pyzstd-0.16.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f5fbdb8cf31b60b2dc586fecb9b73e2f172c21a0b320ed275f7b8d8a866d9003", size = 240786 }, +] + [[package]] name = "qtconsole" version = "5.6.0" @@ -4570,6 +5448,67 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3c/4a/b221409913760d26cf4498b7b1741d510c82d3ad38381984a3ddc135ec66/s3transfer-0.10.2-py3-none-any.whl", hash = "sha256:eca1c20de70a39daee580aef4986996620f365c4e0fda6a86100231d62f1bf69", size = 82716 }, ] +[[package]] +name = "safetensors" +version = "0.4.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cb/46/a1c56ed856c6ac3b1a8b37abe5be0cac53219367af1331e721b04d122577/safetensors-0.4.5.tar.gz", hash = "sha256:d73de19682deabb02524b3d5d1f8b3aaba94c72f1bbfc7911b9b9d5d391c0310", size = 65702 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/10/0798ec2c8704c2d172620d8a3725bed92cdd75516357b1a3e64d4229ea4e/safetensors-0.4.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:a63eaccd22243c67e4f2b1c3e258b257effc4acd78f3b9d397edc8cf8f1298a7", size = 392312 }, + { url = 
"https://files.pythonhosted.org/packages/2b/9e/9648d8dbb485c40a4a0212b7537626ae440b48156cc74601ca0b7a7615e0/safetensors-0.4.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:23fc9b4ec7b602915cbb4ec1a7c1ad96d2743c322f20ab709e2c35d1b66dad27", size = 381858 }, + { url = "https://files.pythonhosted.org/packages/8b/67/49556aeacc00df353767ed31d68b492fecf38c3f664c52692e4d92aa0032/safetensors-0.4.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6885016f34bef80ea1085b7e99b3c1f92cb1be78a49839203060f67b40aee761", size = 441382 }, + { url = "https://files.pythonhosted.org/packages/5d/ce/e9f4869a37bb11229e6cdb4e73a6ef23b4f360eee9dca5f7e40982779704/safetensors-0.4.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:133620f443450429322f238fda74d512c4008621227fccf2f8cf4a76206fea7c", size = 439001 }, + { url = "https://files.pythonhosted.org/packages/a0/27/aee8cf031b89c34caf83194ec6b7f2eed28d053fff8b6da6d00c85c56035/safetensors-0.4.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4fb3e0609ec12d2a77e882f07cced530b8262027f64b75d399f1504ffec0ba56", size = 478026 }, + { url = "https://files.pythonhosted.org/packages/da/33/1d9fc4805c623636e7d460f28eec92ebd1856f7a552df8eb78398a1ef4de/safetensors-0.4.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d0f1dd769f064adc33831f5e97ad07babbd728427f98e3e1db6902e369122737", size = 495545 }, + { url = "https://files.pythonhosted.org/packages/b9/df/6f766b56690709d22e83836e4067a1109a7d84ea152a6deb5692743a2805/safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6d156bdb26732feada84f9388a9f135528c1ef5b05fae153da365ad4319c4c5", size = 435016 }, + { url = "https://files.pythonhosted.org/packages/90/fa/7bc3f18086201b1e55a42c88b822ae197d0158e12c54cd45c887305f1b7e/safetensors-0.4.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9e347d77e2c77eb7624400ccd09bed69d35c0332f417ce8c048d404a096c593b", size = 456273 }, + { url = "https://files.pythonhosted.org/packages/3e/59/2ae50150d37a65c1c5f01aec74dc737707b8bbecdc76307e5a1a12c8a376/safetensors-0.4.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9f556eea3aec1d3d955403159fe2123ddd68e880f83954ee9b4a3f2e15e716b6", size = 619669 }, + { url = "https://files.pythonhosted.org/packages/fe/43/10f0bb597aef62c9c154152e265057089f3c729bdd980e6c32c3ec2407a4/safetensors-0.4.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9483f42be3b6bc8ff77dd67302de8ae411c4db39f7224dec66b0eb95822e4163", size = 605212 }, + { url = "https://files.pythonhosted.org/packages/7c/75/ede6887ea0ceaba55730988bfc7668dc147a8758f907fa6db26fbb681b8e/safetensors-0.4.5-cp310-none-win32.whl", hash = "sha256:7389129c03fadd1ccc37fd1ebbc773f2b031483b04700923c3511d2a939252cc", size = 272652 }, + { url = "https://files.pythonhosted.org/packages/ba/f0/919c72a9eef843781e652d0650f2819039943e69b69d5af2d0451a23edc3/safetensors-0.4.5-cp310-none-win_amd64.whl", hash = "sha256:e98ef5524f8b6620c8cdef97220c0b6a5c1cef69852fcd2f174bb96c2bb316b1", size = 285879 }, + { url = "https://files.pythonhosted.org/packages/9a/a5/25bcf75e373412daf1fd88045ab3aa8140a0d804ef0e70712c4f2c5b94d8/safetensors-0.4.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:21f848d7aebd5954f92538552d6d75f7c1b4500f51664078b5b49720d180e47c", size = 392256 }, + { url = "https://files.pythonhosted.org/packages/08/8c/ece3bf8756506a890bd980eca02f47f9d98dfbf5ce16eda1368f53560f67/safetensors-0.4.5-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:bb07000b19d41e35eecef9a454f31a8b4718a185293f0d0b1c4b61d6e4487971", size = 381490 }, + { url = "https://files.pythonhosted.org/packages/39/83/c4a7ce01d626e46ea2b45887f2e59b16441408031e2ce2f9fe01860c6946/safetensors-0.4.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09dedf7c2fda934ee68143202acff6e9e8eb0ddeeb4cfc24182bef999efa9f42", size = 441093 }, + { url = "https://files.pythonhosted.org/packages/47/26/cc52de647e71bd9a0b0d78ead0d31d9c462b35550a817aa9e0cab51d6db4/safetensors-0.4.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:59b77e4b7a708988d84f26de3ebead61ef1659c73dcbc9946c18f3b1786d2688", size = 438960 }, + { url = "https://files.pythonhosted.org/packages/06/78/332538546775ee97e749867df2d58f2282d9c48a1681e4891eed8b94ec94/safetensors-0.4.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d3bc83e14d67adc2e9387e511097f254bd1b43c3020440e708858c684cbac68", size = 478031 }, + { url = "https://files.pythonhosted.org/packages/d9/03/a3c8663f1ddda54e624ecf43fce651659b49e8e1603c52c3e464b442acfa/safetensors-0.4.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39371fc551c1072976073ab258c3119395294cf49cdc1f8476794627de3130df", size = 494754 }, + { url = "https://files.pythonhosted.org/packages/e6/ee/69e498a892f208bd1da4104d4b9be887f8611bf4942144718b6738482250/safetensors-0.4.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6c19feda32b931cae0acd42748a670bdf56bee6476a046af20181ad3fee4090", size = 435013 }, + { url = "https://files.pythonhosted.org/packages/a2/61/f0cfce984515b86d1260f556ba3b782158e2855e6a318446ac2613786fa9/safetensors-0.4.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a659467495de201e2f282063808a41170448c78bada1e62707b07a27b05e6943", size = 455984 }, + { url = "https://files.pythonhosted.org/packages/e7/a9/3e3b48fcaade3eb4e347d39ebf0bd44291db21a3e4507854b42a7cb910ac/safetensors-0.4.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bad5e4b2476949bcd638a89f71b6916fa9a5cae5c1ae7eede337aca2100435c0", size = 619513 }, + { url = "https://files.pythonhosted.org/packages/80/23/2a7a1be24258c0e44c1d356896fd63dc0545a98d2d0184925fa09cd3ec76/safetensors-0.4.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a3a315a6d0054bc6889a17f5668a73f94f7fe55121ff59e0a199e3519c08565f", size = 604841 }, + { url = "https://files.pythonhosted.org/packages/b4/5c/34d082ff1fffffd8545fb22cbae3285ab4236f1f0cfc64b7e58261c2363b/safetensors-0.4.5-cp311-none-win32.whl", hash = "sha256:a01e232e6d3d5cf8b1667bc3b657a77bdab73f0743c26c1d3c5dd7ce86bd3a92", size = 272602 }, + { url = "https://files.pythonhosted.org/packages/6d/41/948c96c8a7e9fef57c2e051f1871c108a6dbbc6d285598bdb1d89b98617c/safetensors-0.4.5-cp311-none-win_amd64.whl", hash = "sha256:cbd39cae1ad3e3ef6f63a6f07296b080c951f24cec60188378e43d3713000c04", size = 285973 }, + { url = "https://files.pythonhosted.org/packages/bf/ac/5a63082f931e99200db95fd46fb6734f050bb6e96bf02521904c6518b7aa/safetensors-0.4.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:473300314e026bd1043cef391bb16a8689453363381561b8a3e443870937cc1e", size = 392015 }, + { url = "https://files.pythonhosted.org/packages/73/95/ab32aa6e9bdc832ff87784cdf9da26192b93de3ef82b8d1ada8f345c5044/safetensors-0.4.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:801183a0f76dc647f51a2d9141ad341f9665602a7899a693207a82fb102cc53e", size = 381774 }, + { url = 
"https://files.pythonhosted.org/packages/d6/6c/7e04b7626809fc63f3698f4c50e43aff2864b40089aa4506c918a75b8eed/safetensors-0.4.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1524b54246e422ad6fb6aea1ac71edeeb77666efa67230e1faf6999df9b2e27f", size = 441134 }, + { url = "https://files.pythonhosted.org/packages/58/2b/ffe7c86a277e6c1595fbdf415cfe2903f253f574a5405e93fda8baaa582c/safetensors-0.4.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b3139098e3e8b2ad7afbca96d30ad29157b50c90861084e69fcb80dec7430461", size = 438467 }, + { url = "https://files.pythonhosted.org/packages/67/9c/f271bd804e08c7fda954d17b70ff281228a88077337a9e70feace4f4cc93/safetensors-0.4.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65573dc35be9059770808e276b017256fa30058802c29e1038eb1c00028502ea", size = 476566 }, + { url = "https://files.pythonhosted.org/packages/4c/ad/4cf76a3e430a8a26108407fa6cb93e6f80d996a5cb75d9540c8fe3862990/safetensors-0.4.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fd33da8e9407559f8779c82a0448e2133737f922d71f884da27184549416bfed", size = 492253 }, + { url = "https://files.pythonhosted.org/packages/d9/40/a6f75ea449a9647423ec8b6f72c16998d35aa4b43cb38536ac060c5c7bf5/safetensors-0.4.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3685ce7ed036f916316b567152482b7e959dc754fcc4a8342333d222e05f407c", size = 434769 }, + { url = "https://files.pythonhosted.org/packages/52/47/d4b49b1231abf3131f7bb0bc60ebb94b27ee33e0a1f9569da05f8ac65dee/safetensors-0.4.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dde2bf390d25f67908278d6f5d59e46211ef98e44108727084d4637ee70ab4f1", size = 457166 }, + { url = "https://files.pythonhosted.org/packages/c3/cd/006468b03b0fa42ff82d795d47c4193e99001e96c3f08bd62ef1b5cab586/safetensors-0.4.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7469d70d3de970b1698d47c11ebbf296a308702cbaae7fcb993944751cf985f4", size = 619280 }, + { url = "https://files.pythonhosted.org/packages/22/4d/b6208d918e83daa84b424c0ac3191ae61b44b3191613a3a5a7b38f94b8ad/safetensors-0.4.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3a6ba28118636a130ccbb968bc33d4684c48678695dba2590169d5ab03a45646", size = 605390 }, + { url = "https://files.pythonhosted.org/packages/e8/20/bf0e01825dc01ed75538021a98b9a046e60ead63c6c6700764c821a8c873/safetensors-0.4.5-cp312-none-win32.whl", hash = "sha256:c859c7ed90b0047f58ee27751c8e56951452ed36a67afee1b0a87847d065eec6", size = 273250 }, + { url = "https://files.pythonhosted.org/packages/f1/5f/ab6b6cec85b40789801f35b7d2fb579ae242d8193929974a106d5ff5c835/safetensors-0.4.5-cp312-none-win_amd64.whl", hash = "sha256:b5a8810ad6a6f933fff6c276eae92c1da217b39b4d8b1bc1c0b8af2d270dc532", size = 286307 }, + { url = "https://files.pythonhosted.org/packages/90/61/0e27b1403e311cba0be20026bee4ee822d90eda7dad372179e7f18bb99f3/safetensors-0.4.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:25e5f8e2e92a74f05b4ca55686234c32aac19927903792b30ee6d7bd5653d54e", size = 392062 }, + { url = "https://files.pythonhosted.org/packages/b1/9f/cc31fafc9f5d79da10a83a820ca37f069bab0717895ad8cbcacf629dd1c5/safetensors-0.4.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:81efb124b58af39fcd684254c645e35692fea81c51627259cdf6d67ff4458916", size = 382517 }, + { url = 
"https://files.pythonhosted.org/packages/a4/c7/4fda8a0ebb96662550433378f4a74c677fa5fc4d0a43a7ec287d1df254a9/safetensors-0.4.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:585f1703a518b437f5103aa9cf70e9bd437cb78eea9c51024329e4fb8a3e3679", size = 441378 }, + { url = "https://files.pythonhosted.org/packages/14/31/9abb431f6209de9c80dab83e1112ebd769f1e32e7ab7ab228a02424a4693/safetensors-0.4.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4b99fbf72e3faf0b2f5f16e5e3458b93b7d0a83984fe8d5364c60aa169f2da89", size = 438831 }, + { url = "https://files.pythonhosted.org/packages/37/37/99bfb195578a808b8d045159ee9264f8da58d017ac0701853dcacda14d4e/safetensors-0.4.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b17b299ca9966ca983ecda1c0791a3f07f9ca6ab5ded8ef3d283fff45f6bcd5f", size = 477112 }, + { url = "https://files.pythonhosted.org/packages/7d/05/fac3ef107e60d2a78532bed171a91669d4bb259e1236f5ea8c67a6976c75/safetensors-0.4.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:76ded72f69209c9780fdb23ea89e56d35c54ae6abcdec67ccb22af8e696e449a", size = 493373 }, + { url = "https://files.pythonhosted.org/packages/cf/7a/825800ee8c68214b4fd3506d5e19209338c69b41e01c6e14dd13969cc8b9/safetensors-0.4.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2783956926303dcfeb1de91a4d1204cd4089ab441e622e7caee0642281109db3", size = 435422 }, + { url = "https://files.pythonhosted.org/packages/5e/6c/7a3233c08bde558d6c33a41219119866cb596139a4673cc6c24024710ffd/safetensors-0.4.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d94581aab8c6b204def4d7320f07534d6ee34cd4855688004a4354e63b639a35", size = 457382 }, + { url = "https://files.pythonhosted.org/packages/a0/58/0b7bcba3788ff503990cf9278d611b56c029400612ba93e772c987b5aa03/safetensors-0.4.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:67e1e7cb8678bb1b37ac48ec0df04faf689e2f4e9e81e566b5c63d9f23748523", size = 619301 }, + { url = "https://files.pythonhosted.org/packages/82/cc/9c2cf58611daf1c83ce5d37f9de66353e23fcda36008b13fd3409a760aa3/safetensors-0.4.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:dbd280b07e6054ea68b0cb4b16ad9703e7d63cd6890f577cb98acc5354780142", size = 605580 }, + { url = "https://files.pythonhosted.org/packages/cf/ff/037ae4c0ee32db496669365e66079b6329906c6814722b159aa700e67208/safetensors-0.4.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:fdadf66b5a22ceb645d5435a0be7a0292ce59648ca1d46b352f13cff3ea80410", size = 392951 }, + { url = "https://files.pythonhosted.org/packages/f1/d6/6621e16b35bf83ae099eaab07338f04991a26c9aa43879d05f19f35e149c/safetensors-0.4.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d42ffd4c2259f31832cb17ff866c111684c87bd930892a1ba53fed28370c918c", size = 383417 }, + { url = "https://files.pythonhosted.org/packages/ae/88/3068e1bb16f5e9f9068901de3cf7b3db270b9bfe6e7d51d4b55c1da0425d/safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd8a1f6d2063a92cd04145c7fd9e31a1c7d85fbec20113a14b487563fdbc0597", size = 442311 }, + { url = "https://files.pythonhosted.org/packages/f7/15/a2bb77ebbaa76b61ec2e9f731fe4db7f9473fd855d881957c51b3a168892/safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:951d2fcf1817f4fb0ef0b48f6696688a4e852a95922a042b3f96aaa67eedc920", size = 436678 }, + { url = 
"https://files.pythonhosted.org/packages/ec/79/9608c4546cdbfe3860dd7aa59e3562c9289113398b1a0bd89b68ce0a9d41/safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6ac85d9a8c1af0e3132371d9f2d134695a06a96993c2e2f0bbe25debb9e3f67a", size = 457316 }, + { url = "https://files.pythonhosted.org/packages/0f/23/b17b483f2857835962ad33e38014efd4911791187e177bc23b057d35bee8/safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e3cec4a29eb7fe8da0b1c7988bc3828183080439dd559f720414450de076fcab", size = 620565 }, + { url = "https://files.pythonhosted.org/packages/19/46/5d11dc300feaad285c2f1bd784ff3f689f5e0ab6be49aaf568f3a77019eb/safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:21742b391b859e67b26c0b2ac37f52c9c0944a879a25ad2f9f9f3cd61e7fda8f", size = 606660 }, +] + [[package]] name = "scikit-learn" version = "1.5.2" @@ -4665,6 +5604,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/40/b0/4562db6223154aa4e22f939003cb92514c79f3d4dccca3444253fd17f902/Send2Trash-1.8.3-py3-none-any.whl", hash = "sha256:0c31227e0bd08961c7665474a3d1ef7193929fedda4233843689baa056be46c9", size = 18072 }, ] +[[package]] +name = "sentence-transformers" +version = "3.0.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "scikit-learn" }, + { name = "scipy" }, + { name = "torch" }, + { name = "tqdm" }, + { name = "transformers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/41/fb/2368f84127920d86330b533792e66b26264e92b729b5c1998aaa33d2e22f/sentence_transformers-3.0.1.tar.gz", hash = "sha256:8a3d2c537cc4d1014ccc20ac92be3d6135420a3bc60ae29a3a8a9b4bb35fbff6", size = 177258 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/58/4b/922436953394e1bfda05e4bf1fe0e80f609770f256c59a9df7a9254f3e0d/sentence_transformers-3.0.1-py3-none-any.whl", hash = "sha256:01050cc4053c49b9f5b78f6980b5a72db3fd3a0abb9169b1792ac83875505ee6", size = 227071 }, +] + [[package]] name = "setuptools" version = "74.1.2" @@ -4979,7 +5937,7 @@ wheels = [ [[package]] name = "streamlit" -version = "1.39.0" +version = "1.40.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "altair" }, @@ -5002,9 +5960,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "watchdog", marker = "platform_system != 'Darwin'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d5/21/3740871ad79ee35f442f11bafec5010a3ec1916c7c9eb43ef866da641f31/streamlit-1.39.0.tar.gz", hash = "sha256:fef9de7983c4ee65c08e85607d7ffccb56b00482b1041fa62f90e4815d39df3a", size = 8360694 } +sdist = { url = "https://files.pythonhosted.org/packages/80/70/b76a32201b04a5a2a1c667fe7327cb7a3cc25d726d3863ba5863d2b0dccf/streamlit-1.40.1.tar.gz", hash = "sha256:1f2b09f04b6ad366a2c7b4d48104697d1c8bc33f48bdf7ed939cc04c12d3aec6", size = 8266452 } wheels = [ - { url = "https://files.pythonhosted.org/packages/ef/e1/f9c479f9dbe0bb702ea5ca6608f10e91a708b438f7fb4572a2642718c6e3/streamlit-1.39.0-py2.py3-none-any.whl", hash = "sha256:a359fc54ed568b35b055ff1d453c320735539ad12e264365a36458aef55a5fba", size = 8741335 }, + { url = "https://files.pythonhosted.org/packages/9a/14/857d0734989f3d26f2f965b2e3f67568ea7a6e8a60cb9c1ed7f774b6d606/streamlit-1.40.1-py2.py3-none-any.whl", hash = "sha256:b9d7a317a0cc88edd7857c7e07dde9cf95647d3ae51cbfa8a3db82fbb8a2990d", size = 8645398 }, ] [[package]] @@ -5201,6 +6159,18 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/bf/65/813fc133609ebcb1299be6a42e5aea99d6344afb35ccb43f67e7daaa3b92/structlog-24.4.0-py3-none-any.whl", hash = "sha256:597f61e80a91cc0749a9fd2a098ed76715a1c8a01f73e336b746504d1aad7610", size = 67180 }, ] +[[package]] +name = "sympy" +version = "1.13.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mpmath" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ca/99/5a5b6f19ff9f083671ddf7b9632028436167cd3d33e11015754e41b249a4/sympy-1.13.1.tar.gz", hash = "sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f", size = 7533040 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b2/fe/81695a1aa331a842b582453b605175f419fe8540355886031328089d840a/sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8", size = 6189177 }, +] + [[package]] name = "tabulate" version = "0.9.0" @@ -5242,6 +6212,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a6/a5/c0b6468d3824fe3fde30dbb5e1f687b291608f9473681bbf7dabbf5a87d7/text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8", size = 78154 }, ] +[[package]] +name = "texttable" +version = "1.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1c/dc/0aff23d6036a4d3bf4f1d8c8204c5c79c4437e25e0ae94ffe4bbb55ee3c2/texttable-1.7.0.tar.gz", hash = "sha256:2d2068fb55115807d3ac77a4ca68fa48803e84ebb0ee2340f858107a36522638", size = 12831 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/24/99/4772b8e00a136f3e01236de33b0efda31ee7077203ba5967fcc76da94d65/texttable-1.7.0-py2.py3-none-any.whl", hash = "sha256:72227d592c82b3d7f672731ae73e4d1f88cd8e2ef5b075a7a7f01a23a3743917", size = 10768 }, +] + [[package]] name = "threadpoolctl" version = "3.5.0" @@ -5296,6 +6275,70 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2c/4d/0db5b8a613d2a59bbc29bc5bb44a2f8070eb9ceab11c50d477502a8a0092/tinycss2-1.3.0-py3-none-any.whl", hash = "sha256:54a8dbdffb334d536851be0226030e9505965bb2f30f21a4a82c55fb2a80fae7", size = 22532 }, ] +[[package]] +name = "tokenizers" +version = "0.15.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c0/44/625db94e91c6196b6574359fa70bfe28e8eabf57a1b894f8f0ec69727fd1/tokenizers-0.15.2.tar.gz", hash = "sha256:e6e9c6e019dd5484be5beafc775ae6c925f4c69a3487040ed09b45e13df2cb91", size = 320256 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/6e/489419d98730b3d02381f10a8b97c5bf55b45742d1b347cdd0ffe267b827/tokenizers-0.15.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:52f6130c9cbf70544287575a985bf44ae1bda2da7e8c24e97716080593638012", size = 2578411 }, + { url = "https://files.pythonhosted.org/packages/01/04/45d88b8bddc09bf56ae1631721393255b75798af515c65c26389713a2072/tokenizers-0.15.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:054c1cc9c6d68f7ffa4e810b3d5131e0ba511b6e4be34157aa08ee54c2f8d9ee", size = 2412452 }, + { url = "https://files.pythonhosted.org/packages/cc/bf/819bf4445ed68ffaf73b0f6245bcbd21a5cd58e86dabbef315a6d0b707b3/tokenizers-0.15.2-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a9b9b070fdad06e347563b88c278995735292ded1132f8657084989a4c84a6d5", size = 3643451 }, + { url = 
"https://files.pythonhosted.org/packages/31/b3/70d3fe0ad25e065322cd902624cad4ff2647484fe823360f58af6927b48c/tokenizers-0.15.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea621a7eef4b70e1f7a4e84dd989ae3f0eeb50fc8690254eacc08acb623e82f1", size = 3534105 }, + { url = "https://files.pythonhosted.org/packages/5f/1b/58e77f2b57651e8c1b4f1b7144a1250509f2e7a1f55073d12620968ae4bb/tokenizers-0.15.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cf7fd9a5141634fa3aa8d6b7be362e6ae1b4cda60da81388fa533e0b552c98fd", size = 3398034 }, + { url = "https://files.pythonhosted.org/packages/dc/d5/45dd421f45b3c1a446ffd9486cef29ed568b5978f66a1803fa46a44aa9be/tokenizers-0.15.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44f2a832cd0825295f7179eaf173381dc45230f9227ec4b44378322d900447c9", size = 3926740 }, + { url = "https://files.pythonhosted.org/packages/fa/6b/6b757cf6f7c30009a6759d3f7b833d974b3cd50d24d5824c695e077cb1bf/tokenizers-0.15.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8b9ec69247a23747669ec4b0ca10f8e3dfb3545d550258129bd62291aabe8605", size = 4032027 }, + { url = "https://files.pythonhosted.org/packages/1c/5d/cf5e122ce4f1a29f165b2a69dc33d1ff30bce303343d58a54775ddba5d51/tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40b6a4c78da863ff26dbd5ad9a8ecc33d8a8d97b535172601cf00aee9d7ce9ce", size = 3577319 }, + { url = "https://files.pythonhosted.org/packages/aa/0b/dd9e5124fe73a01f36f5c7554ac97b9612af5e0bd401d6a606a3f52a060a/tokenizers-0.15.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5ab2a4d21dcf76af60e05af8063138849eb1d6553a0d059f6534357bce8ba364", size = 9682328 }, + { url = "https://files.pythonhosted.org/packages/6a/0c/3435e3d54f825d4fa363a7ab2680b243314377eb2ed28e87ade70b861e7b/tokenizers-0.15.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a47acfac7e511f6bbfcf2d3fb8c26979c780a91e06fb5b9a43831b2c0153d024", size = 9995619 }, + { url = "https://files.pythonhosted.org/packages/ab/bf/a804747020f1b221131b74b5f29c24b47a5d2cee4b1311ce394ca9ce242a/tokenizers-0.15.2-cp310-none-win32.whl", hash = "sha256:064ff87bb6acdbd693666de9a4b692add41308a2c0ec0770d6385737117215f2", size = 2013446 }, + { url = "https://files.pythonhosted.org/packages/c9/87/0bf37626c5f1ea2462e0398be88c287f3d40c696c255ba478bf525bdc852/tokenizers-0.15.2-cp310-none-win_amd64.whl", hash = "sha256:3b919afe4df7eb6ac7cafd2bd14fb507d3f408db7a68c43117f579c984a73843", size = 2192649 }, + { url = "https://files.pythonhosted.org/packages/73/11/933d68d395f5486d935e1c15da80bc96bf3f48595652069d19e0e9894386/tokenizers-0.15.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:89cd1cb93e4b12ff39bb2d626ad77e35209de9309a71e4d3d4672667b4b256e7", size = 2578922 }, + { url = "https://files.pythonhosted.org/packages/5f/4f/a4c12cc058a899c1caaa1e689c3df9a698e20e891d4005aa6ec2174a9339/tokenizers-0.15.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cfed5c64e5be23d7ee0f0e98081a25c2a46b0b77ce99a4f0605b1ec43dd481fa", size = 2412317 }, + { url = "https://files.pythonhosted.org/packages/e9/13/b86ea87b7e3b4a2ca154220dc4eb19a56a3864ec03e9630d15d1bac10da1/tokenizers-0.15.2-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a907d76dcfda37023ba203ab4ceeb21bc5683436ebefbd895a0841fd52f6f6f2", size = 3643051 }, + { url = 
"https://files.pythonhosted.org/packages/0f/23/e4985657ea42ad432d6dc2100b2687e70a6bae730f1f8c52f81d9e6ccf3a/tokenizers-0.15.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20ea60479de6fc7b8ae756b4b097572372d7e4032e2521c1bbf3d90c90a99ff0", size = 3534327 }, + { url = "https://files.pythonhosted.org/packages/34/d5/e1ad46939d6de48d41bbd8b302f87ecde79847855210e75517a832b29490/tokenizers-0.15.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:48e2b9335be2bc0171df9281385c2ed06a15f5cf121c44094338306ab7b33f2c", size = 3398296 }, + { url = "https://files.pythonhosted.org/packages/e7/d1/4d319a035f819af3290ec5a09482ad659d9d2a0aea33890fb5720ce81841/tokenizers-0.15.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:112a1dd436d2cc06e6ffdc0b06d55ac019a35a63afd26475205cb4b1bf0bfbff", size = 3927353 }, + { url = "https://files.pythonhosted.org/packages/e5/39/facfca8e598126a0001d4295e6b1ee670d241aa6f4fcdd97016065b43c5d/tokenizers-0.15.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4620cca5c2817177ee8706f860364cc3a8845bc1e291aaf661fb899e5d1c45b0", size = 4030091 }, + { url = "https://files.pythonhosted.org/packages/15/0b/c09b2c0dc688c82adadaa0d5080983de3ce920f4a5cbadb7eaa5302ad251/tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccd73a82751c523b3fc31ff8194702e4af4db21dc20e55b30ecc2079c5d43cb7", size = 3577167 }, + { url = "https://files.pythonhosted.org/packages/07/3b/d8e60712e509a6f5d01bf0eb4470452b72277be4883656206d4ccd7e02de/tokenizers-0.15.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:107089f135b4ae7817affe6264f8c7a5c5b4fd9a90f9439ed495f54fcea56fb4", size = 9683503 }, + { url = "https://files.pythonhosted.org/packages/c0/61/1c26c8e54af9bab32743e0484601a60738f33797f91040da2a4104f07e70/tokenizers-0.15.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0ff110ecc57b7aa4a594396525a3451ad70988e517237fe91c540997c4e50e29", size = 9996038 }, + { url = "https://files.pythonhosted.org/packages/d1/54/451e96d8514b1afbef955f7420e1180e015c3f4eb085ad38189c0e83ee87/tokenizers-0.15.2-cp311-none-win32.whl", hash = "sha256:6d76f00f5c32da36c61f41c58346a4fa7f0a61be02f4301fd30ad59834977cc3", size = 2013591 }, + { url = "https://files.pythonhosted.org/packages/c1/02/40725eebedea8175918bd59ab80b2174d6ef3b3ef9ac8ec996e84c38d3ca/tokenizers-0.15.2-cp311-none-win_amd64.whl", hash = "sha256:cc90102ed17271cf0a1262babe5939e0134b3890345d11a19c3145184b706055", size = 2192797 }, + { url = "https://files.pythonhosted.org/packages/ae/ca/ea4b5aa70d4d26f2d05620c265b07b5a249157767c1673f5753b8bfc7db1/tokenizers-0.15.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f86593c18d2e6248e72fb91c77d413a815153b8ea4e31f7cd443bdf28e467670", size = 2574444 }, + { url = "https://files.pythonhosted.org/packages/f9/99/5a55a9b6e2db274c0969ad57d989d02efae90f9e558983a561c9b2b7ea1a/tokenizers-0.15.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0774bccc6608eca23eb9d620196687c8b2360624619623cf4ba9dc9bd53e8b51", size = 2411608 }, + { url = "https://files.pythonhosted.org/packages/82/cc/29bb3a25c06b90ce82bb20ef074011481de5c44413a1e1eb10cfd93080fb/tokenizers-0.15.2-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d0222c5b7c9b26c0b4822a82f6a7011de0a9d3060e1da176f66274b70f846b98", size = 3652367 }, + { url = 
"https://files.pythonhosted.org/packages/c0/ae/f6a974be9b2e1615f3de3cc9e4fc2897a86357400801c58143c67cbbad2e/tokenizers-0.15.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3835738be1de66624fff2f4f6f6684775da4e9c00bde053be7564cbf3545cc66", size = 3529509 }, + { url = "https://files.pythonhosted.org/packages/d6/42/340b91f675b494c4ecc0a256c5dd88b4003dbfde05afff90b970738fdfb4/tokenizers-0.15.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0143e7d9dcd811855c1ce1ab9bf5d96d29bf5e528fd6c7824d0465741e8c10fd", size = 3396516 }, + { url = "https://files.pythonhosted.org/packages/6f/b2/8a965abc17fff309eb06e98ce429a19a5e04f731a669a6113b9e182f8a79/tokenizers-0.15.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db35825f6d54215f6b6009a7ff3eedee0848c99a6271c870d2826fbbedf31a38", size = 3918811 }, + { url = "https://files.pythonhosted.org/packages/6c/16/dad7b4aa6e34a395aef7ae7b010d8b5ebefdf3df81510de53d7f17d2f0fc/tokenizers-0.15.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3f5e64b0389a2be47091d8cc53c87859783b837ea1a06edd9d8e04004df55a5c", size = 4025494 }, + { url = "https://files.pythonhosted.org/packages/f6/de/3707df0c1d7bf55e6a4dba724700353bfee8e292fdd8ccfe93416549124d/tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e0480c452217edd35eca56fafe2029fb4d368b7c0475f8dfa3c5c9c400a7456", size = 3575314 }, + { url = "https://files.pythonhosted.org/packages/2e/dd/7b8da304d152bb46f13bc2ba5bd545480ab6ce39d94a53eef07f7624d235/tokenizers-0.15.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a33ab881c8fe70474980577e033d0bc9a27b7ab8272896e500708b212995d834", size = 9682779 }, + { url = "https://files.pythonhosted.org/packages/07/aa/66e8a81e07a791ca6ee9d74ee6de1ffbcd3985149f13aeb530bd409baba0/tokenizers-0.15.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a308a607ca9de2c64c1b9ba79ec9a403969715a1b8ba5f998a676826f1a7039d", size = 9995614 }, + { url = "https://files.pythonhosted.org/packages/bf/e1/aed3bc98785c54bd26bf6dd3d2f54cc00de33e8b1f922a23131372eedec8/tokenizers-0.15.2-cp312-none-win32.whl", hash = "sha256:b8fcfa81bcb9447df582c5bc96a031e6df4da2a774b8080d4f02c0c16b42be0b", size = 2011030 }, + { url = "https://files.pythonhosted.org/packages/c9/ea/5800f4941a713b2feed955b6a256aacc1ca68a6699916d2668622c075d38/tokenizers-0.15.2-cp312-none-win_amd64.whl", hash = "sha256:38d7ab43c6825abfc0b661d95f39c7f8af2449364f01d331f3b51c94dcff7221", size = 2180523 }, + { url = "https://files.pythonhosted.org/packages/6d/04/406f35822d785ccdcd740f95ba58515c739b6d57c05dd278ee64c70d1565/tokenizers-0.15.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:38bfb0204ff3246ca4d5e726e8cc8403bfc931090151e6eede54d0e0cf162ef0", size = 2574496 }, + { url = "https://files.pythonhosted.org/packages/6c/b4/6cc305767c9b1b97b8f5bc61fc472abf42b24ad39388e8f0c57250a7c145/tokenizers-0.15.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9c861d35e8286a53e06e9e28d030b5a05bcbf5ac9d7229e561e53c352a85b1fc", size = 2411609 }, + { url = "https://files.pythonhosted.org/packages/6b/6c/ae2437a3e233298a962053c62b943ffabb38627fd6787ff8da62352333fa/tokenizers-0.15.2-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:936bf3842db5b2048eaa53dade907b1160f318e7c90c74bfab86f1e47720bdd6", size = 3652369 }, + { url = 
"https://files.pythonhosted.org/packages/00/8b/21600349146d9fa4d341c507faf8d11b7292b7f29f8def440b81e65ad1ee/tokenizers-0.15.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:620beacc3373277700d0e27718aa8b25f7b383eb8001fba94ee00aeea1459d89", size = 3529510 }, + { url = "https://files.pythonhosted.org/packages/53/cd/6ffc60fbc5eae02629d736d578a7c5ca5c20b2b84e9866d61a0c6395684a/tokenizers-0.15.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2735ecbbf37e52db4ea970e539fd2d450d213517b77745114f92867f3fc246eb", size = 3396516 }, + { url = "https://files.pythonhosted.org/packages/d5/4c/15b66eb6a47dc9345192aa77988655830c1ebd1306d2b894ecd28fbfbbca/tokenizers-0.15.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:473c83c5e2359bb81b0b6fde870b41b2764fcdd36d997485e07e72cc3a62264a", size = 3918812 }, + { url = "https://files.pythonhosted.org/packages/ed/3b/f9df83311475e456473958cce65a3709f07a1d1dd8ed046d4779ec4336c8/tokenizers-0.15.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:968fa1fb3c27398b28a4eca1cbd1e19355c4d3a6007f7398d48826bbe3a0f728", size = 4025495 }, + { url = "https://files.pythonhosted.org/packages/36/ee/2055fbeb590719393d29cea3016491fd3a6da10598541bff256cc3750349/tokenizers-0.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:865c60ae6eaebdde7da66191ee9b7db52e542ed8ee9d2c653b6d190a9351b980", size = 3575316 }, + { url = "https://files.pythonhosted.org/packages/93/53/ae4e5e49bdc61849b668263a1a4c398b4e33aea1bb9b0a59c9677bb5266b/tokenizers-0.15.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7c0d8b52664ab2d4a8d6686eb5effc68b78608a9008f086a122a7b2996befbab", size = 9682779 }, + { url = "https://files.pythonhosted.org/packages/04/c6/8818b867611734889cd8faca1153ec5dbdd59c98e85e5f6980e7be338839/tokenizers-0.15.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:f33dfbdec3784093a9aebb3680d1f91336c56d86cc70ddf88708251da1fe9064", size = 9995614 }, + { url = "https://files.pythonhosted.org/packages/a5/2c/9c2f7a0601cccc8cf169006873ed7775ad76804e98b7236d1f345faf69f8/tokenizers-0.15.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:6a9b648a58281c4672212fab04e60648fde574877d0139cd4b4f93fe28ca8944", size = 2576142 }, + { url = "https://files.pythonhosted.org/packages/2e/4f/93ccada67079065f892a2c4e7159caf0ce65084fdf60253815ca964403af/tokenizers-0.15.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:7c7d18b733be6bbca8a55084027f7be428c947ddf871c500ee603e375013ffba", size = 2412714 }, + { url = "https://files.pythonhosted.org/packages/fd/8f/1dbeaf8b2a2c00e5172d8ed000fba94edb1d424fd50dcbdcc755fbf3c0aa/tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:13ca3611de8d9ddfbc4dc39ef54ab1d2d4aaa114ac8727dfdc6a6ec4be017378", size = 3646249 }, + { url = "https://files.pythonhosted.org/packages/89/ed/b055d12637754471e4344f4e85c6268ef76801b0113ce1f789c5d84eaae9/tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:237d1bf3361cf2e6463e6c140628e6406766e8b27274f5fcc62c747ae3c6f094", size = 3534330 }, + { url = "https://files.pythonhosted.org/packages/ad/e1/d0b441575a3ac0262c2c73773f79dd50c94e13c9dfda0d953f1c79d47ef5/tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67a0fe1e49e60c664915e9fb6b0cb19bac082ab1f309188230e4b2920230edb3", size = 3579864 }, + { url = 
"https://files.pythonhosted.org/packages/a8/cd/6fe89c549d3aad886295cb9875105a75fa0d82ce80e4721cb43e6eb0830e/tokenizers-0.15.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:4e022fe65e99230b8fd89ebdfea138c24421f91c1a4f4781a8f5016fd5cdfb4d", size = 9684097 }, + { url = "https://files.pythonhosted.org/packages/9f/4d/29e5052a11d1a9f8eb156e48c123731e6219e4f3d72cd6d7787fdf4eff7a/tokenizers-0.15.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:d857be2df69763362ac699f8b251a8cd3fac9d21893de129bc788f8baaef2693", size = 9997782 }, +] + [[package]] name = "toml" version = "0.10.2" @@ -5314,6 +6357,48 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/97/75/10a9ebee3fd790d20926a90a2547f0bf78f371b2f13aa822c759680ca7b9/tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", size = 12757 }, ] +[[package]] +name = "torch" +version = "2.2.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "jinja2" }, + { name = "networkx" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "sympy" }, + { name = "triton", marker = "python_full_version < '3.12' and platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "typing-extensions" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/b3/1fcc3bccfddadfd6845dcbfe26eb4b099f1dfea5aa0e5cfb92b3c98dba5b/torch-2.2.2-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:bc889d311a855dd2dfd164daf8cc903a6b7273a747189cebafdd89106e4ad585", size = 755526581 }, + { url = "https://files.pythonhosted.org/packages/c3/7c/aeb0c5789a3f10cf909640530cd75b314959b9d9914a4996ed2c7bf8779d/torch-2.2.2-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:15dffa4cc3261fa73d02f0ed25f5fa49ecc9e12bf1ae0a4c1e7a88bbfaad9030", size = 86623646 }, + { url = "https://files.pythonhosted.org/packages/3a/81/684d99e536b20e869a7c1222cf1dd233311fb05d3628e9570992bfb65760/torch-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:11e8fe261233aeabd67696d6b993eeb0896faa175c6b41b9a6c9f0334bdad1c5", size = 198579616 }, + { url = "https://files.pythonhosted.org/packages/3b/55/7192974ab13e5e5577f45d14ce70d42f5a9a686b4f57bbe8c9ab45c4a61a/torch-2.2.2-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:b2e2200b245bd9f263a0d41b6a2dab69c4aca635a01b30cca78064b0ef5b109e", size = 150788930 }, 
+ { url = "https://files.pythonhosted.org/packages/33/6b/21496316c9b8242749ee2a9064406271efdf979e91d440e8a3806b5e84bf/torch-2.2.2-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:877b3e6593b5e00b35bbe111b7057464e76a7dd186a287280d941b564b0563c2", size = 59707286 }, + { url = "https://files.pythonhosted.org/packages/c3/33/d7a6123231bd4d04c7005dde8507235772f3bc4622a25f3a88c016415d49/torch-2.2.2-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:ad4c03b786e074f46606f4151c0a1e3740268bcf29fbd2fdf6666d66341c1dcb", size = 755555407 }, + { url = "https://files.pythonhosted.org/packages/02/af/81abea3d73fddfde26afd1ce52a4ddfa389cd2b684c89d6c4d0d5d8d0dfa/torch-2.2.2-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:32827fa1fbe5da8851686256b4cd94cc7b11be962862c2293811c94eea9457bf", size = 86642063 }, + { url = "https://files.pythonhosted.org/packages/5c/01/5ab75f138bf32d7a69df61e4997e24eccad87cc009f5fb7e2a31af8a4036/torch-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:f9ef0a648310435511e76905f9b89612e45ef2c8b023bee294f5e6f7e73a3e7c", size = 198584125 }, + { url = "https://files.pythonhosted.org/packages/3f/14/e105b8ef6d324e789c1589e95cb0ab63f3e07c2216d68b1178b7c21b7d2a/torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:95b9b44f3bcebd8b6cd8d37ec802048c872d9c567ba52c894bba90863a439059", size = 150796474 }, + { url = "https://files.pythonhosted.org/packages/96/23/18b9c16c18a77755e7f15173821c7100f11e6b3b7717bea8d729bdeb92c0/torch-2.2.2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:49aa4126ede714c5aeef7ae92969b4b0bbe67f19665106463c39f22e0a1860d1", size = 59714938 }, + { url = "https://files.pythonhosted.org/packages/4c/0c/d8f77363a7a3350c96e6c9db4ffb101d1c0487cc0b8cdaae1e4bfb2800ad/torch-2.2.2-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:cf12cdb66c9c940227ad647bc9cf5dba7e8640772ae10dfe7569a0c1e2a28aca", size = 755466713 }, + { url = "https://files.pythonhosted.org/packages/05/9b/e5c0df26435f3d55b6699e1c61f07652b8c8a3ac5058a75d0e991f92c2b0/torch-2.2.2-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:89ddac2a8c1fb6569b90890955de0c34e1724f87431cacff4c1979b5f769203c", size = 86515814 }, + { url = "https://files.pythonhosted.org/packages/72/ce/beca89dcdcf4323880d3b959ef457a4c61a95483af250e6892fec9174162/torch-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:451331406b760f4b1ab298ddd536486ab3cfb1312614cfe0532133535be60bea", size = 198528804 }, + { url = "https://files.pythonhosted.org/packages/79/78/29dcab24a344ffd9ee9549ec0ab2c7885c13df61cde4c65836ee275efaeb/torch-2.2.2-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:eb4d6e9d3663e26cd27dc3ad266b34445a16b54908e74725adb241aa56987533", size = 150797270 }, + { url = "https://files.pythonhosted.org/packages/4a/0e/e4e033371a7cba9da0db5ccb507a9174e41b9c29189a932d01f2f61ecfc0/torch-2.2.2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:bf9558da7d2bf7463390b3b2a61a6a3dbb0b45b161ee1dd5ec640bf579d479fc", size = 59678388 }, +] + [[package]] name = "tornado" version = "6.4.1" @@ -5353,6 +6438,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359 }, ] +[[package]] +name = "transformers" +version = "4.35.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "regex" }, + { name 
= "requests" }, + { name = "safetensors" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/49/97/00142bd2fef5cdaa945ffc2aa0021d127390ef6b0fdc2ac7295cf199a488/transformers-4.35.2.tar.gz", hash = "sha256:2d125e197d77b0cdb6c9201df9fa7e2101493272e448b9fba9341c695bee2f52", size = 6832593 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/dd/f17b11a93a9ca27728e12512d167eb1281c151c4c6881d3ab59eb58f4127/transformers-4.35.2-py3-none-any.whl", hash = "sha256:9dfa76f8692379544ead84d98f537be01cd1070de75c74efb13abcbc938fbe2f", size = 7920648 }, +] + [[package]] name = "trio" version = "0.26.2" @@ -5385,6 +6491,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/48/be/a9ae5f50cad5b6f85bd2574c2c923730098530096e170c1ce7452394d7aa/trio_websocket-0.11.1-py3-none-any.whl", hash = "sha256:520d046b0d030cf970b8b2b2e00c4c2245b3807853ecd44214acd33d74581638", size = 17408 }, ] +[[package]] +name = "triton" +version = "2.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock", marker = "python_full_version < '3.12'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/05/ed974ce87fe8c8843855daa2136b3409ee1c126707ab54a8b72815c08b49/triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2294514340cfe4e8f4f9e5c66c702744c4a117d25e618bd08469d0bfed1e2e5", size = 167900779 }, + { url = "https://files.pythonhosted.org/packages/bd/ac/3974caaa459bf2c3a244a84be8d17561f631f7d42af370fc311defeca2fb/triton-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da58a152bddb62cafa9a857dd2bc1f886dbf9f9c90a2b5da82157cd2b34392b0", size = 167928356 }, + { url = "https://files.pythonhosted.org/packages/0e/49/2e1bbae4542b8f624e409540b4197e37ab22a88e8685e99debe721cc2b50/triton-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0af58716e721460a61886668b205963dc4d1e4ac20508cc3f623aef0d70283d5", size = 167933985 }, +] + [[package]] name = "typeguard" version = "4.3.0" @@ -5960,3 +7079,81 @@ sdist = { url = "https://files.pythonhosted.org/packages/a6/b3/19a2540d21dea5f90 wheels = [ { url = "https://files.pythonhosted.org/packages/a6/0c/c2a72d51fe56e08a08acc85d13013558a2d793028ae7385448a6ccdfae64/xlrd-2.0.1-py2.py3-none-any.whl", hash = "sha256:6a33ee89877bd9abc1158129f6e94be74e2679636b8a205b43b85206c3f0bbdd", size = 96531 }, ] + +[[package]] +name = "yarl" +version = "1.18.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna", marker = "python_full_version >= '3.12'" }, + { name = "multidict", marker = "python_full_version >= '3.12'" }, + { name = "propcache", marker = "python_full_version >= '3.12'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5e/4b/53db4ecad4d54535aff3dfda1f00d6363d79455f62b11b8ca97b82746bd2/yarl-1.18.0.tar.gz", hash = "sha256:20d95535e7d833889982bfe7cc321b7f63bf8879788fee982c76ae2b24cfb715", size = 180098 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/80/8b/305e1bde6bbf900bb8909a4884488764ee5950dda4da06cec885c06dae68/yarl-1.18.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:074fee89caab89a97e18ef5f29060ef61ba3cae6cd77673acc54bfdd3214b7b7", size = 141186 }, + { url = "https://files.pythonhosted.org/packages/6a/85/a15e439d8faa6bd09a536d87ca7a32daa50cf8820cf220edbced702348a0/yarl-1.18.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b026cf2c32daf48d90c0c4e406815c3f8f4cfe0c6dfccb094a9add1ff6a0e41a", 
size = 94097 }, + { url = "https://files.pythonhosted.org/packages/12/9d/7d39082baae943f138df1bb96914f8d53fd65eb131b9d0965917b009b35d/yarl-1.18.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ae38bd86eae3ba3d2ce5636cc9e23c80c9db2e9cb557e40b98153ed102b5a736", size = 91915 }, + { url = "https://files.pythonhosted.org/packages/c0/35/7e6fbfeb413f281dda59d4a9fce7a0c43cb1f22cb6ac25151d4c4ce51651/yarl-1.18.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:685cc37f3f307c6a8e879986c6d85328f4c637f002e219f50e2ef66f7e062c1d", size = 315086 }, + { url = "https://files.pythonhosted.org/packages/76/2e/61b854cca176d8952d1448b15d59b9b4df27648e4cc9c1a2a01449238b21/yarl-1.18.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8254dbfce84ee5d1e81051ee7a0f1536c108ba294c0fdb5933476398df0654f3", size = 330221 }, + { url = "https://files.pythonhosted.org/packages/98/66/975c36deeb069888274c2edfa9d6aef44c7574e9b11bb0687130ddd02558/yarl-1.18.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:20de4a8b04de70c49698dc2390b7fd2d18d424d3b876371f9b775e2b462d4b41", size = 326650 }, + { url = "https://files.pythonhosted.org/packages/a4/06/511e5ac4e562cbd605a05c90875e36ec5bac93da0dc55c730b4b3b09face/yarl-1.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b0a2074a37285570d54b55820687de3d2f2b9ecf1b714e482e48c9e7c0402038", size = 319437 }, + { url = "https://files.pythonhosted.org/packages/7c/6a/8f6f8b17b28ed6eaaf20f5a80d391ae1c1bd5437af9ed552b9eb8903b11c/yarl-1.18.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3f576ed278860df2721a5d57da3381040176ef1d07def9688a385c8330db61a1", size = 309966 }, + { url = "https://files.pythonhosted.org/packages/b5/54/4d9dcbdaba18a948f8bea5b65835bfcc5a931426c79d8d2dafe45264ece8/yarl-1.18.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3a3709450a574d61be6ac53d582496014342ea34876af8dc17cc16da32826c9a", size = 319519 }, + { url = "https://files.pythonhosted.org/packages/42/b7/de7fcde2c414d33a2be5ac9c31469ad33874a26a5e3421b2a9505a1a10ee/yarl-1.18.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:bd80ed29761490c622edde5dd70537ca8c992c2952eb62ed46984f8eff66d6e8", size = 321455 }, + { url = "https://files.pythonhosted.org/packages/4e/49/8ed0dc1973876f20b63fe66986f300fd0721f3d644b6a64be12ec436c197/yarl-1.18.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:32141e13a1d5a48525e519c9197d3f4d9744d818d5c7d6547524cc9eccc8971e", size = 324564 }, + { url = "https://files.pythonhosted.org/packages/0c/76/63209f71efde8875670441875ef1a46383a06f578f6babf819b0cf79ebd7/yarl-1.18.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8b8d3e4e014fb4274f1c5bf61511d2199e263909fb0b8bda2a7428b0894e8dc6", size = 336798 }, + { url = "https://files.pythonhosted.org/packages/a8/f3/77e0cdee76359dade383b61eb995a3a2efcef3d64da3222f3cf52d38bd38/yarl-1.18.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:701bb4a8f4de191c8c0cc9a1e6d5142f4df880e9d1210e333b829ca9425570ed", size = 337902 }, + { url = "https://files.pythonhosted.org/packages/96/d9/0f97875e2498196a9b5561de32f3f25208485c7b43d676a65a2ee6c12fd7/yarl-1.18.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a45d94075ac0647621eaaf693c8751813a3eccac455d423f473ffed38c8ac5c9", size = 331620 }, + { url = "https://files.pythonhosted.org/packages/71/a3/e3bd136838d29fec4acc4919bcfd2bd33296f6c281c829fa277e72bc2590/yarl-1.18.0-cp310-cp310-win32.whl", hash = 
"sha256:34176bfb082add67cb2a20abd85854165540891147f88b687a5ed0dc225750a0", size = 84045 }, + { url = "https://files.pythonhosted.org/packages/fd/20/a474648c2b49c9ed5eb0e7137add6373e5d9220eda7e6d4b43d306e67672/yarl-1.18.0-cp310-cp310-win_amd64.whl", hash = "sha256:73553bbeea7d6ec88c08ad8027f4e992798f0abc459361bf06641c71972794dc", size = 90221 }, + { url = "https://files.pythonhosted.org/packages/06/45/6ad7135d1c4ad3a6a49e2c37dc78a1805a7871879c03c3495d64c9605d49/yarl-1.18.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b8e8c516dc4e1a51d86ac975b0350735007e554c962281c432eaa5822aa9765c", size = 141283 }, + { url = "https://files.pythonhosted.org/packages/45/6d/24b70ae33107d6eba303ed0ebfdf1164fe2219656e7594ca58628ebc0f1d/yarl-1.18.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2e6b4466714a73f5251d84b471475850954f1fa6acce4d3f404da1d55d644c34", size = 94082 }, + { url = "https://files.pythonhosted.org/packages/8a/0e/da720989be11b662ca847ace58f468b52310a9b03e52ac62c144755f9d75/yarl-1.18.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c893f8c1a6d48b25961e00922724732d00b39de8bb0b451307482dc87bddcd74", size = 92017 }, + { url = "https://files.pythonhosted.org/packages/f5/76/e5c91681fa54658943cb88673fb19b3355c3a8ae911a33a2621b6320990d/yarl-1.18.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:13aaf2bdbc8c86ddce48626b15f4987f22e80d898818d735b20bd58f17292ee8", size = 340359 }, + { url = "https://files.pythonhosted.org/packages/cf/77/02cf72f09dea20980dea4ebe40dfb2c24916b864aec869a19f715428e0f0/yarl-1.18.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dd21c0128e301851de51bc607b0a6da50e82dc34e9601f4b508d08cc89ee7929", size = 356336 }, + { url = "https://files.pythonhosted.org/packages/17/66/83a88d04e4fc243dd26109f3e3d6412f67819ab1142dadbce49706ef4df4/yarl-1.18.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:205de377bd23365cd85562c9c6c33844050a93661640fda38e0567d2826b50df", size = 353730 }, + { url = "https://files.pythonhosted.org/packages/76/77/0b205a532d22756ab250ab21924d362f910a23d641c82faec1c4ad7f6077/yarl-1.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed69af4fe2a0949b1ea1d012bf065c77b4c7822bad4737f17807af2adb15a73c", size = 343882 }, + { url = "https://files.pythonhosted.org/packages/0b/47/2081ddce3da6096889c3947bdc21907d0fa15939909b10219254fe116841/yarl-1.18.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8e1c18890091aa3cc8a77967943476b729dc2016f4cfe11e45d89b12519d4a93", size = 335873 }, + { url = "https://files.pythonhosted.org/packages/25/3c/437304394494e757ae927c9a81bacc4bcdf7351a1d4e811d95b02cb6dbae/yarl-1.18.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:91b8fb9427e33f83ca2ba9501221ffaac1ecf0407f758c4d2f283c523da185ee", size = 347725 }, + { url = "https://files.pythonhosted.org/packages/c6/fb/fa6c642bc052fbe6370ed5da765579650510157dea354fe9e8177c3bc34a/yarl-1.18.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:536a7a8a53b75b2e98ff96edb2dfb91a26b81c4fed82782035767db5a465be46", size = 346161 }, + { url = "https://files.pythonhosted.org/packages/b0/09/8c0cf68a0fcfe3b060c9e5857bb35735bc72a4cf4075043632c636d007e9/yarl-1.18.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:a64619a9c47c25582190af38e9eb382279ad42e1f06034f14d794670796016c0", size = 349924 }, + { url = 
"https://files.pythonhosted.org/packages/bf/4b/1efe10fd51e2cedf53195d688fa270efbcd64a015c61d029d49c20bf0af7/yarl-1.18.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c73a6bbc97ba1b5a0c3c992ae93d721c395bdbb120492759b94cc1ac71bc6350", size = 361865 }, + { url = "https://files.pythonhosted.org/packages/0b/1b/2b5efd6df06bf938f7e154dee8e2ab22d148f3311a92bf4da642aaaf2fc5/yarl-1.18.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a173401d7821a2a81c7b47d4e7d5c4021375a1441af0c58611c1957445055056", size = 366030 }, + { url = "https://files.pythonhosted.org/packages/f8/db/786a5684f79278e62271038a698f56a51960f9e643be5d3eff82712f0b1c/yarl-1.18.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:7520e799b1f84e095cce919bd6c23c9d49472deeef25fe1ef960b04cca51c3fc", size = 358902 }, + { url = "https://files.pythonhosted.org/packages/91/2f/437d0de062f1a3e3cb17573971b3832232443241133580c2ba3da5001d06/yarl-1.18.0-cp311-cp311-win32.whl", hash = "sha256:c4cb992d8090d5ae5f7afa6754d7211c578be0c45f54d3d94f7781c495d56716", size = 84138 }, + { url = "https://files.pythonhosted.org/packages/9d/85/035719a9266bce85ecde820aa3f8c46f3b18c3d7ba9ff51367b2fa4ae2a2/yarl-1.18.0-cp311-cp311-win_amd64.whl", hash = "sha256:52c136f348605974c9b1c878addd6b7a60e3bf2245833e370862009b86fa4689", size = 90765 }, + { url = "https://files.pythonhosted.org/packages/23/36/c579b80a5c76c0d41c8e08baddb3e6940dfc20569db579a5691392c52afa/yarl-1.18.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1ece25e2251c28bab737bdf0519c88189b3dd9492dc086a1d77336d940c28ced", size = 142376 }, + { url = "https://files.pythonhosted.org/packages/0c/5f/e247dc7c0607a0c505fea6c839721844bee55686dfb183c7d7b8ef8a9cb1/yarl-1.18.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:454902dc1830d935c90b5b53c863ba2a98dcde0fbaa31ca2ed1ad33b2a7171c6", size = 94692 }, + { url = "https://files.pythonhosted.org/packages/eb/e1/3081b578a6f21961711b9a1c49c2947abb3b0d0dd9537378fb06777ce8ee/yarl-1.18.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:01be8688fc211dc237e628fcc209dda412d35de7642453059a0553747018d075", size = 92527 }, + { url = "https://files.pythonhosted.org/packages/2f/fa/d9e1b9fbafa4cc82cd3980b5314741b33c2fe16308d725449a23aed32021/yarl-1.18.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d26f1fa9fa2167bb238f6f4b20218eb4e88dd3ef21bb8f97439fa6b5313e30d", size = 332096 }, + { url = "https://files.pythonhosted.org/packages/93/b6/dd27165114317875838e216214fb86338dc63d2e50855a8f2a12de2a7fe5/yarl-1.18.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b234a4a9248a9f000b7a5dfe84b8cb6210ee5120ae70eb72a4dcbdb4c528f72f", size = 342047 }, + { url = "https://files.pythonhosted.org/packages/fc/9f/bad434b5279ae7a356844e14dc771c3d29eb928140bbc01621af811c8a27/yarl-1.18.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fe94d1de77c4cd8caff1bd5480e22342dbd54c93929f5943495d9c1e8abe9f42", size = 341712 }, + { url = "https://files.pythonhosted.org/packages/9a/9f/63864f43d131ba8c8cdf1bde5dd3f02f0eff8a7c883a5d7fad32f204fda5/yarl-1.18.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b4c90c5363c6b0a54188122b61edb919c2cd1119684999d08cd5e538813a28e", size = 336654 }, + { url = "https://files.pythonhosted.org/packages/20/30/b4542bbd9be73de155213207eec019f6fe6495885f7dd59aa1ff705a041b/yarl-1.18.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:49a98ecadc5a241c9ba06de08127ee4796e1009555efd791bac514207862b43d", size = 325484 }, + { url = "https://files.pythonhosted.org/packages/69/bc/e2a9808ec26989cf0d1b98fe7b3cc45c1c6506b5ea4fe43ece5991f28f34/yarl-1.18.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9106025c7f261f9f5144f9aa7681d43867eed06349a7cfb297a1bc804de2f0d1", size = 344213 }, + { url = "https://files.pythonhosted.org/packages/e2/17/0ee5a68886aca1a8071b0d24a1e1c0fd9970dead2ef2d5e26e027fb7ce88/yarl-1.18.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:f275ede6199d0f1ed4ea5d55a7b7573ccd40d97aee7808559e1298fe6efc8dbd", size = 340517 }, + { url = "https://files.pythonhosted.org/packages/fd/db/1fe4ef38ee852bff5ec8f5367d718b3a7dac7520f344b8e50306f68a2940/yarl-1.18.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:f7edeb1dcc7f50a2c8e08b9dc13a413903b7817e72273f00878cb70e766bdb3b", size = 346234 }, + { url = "https://files.pythonhosted.org/packages/b4/ee/5e5bccdb821eb9949ba66abb4d19e3299eee00282e37b42f65236120e892/yarl-1.18.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c083f6dd6951b86e484ebfc9c3524b49bcaa9c420cb4b2a78ef9f7a512bfcc85", size = 359625 }, + { url = "https://files.pythonhosted.org/packages/3f/43/95a64d9e7ab4aa1c34fc5ea0edb35b581bc6ad33fd960a8ae34c2040b319/yarl-1.18.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:80741ec5b471fbdfb997821b2842c59660a1c930ceb42f8a84ba8ca0f25a66aa", size = 364239 }, + { url = "https://files.pythonhosted.org/packages/40/19/09ce976c624c9d3cc898f0be5035ddef0c0759d85b2313321cfe77b69915/yarl-1.18.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b1a3297b9cad594e1ff0c040d2881d7d3a74124a3c73e00c3c71526a1234a9f7", size = 357599 }, + { url = "https://files.pythonhosted.org/packages/7d/35/6f33fd29791af2ec161aebe8abe63e788c2b74a6c7e8f29c92e5f5e96849/yarl-1.18.0-cp312-cp312-win32.whl", hash = "sha256:cd6ab7d6776c186f544f893b45ee0c883542b35e8a493db74665d2e594d3ca75", size = 83832 }, + { url = "https://files.pythonhosted.org/packages/4e/8e/cdb40ef98597be107de67b11e2f1f23f911e0f1416b938885d17a338e304/yarl-1.18.0-cp312-cp312-win_amd64.whl", hash = "sha256:039c299a0864d1f43c3e31570045635034ea7021db41bf4842693a72aca8df3a", size = 90132 }, + { url = "https://files.pythonhosted.org/packages/2b/77/2196b657c66f97adaef0244e9e015f30eac0df59c31ad540f79ce328feed/yarl-1.18.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6fb64dd45453225f57d82c4764818d7a205ee31ce193e9f0086e493916bd4f72", size = 140512 }, + { url = "https://files.pythonhosted.org/packages/0e/d8/2bb6e26fddba5c01bad284e4571178c651b97e8e06318efcaa16e07eb9fd/yarl-1.18.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3adaaf9c6b1b4fc258584f4443f24d775a2086aee82d1387e48a8b4f3d6aecf6", size = 93875 }, + { url = "https://files.pythonhosted.org/packages/54/e4/99fbb884dd9f814fb0037dc1783766bb9edcd57b32a76f3ec5ac5c5772d7/yarl-1.18.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:da206d1ec78438a563c5429ab808a2b23ad7bc025c8adbf08540dde202be37d5", size = 91705 }, + { url = "https://files.pythonhosted.org/packages/3b/a2/5bd86eca9449e6b15d3b08005cf4e58e3da972240c2bee427b358c311549/yarl-1.18.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:576d258b21c1db4c6449b1c572c75d03f16a482eb380be8003682bdbe7db2f28", size = 333325 }, + { url = "https://files.pythonhosted.org/packages/94/50/a218da5f159cd985685bc72c500bb1a7fd2d60035d2339b8a9d9e1f99194/yarl-1.18.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:c60e547c0a375c4bfcdd60eef82e7e0e8698bf84c239d715f5c1278a73050393", size = 344121 }, + { url = "https://files.pythonhosted.org/packages/a4/e3/830ae465811198b4b5ebecd674b5b3dca4d222af2155eb2144bfe190bbb8/yarl-1.18.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e3818eabaefb90adeb5e0f62f047310079d426387991106d4fbf3519eec7d90a", size = 345163 }, + { url = "https://files.pythonhosted.org/packages/7a/74/05c4326877ca541eee77b1ef74b7ac8081343d3957af8f9291ca6eca6fec/yarl-1.18.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5f72421246c21af6a92fbc8c13b6d4c5427dfd949049b937c3b731f2f9076bd", size = 339130 }, + { url = "https://files.pythonhosted.org/packages/29/42/842f35aa1dae25d132119ee92185e8c75d8b9b7c83346506bd31e9fa217f/yarl-1.18.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7fa7d37f2ada0f42e0723632993ed422f2a679af0e200874d9d861720a54f53e", size = 326418 }, + { url = "https://files.pythonhosted.org/packages/f9/ed/65c0514f2d1e8b92a61f564c914381d078766cab38b5fbde355b3b3af1fb/yarl-1.18.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:42ba84e2ac26a3f252715f8ec17e6fdc0cbf95b9617c5367579fafcd7fba50eb", size = 345204 }, + { url = "https://files.pythonhosted.org/packages/23/31/351f64f0530c372fa01160f38330f44478e7bf3092f5ce2bfcb91605561d/yarl-1.18.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:6a49ad0102c0f0ba839628d0bf45973c86ce7b590cdedf7540d5b1833ddc6f00", size = 341652 }, + { url = "https://files.pythonhosted.org/packages/49/aa/0c6e666c218d567727c1d040d01575685e7f9b18052fd68a59c9f61fe5d9/yarl-1.18.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:96404e8d5e1bbe36bdaa84ef89dc36f0e75939e060ca5cd45451aba01db02902", size = 347257 }, + { url = "https://files.pythonhosted.org/packages/36/0b/33a093b0e13bb8cd0f27301779661ff325270b6644929001f8f33307357d/yarl-1.18.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:a0509475d714df8f6d498935b3f307cd122c4ca76f7d426c7e1bb791bcd87eda", size = 359735 }, + { url = "https://files.pythonhosted.org/packages/a8/92/dcc0b37c48632e71ffc2b5f8b0509347a0bde55ab5862ff755dce9dd56c4/yarl-1.18.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:1ff116f0285b5c8b3b9a2680aeca29a858b3b9e0402fc79fd850b32c2bcb9f8b", size = 365982 }, + { url = "https://files.pythonhosted.org/packages/0e/39/30e2a24a7a6c628dccb13eb6c4a03db5f6cd1eb2c6cda56a61ddef764c11/yarl-1.18.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e2580c1d7e66e6d29d6e11855e3b1c6381971e0edd9a5066e6c14d79bc8967af", size = 360128 }, + { url = "https://files.pythonhosted.org/packages/76/13/12b65dca23b1fb8ae44269a4d24048fd32ac90b445c985b0a46fdfa30cfe/yarl-1.18.0-cp313-cp313-win32.whl", hash = "sha256:14408cc4d34e202caba7b5ac9cc84700e3421a9e2d1b157d744d101b061a4a88", size = 309888 }, + { url = "https://files.pythonhosted.org/packages/f6/60/478d3d41a4bf0b9e7dca74d870d114e775d1ff7156b7d1e0e9972e8f97fd/yarl-1.18.0-cp313-cp313-win_amd64.whl", hash = "sha256:1db1537e9cb846eb0ff206eac667f627794be8b71368c1ab3207ec7b6f8c5afc", size = 315459 }, + { url = "https://files.pythonhosted.org/packages/30/9c/3f7ab894a37b1520291247cbc9ea6756228d098dae5b37eec848d404a204/yarl-1.18.0-py3-none-any.whl", hash = "sha256:dbf53db46f7cf176ee01d8d98c39381440776fcda13779d269a8ba664f69bec0", size = 44840 }, +] diff --git a/vscode_extensions/find-latest-etl-step/src/extension.ts b/vscode_extensions/find-latest-etl-step/src/extension.ts index bb1f0f2bdd6..8b3277d7fd7 100644 --- 
a/vscode_extensions/find-latest-etl-step/src/extension.ts +++ b/vscode_extensions/find-latest-etl-step/src/extension.ts @@ -115,9 +115,7 @@ function findFiles(dir: string, ig: any): { path: string, date: Date | 'latest', const excludeFolders = [ path.join('etl', 'data'), path.join('etl', 'export'), - path.join('snapshots', 'backport'), - 'snapshots_archive', - path.join('etl', 'steps', 'archive') + path.join('snapshots', 'backport') ]; if (excludeFolders.some(excludeFolder => filePath.includes(excludeFolder)) || filePath.includes('__pycache__')) {